// skillguard/artifacts/api-server/src/lib/skillParser.ts

import { Unzip, UnzipInflate, strFromU8 } from "fflate";
import type { FileKind, ParsedFile } from "./ruleCatalog";
import { hashBytes } from "./skillFingerprint";

const LANG_BY_EXT: Record<string, string> = {
  sh: "shell",
  bash: "shell",
  zsh: "shell",
  py: "python",
  js: "javascript",
  mjs: "javascript",
  cjs: "javascript",
  ts: "typescript",
  rb: "ruby",
  pl: "perl",
  php: "php",
  ps1: "powershell",
  go: "go",
  rs: "rust",
  md: "markdown",
  txt: "text",
  json: "json",
  yaml: "yaml",
  yml: "yaml",
  toml: "toml",
  env: "dotenv",
};

const SCRIPT_EXTS = new Set([
  "sh",
  "bash",
  "zsh",
  "py",
  "js",
  "mjs",
  "cjs",
  "ts",
  "rb",
  "pl",
  "php",
  "ps1",
  "go",
  "rs",
]);

const SKIP_DIRS = ["__macosx/", ".git/", "node_modules/"];

const MAX_ZIP_FILES = 2000;
const MAX_ZIP_TOTAL_BYTES = 60 * 1024 * 1024;
const MAX_ZIP_FILE_BYTES = 5 * 1024 * 1024;

/**
 * Detects the local-file-header signature ("PK\x03\x04") that every real ZIP
 * archive begins with. `.skill` files exported by the Skill tooling are ZIP
 * containers, so we sniff the bytes rather than trusting the file extension.
 */
function looksLikeZip(buffer: Buffer): boolean {
  return (
    buffer.length >= 4 &&
    buffer[0] === 0x50 &&
    buffer[1] === 0x4b &&
    buffer[2] === 0x03 &&
    buffer[3] === 0x04
  );
}

function extOf(path: string): string {
  const base = path.split("/").pop() ?? path;
  const dot = base.lastIndexOf(".");
  return dot >= 0 ? base.slice(dot + 1).toLowerCase() : "";
}

function classify(path: string): FileKind {
  const base = (path.split("/").pop() ?? path).toLowerCase();
  const ext = extOf(path);
  if (base === "skill.md") return "instruction";
  if (SCRIPT_EXTS.has(ext)) return "script";
  if (ext === "md" || ext === "txt") return "instruction";
  return "resource";
}

function isProbablyBinary(bytes: Uint8Array): boolean {
  const len = Math.min(bytes.length, 4000);
  let nontext = 0;
  for (let i = 0; i < len; i++) {
    const b = bytes[i];
    if (b === 0) return true;
    if (b < 9 || (b > 13 && b < 32)) nontext++;
  }
  return len > 0 && nontext / len > 0.3;
}

function concatChunks(chunks: Uint8Array[], total: number): Uint8Array {
  const out = new Uint8Array(total);
  let offset = 0;
  for (const c of chunks) {
    out.set(c, offset);
    offset += c.length;
  }
  return out;
}

/**
 * Streaming ZIP extraction. Limits (file count, total uncompressed bytes,
 * per-file bytes) are enforced WHILE decompressing — input is pushed in small
 * chunks and decompression is aborted as soon as a cap is exceeded, so a
 * crafted "zip bomb" cannot be fully inflated into memory before checks apply.
 */
export function parseZip(buffer: Buffer): ParsedFile[] {
  const data = new Uint8Array(buffer);
  const result: ParsedFile[] = [];
  let totalBytes = 0;
  let fileCount = 0;
  let abortReason: string | null = null;

  const unzip = new Unzip();
  unzip.register(UnzipInflate);

  unzip.onfile = (file) => {
    if (abortReason) return;
    const path = file.name.replace(/\\/g, "/");
    if (path.endsWith("/")) return;
    const lower = path.toLowerCase();
    if (SKIP_DIRS.some((d) => lower.includes(d))) return;

    // Early skip using the declared uncompressed size (when present). Not
    // calling start() causes fflate to skip the file's data without inflating.
    if (
      typeof file.originalSize === "number" &&
      file.originalSize > MAX_ZIP_FILE_BYTES
    ) {
      return;
    }

    fileCount += 1;
    if (fileCount > MAX_ZIP_FILES) {
      abortReason = "ZIP-Archiv enthält zu viele Dateien.";
      return;
    }

    const chunks: Uint8Array[] = [];
    let fileBytes = 0;
    let skipFile = false;

    file.ondata = (err, chunk, final) => {
      if (abortReason) return;
      if (err) {
        abortReason = "ZIP-Archiv konnte nicht entpackt werden.";
        return;
      }
      if (chunk && chunk.length > 0) {
        fileBytes += chunk.length;
        totalBytes += chunk.length;
        if (totalBytes > MAX_ZIP_TOTAL_BYTES) {
          abortReason = "ZIP-Archiv ist zu groß (entpackt).";
          return;
        }
        if (fileBytes > MAX_ZIP_FILE_BYTES) {
          // Per-file cap hit (e.g. spoofed header size): drop buffered data,
          // keep counting toward the total cap as a backstop.
          skipFile = true;
          chunks.length = 0;
          return;
        }
        if (!skipFile) chunks.push(chunk);
      }
      if (final && !abortReason && !skipFile) {
        const bytes = concatChunks(chunks, fileBytes);
        chunks.length = 0;
        if (bytes.length === 0) return;
        const hash = hashBytes(bytes);
        if (isProbablyBinary(bytes)) {
          result.push({
            path,
            kind: "resource",
            language: null,
            content: "",
            size: bytes.length,
            hash,
            isBinary: true,
          });
        } else {
          result.push({
            path,
            kind: classify(path),
            language: LANG_BY_EXT[extOf(path)] ?? null,
            content: strFromU8(bytes),
            size: bytes.length,
            hash,
            isBinary: false,
          });
        }
      }
    };

    file.start();
  };

  const CHUNK = 64 * 1024;
  try {
    for (let i = 0; i < data.length; i += CHUNK) {
      if (abortReason) break;
      const end = Math.min(i + CHUNK, data.length);
      unzip.push(data.subarray(i, end), end >= data.length);
    }
  } catch {
    throw new Error("ZIP-Archiv konnte nicht entpackt werden.");
  }

  if (abortReason) throw new Error(abortReason);
  return result;
}

/**
 * Entry point for uploaded files (single-file *and* ZIP-area uploads). A
 * `.skill` file is really a ZIP container, so we treat any upload that either
 * carries the ZIP signature or uses a `.zip`/`.skill` extension as an archive
 * and extract it via the streaming ZIP path. If the buffer is not a real
 * archive (e.g. someone named a plain text file `.skill`) we fall back cleanly
 * to single-file handling instead of failing. Real archives still surface their
 * limit/corruption errors so the existing protections stay in force.
 */
export function parseUpload(filename: string, buffer: Buffer): ParsedFile[] {
  const isZipSignature = looksLikeZip(buffer);
  const hasArchiveExt = /\.(zip|skill)$/i.test(filename);
  if (isZipSignature || hasArchiveExt) {
    try {
      const files = parseZip(buffer);
      if (files.length > 0) return files;
      // A valid-but-empty archive falls through to single-file handling.
    } catch (err) {
      // A buffer with a real ZIP signature is genuinely an archive, so limit
      // and corruption errors must surface. An extension-only guess that is
      // not actually a ZIP falls back to single-file handling.
      if (isZipSignature) throw err;
    }
  }
  return [parseSingleFile(filename, buffer)];
}

export function parseSingleFile(filename: string, buffer: Buffer): ParsedFile {
  const path = filename.replace(/\\/g, "/").split("/").pop() ?? filename;
  const hash = hashBytes(buffer);
  if (isProbablyBinary(new Uint8Array(buffer))) {
    return {
      path,
      kind: "resource",
      language: null,
      content: "",
      size: buffer.length,
      hash,
      isBinary: true,
    };
  }
  return {
    path,
    kind: classify(path),
    language: LANG_BY_EXT[extOf(path)] ?? null,
    content: buffer.toString("utf-8"),
    size: buffer.length,
    hash,
    isBinary: false,
  };
}

export function parseText(text: string): ParsedFile {
  return {
    path: "SKILL.md",
    kind: "instruction",
    language: "markdown",
    content: text,
    size: Buffer.byteLength(text, "utf-8"),
    hash: hashBytes(Buffer.from(text, "utf-8")),
    isBinary: false,
  };
}

export function deriveScanName(files: ParsedFile[], fallback: string): string {
  const skillMd = files.find(
    (f) => (f.path.split("/").pop() ?? "").toLowerCase() === "skill.md",
  );
  if (skillMd) {
    const m = skillMd.content.match(/^#\s+(.+)$/m);
    if (m) return m[1].trim().slice(0, 120);
    const nameMatch = skillMd.content.match(/^name:\s*(.+)$/im);
    if (nameMatch) return nameMatch[1].trim().replace(/^["']|["']$/g, "").slice(0, 120);
  }
  const top = files[0]?.path.split("/")[0];
  return (top || fallback).slice(0, 120);
}