// skillguard/artifacts/api-server/src/routes/scans.ts

import { Router, type IRouter } from "express";
import { db } from "@workspace/db";
import {
  scansTable,
  scanFilesTable,
  findingsTable,
  type Scan,
  type ScanFile,
  type Finding,
  type ScanRelation,
} from "@workspace/db";
import { eq, desc, count } from "drizzle-orm";
import {
  ListScansResponse,
  CreateScanBody,
  GetScanParams,
  GetScanResponse,
  DeleteScanParams,
  CompareScansParams,
  CompareScansResponse,
} from "@workspace/api-zod";
import {
  parseZip,
  parseSingleFile,
  parseText,
  deriveScanName,
} from "../lib/skillParser";
import { analyzeSkill, type EngineResult } from "../lib/scanEngine";
import { STATIC_RULES, AI_RULES, type ParsedFile } from "../lib/ruleCatalog";
import { computeFingerprint } from "../lib/skillFingerprint";
import { lineDiff, lineSimilarity } from "../lib/lineDiff";
import { logger } from "../lib/logger";

// Router instance that all scan endpoints in this file are registered on.
const router: IRouter = Router();

// Shape of a validated scan-creation request body, derived from the return
// type of CreateScanBody.parse so it always follows the schema.
type CreateScanInput = ReturnType<typeof CreateScanBody.parse>;

/**
 * Project a stored scan row onto the summary object used in API responses.
 *
 * The listed columns are copied unchanged and `createdAt` is rendered as an
 * ISO-8601 string. Columns that are not listed (for example `checkpoints`)
 * are intentionally left out of the summary.
 *
 * @param scan - The scan row to serialize.
 * @returns A plain object containing the summary fields of the scan.
 */
export function serializeScan(scan: Scan) {
  const {
    id,
    name,
    source,
    status,
    verdict,
    riskScore,
    fileCount,
    aiUsed,
    aiError,
    findingCounts,
    fingerprint,
    relation,
    similarity,
    comparedScanId,
  } = scan;
  const createdAt = scan.createdAt.toISOString();

  return {
    id,
    name,
    source,
    status,
    verdict,
    riskScore,
    fileCount,
    aiUsed,
    aiError,
    findingCounts,
    fingerprint,
    relation,
    similarity,
    comparedScanId,
    createdAt,
  };
}

/**
 * Serialize a stored scan file for API output.
 *
 * The file content itself is never returned; only a `hasContent` flag that is
 * true whenever the stored `content` is not `null`.
 *
 * @param f - The scan-file row to serialize.
 * @returns Metadata about the file plus the `hasContent` flag.
 */
function serializeFile(f: ScanFile) {
  const { path, kind, language, size, hash, content } = f;
  const hasContent = content !== null;
  return { path, kind, language, size, hash, hasContent };
}

/**
 * Compact summary of the scan another scan was compared against. It is
 * embedded as `comparedScan` in scan-detail responses.
 */
type ComparedScan = {
  id: number;
  name: string;
  verdict: string;
  riskScore: number;
  // ISO-8601 timestamp string (produced with Date#toISOString).
  createdAt: string;
};

/**
 * Load the summary of the scan referenced by `comparedScanId`.
 *
 * @param id - Id of the referenced scan, or `null` when there is none.
 * @returns The summary, or `null` when no id was given or the referenced row
 *   does not exist.
 */
async function resolveComparedScan(
  id: number | null,
): Promise<ComparedScan | null> {
  if (id === null || id === undefined) {
    return null;
  }

  const matches = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, id));
  const found = matches[0];
  if (!found) {
    return null;
  }

  const { name, verdict, riskScore } = found;
  return {
    id: found.id,
    name,
    verdict,
    riskScore,
    createdAt: found.createdAt.toISOString(),
  };
}

/**
 * Count how many stored scans carry the given fingerprint.
 *
 * @param fingerprint - Fingerprint to look up.
 * @returns The number of matching scans. Falls back to 1 when the fingerprint
 *   is empty (no query is run) or when the count query yields no value.
 */
async function countFingerprint(fingerprint: string): Promise<number> {
  if (fingerprint) {
    const rows = await db
      .select({ c: count() })
      .from(scansTable)
      .where(eq(scansTable.fingerprint, fingerprint));
    const matches = rows[0]?.c;
    return Number(matches ?? 1);
  }
  return 1;
}

/**
 * Serialize a stored finding for API output.
 *
 * Copies the listed finding columns unchanged; the owning `scanId` is not
 * part of the result.
 *
 * @param f - The finding row to serialize.
 * @returns A plain object with the public finding fields.
 */
function serializeFinding(f: Finding) {
  const {
    id,
    ruleId,
    axis,
    severity,
    title,
    description,
    remediation,
    file,
    line,
    snippet,
    detectedBy,
  } = f;

  return {
    id,
    ruleId,
    axis,
    severity,
    title,
    description,
    remediation,
    file,
    line,
    snippet,
    detectedBy,
  };
}

/**
 * Assemble the full scan-detail payload from already loaded pieces.
 *
 * @param scan - The scan row.
 * @param files - Files belonging to the scan.
 * @param findings - Findings belonging to the scan; they are emitted sorted by
 *   ascending id without mutating the input array.
 * @param checkCount - Number of stored scans sharing this scan's fingerprint.
 * @param comparedScan - Summary of the compared scan, or `null`.
 * @returns The scan summary extended with checkpoints (empty array when the
 *   row has none), files, findings, `checkCount` and `comparedScan`.
 */
function serializeScanDetail(
  scan: Scan,
  files: ScanFile[],
  findings: Finding[],
  checkCount: number,
  comparedScan: ComparedScan | null,
) {
  const summary = serializeScan(scan);
  const findingsById = findings
    .slice()
    .sort((left, right) => left.id - right.id);

  return {
    ...summary,
    checkpoints: scan.checkpoints ?? [],
    files: files.map((file) => serializeFile(file)),
    findings: findingsById.map((finding) => serializeFinding(finding)),
    checkCount,
    comparedScan,
  };
}

/**
 * Build the scan-detail payload, looking up the two derived values it needs:
 * the number of scans sharing the fingerprint and the compared-scan summary.
 * Both lookups are started together and awaited jointly.
 *
 * @param scan - The scan row.
 * @param files - Files belonging to the scan.
 * @param findings - Findings belonging to the scan.
 * @returns The serialized scan detail.
 */
async function buildScanDetail(
  scan: Scan,
  files: ScanFile[],
  findings: Finding[],
) {
  const pendingCheckCount = countFingerprint(scan.fingerprint);
  const pendingComparedScan = resolveComparedScan(scan.comparedScanId);
  const [checkCount, comparedScan] = await Promise.all([
    pendingCheckCount,
    pendingComparedScan,
  ]);

  return serializeScanDetail(scan, files, findings, checkCount, comparedScan);
}

/**
 * Result of relating a new upload to the scans already stored; the three
 * fields are persisted on the new scan row.
 */
type RelationInfo = {
  // How the upload relates to a prior scan ("identical", "modified" or "new"
  // are the values produced in this file).
  relation: ScanRelation;
  // Similarity to the compared scan on a 0-100 scale; null for "new".
  similarity: number | null;
  // Id of the prior scan the upload was matched against; null for "new".
  comparedScanId: number | null;
};

/**
 * Determine how the freshly parsed skill relates to the scans already stored.
 *
 * - Exact fingerprint match -> `identical` (similarity 100), compared against
 *   the most recently created scan with that fingerprint.
 * - Otherwise the most content-similar prior scan that shares at least one
 *   file path or one file hash -> `modified`, provided the similarity reaches
 *   MODIFIED_SIMILARITY_THRESHOLD or at least one file is byte-identical.
 * - Nothing meaningfully in common -> `new`.
 *
 * Candidate selection is deterministic: higher similarity wins, then a
 * candidate with hash overlap wins over one without, and remaining ties go to
 * the higher scan id. The last rule matters because the prior-file query has
 * no ORDER BY and identical uploads are stored as duplicates, so equally good
 * candidates are common.
 *
 * @param fingerprint - Fingerprint of the new upload; an empty value skips the
 *   identical-match lookup.
 * @param files - Parsed files of the new upload.
 * @returns The relation, its similarity (0-100 or null) and the id of the
 *   prior scan it refers to (or null).
 */
async function computeRelation(
  fingerprint: string,
  files: ParsedFile[],
): Promise<RelationInfo> {
  if (fingerprint) {
    const [identical] = await db
      .select({ id: scansTable.id })
      .from(scansTable)
      .where(eq(scansTable.fingerprint, fingerprint))
      .orderBy(desc(scansTable.createdAt))
      .limit(1);
    if (identical) {
      return { relation: "identical", similarity: 100, comparedScanId: identical.id };
    }
  }

  // Group every prior scan's files by scan id, keyed by path within a scan.
  // NOTE(review): this reads every stored file row including its content on
  // each new scan; consider narrowing the query if the table grows large.
  const priorFiles = await db
    .select({
      scanId: scanFilesTable.scanId,
      path: scanFilesTable.path,
      hash: scanFilesTable.hash,
      content: scanFilesTable.content,
    })
    .from(scanFilesTable);

  const byScan = new Map<number, Map<string, { hash: string; content: string | null }>>();
  for (const row of priorFiles) {
    if (!row.path) continue;
    let map = byScan.get(row.scanId);
    if (!map) {
      map = new Map();
      byScan.set(row.scanId, map);
    }
    map.set(row.path, { hash: row.hash, content: row.content });
  }

  const newPaths = new Set(files.map((f) => f.path));
  const newHashes = new Set(
    files.map((f) => f.hash).filter((h): h is string => Boolean(h)),
  );

  // Score every prior scan by content-aware similarity (not just path overlap).
  // Path overlap alone is misleading: single-file text skills always share the
  // path "SKILL.md", so unrelated pastes would otherwise look related.
  let bestId: number | null = null;
  let bestSimilarity = -1;
  let bestHasHashOverlap = false;
  for (const [scanId, map] of byScan) {
    const sharesPath = Array.from(map.keys()).some((p) => newPaths.has(p));
    let hashOverlap = false;
    for (const { hash } of map.values()) {
      // Empty hashes never count as an overlap.
      if (hash && newHashes.has(hash)) {
        hashOverlap = true;
        break;
      }
    }
    // Nothing in common at all -> cannot be a version of this skill.
    if (!sharesPath && !hashOverlap) continue;

    const similarity = computeContentSimilarity(files, map);
    const sameSimilarity = similarity === bestSimilarity;
    const isBetter =
      similarity > bestSimilarity ||
      (sameSimilarity && hashOverlap && !bestHasHashOverlap) ||
      // Full tie: prefer the higher id so the result does not depend on the
      // row order the database happened to return. Assumes ids increase with
      // insertion order, i.e. higher id = newer scan — TODO confirm in schema.
      (sameSimilarity &&
        hashOverlap === bestHasHashOverlap &&
        bestId !== null &&
        scanId > bestId);
    if (isBetter) {
      bestSimilarity = similarity;
      bestId = scanId;
      bestHasHashOverlap = hashOverlap;
    }
  }

  // Treat as a modified version only with a meaningful content overlap or a
  // shared byte-identical file; otherwise it is a genuinely new skill that just
  // happens to reuse a common file path.
  if (
    bestId !== null &&
    (bestHasHashOverlap || bestSimilarity >= MODIFIED_SIMILARITY_THRESHOLD)
  ) {
    return {
      relation: "modified",
      similarity: bestSimilarity,
      comparedScanId: bestId,
    };
  }

  return { relation: "new", similarity: null, comparedScanId: null };
}

/**
 * Minimum content similarity (0-100) for a non-identical upload to count as a
 * modified version of a prior scan rather than a brand-new skill. Keeps
 * unrelated single-file pastes (which always share the "SKILL.md" path) from
 * being falsely linked together.
 *
 * The constant is declared after computeRelation, which reads it. That is safe
 * because the function body only evaluates it when called, not when defined.
 */
const MODIFIED_SIMILARITY_THRESHOLD = 40;

/**
 * Content-aware similarity (0-100) between a new upload and one prior scan.
 *
 * Every path present on either side contributes one slot to the average:
 * - same non-empty hash on both sides -> full score (1);
 * - changed file whose new version is not binary and whose prior content is
 *   stored -> the value returned by `lineSimilarity`;
 * - file present on only one side, or changed without comparable text -> 0.
 *
 * @param newFiles - Parsed files of the new upload.
 * @param prior - Files of the prior scan, keyed by path.
 * @returns The rounded percentage, or 0 when neither side has any path.
 */
function computeContentSimilarity(
  newFiles: ParsedFile[],
  prior: Map<string, { hash: string; content: string | null }>,
): number {
  const currentByPath = new Map<string, ParsedFile>();
  for (const file of newFiles) {
    currentByPath.set(file.path, file);
  }

  // Union of paths: new upload's paths first, then prior-only paths.
  const allPaths = new Set<string>(currentByPath.keys());
  for (const priorPath of prior.keys()) {
    allPaths.add(priorPath);
  }
  if (allPaths.size === 0) {
    return 0;
  }

  const scoreOf = (path: string): number => {
    const current = currentByPath.get(path);
    const previous = prior.get(path);
    // Added or removed file.
    if (!current || !previous) return 0;
    // Byte-identical file.
    if (current.hash && current.hash === previous.hash) return 1;
    // Changed file without comparable text on both sides.
    if (current.isBinary || previous.content === null) return 0;
    return lineSimilarity(previous.content, current.content);
  };

  let sum = 0;
  allPaths.forEach((path) => {
    sum += scoreOf(path);
  });
  return Math.round((sum / allPaths.size) * 100);
}

/**
 * Outcome of turning a scan request into parsed files, discriminated on `ok`:
 * either the parsed files, or an HTTP status plus a user-facing message.
 */
type ParseResult =
  | { ok: true; files: ParsedFile[] }
  | { ok: false; status: number; message: string };

/**
 * Turn a validated scan request into the list of files to analyze.
 *
 * The source decides the parser: "zip" and "file" decode `contentBase64`,
 * any other source parses `text`. Missing input, an empty file list and any
 * exception thrown while parsing (which is logged) all yield a 400 result
 * instead of throwing.
 *
 * @param input - The validated request body.
 * @returns The parsed files on success, otherwise status 400 with a message.
 */
function parseScanInput(input: CreateScanInput): ParseResult {
  const reject = (message: string): ParseResult => ({
    ok: false,
    status: 400,
    message,
  });

  try {
    let files: ParsedFile[];
    switch (input.source) {
      case "zip": {
        if (!input.contentBase64) return reject("ZIP-Inhalt fehlt.");
        const archive = Buffer.from(input.contentBase64, "base64");
        files = parseZip(archive);
        break;
      }
      case "file": {
        if (!input.contentBase64) return reject("Dateiinhalt fehlt.");
        const bytes = Buffer.from(input.contentBase64, "base64");
        files = [parseSingleFile(input.filename ?? "datei", bytes)];
        break;
      }
      default: {
        if (!input.text || !input.text.trim()) return reject("Text fehlt.");
        files = [parseText(input.text)];
        break;
      }
    }

    return files.length === 0
      ? reject("Keine analysierbaren Dateien gefunden.")
      : { ok: true, files };
  } catch (err) {
    logger.error({ err }, "Skill-Parsing fehlgeschlagen");
    return reject(
      "Das Skill konnte nicht gelesen werden. Bitte prüfen Sie das Format (gültiges ZIP / Textdatei).",
    );
  }
}

/**
 * Store a finished scan together with its files and findings.
 *
 * The relation to earlier scans is computed BEFORE anything is inserted so the
 * comparison excludes this scan itself. The skill is always re-scanned;
 * identical uploads are stored as duplicates.
 *
 * The scan row, its files and its findings are written in one transaction. If
 * a later insert fails, no "completed" scan row with missing children is left
 * behind to distort fingerprint counts and relation detection of later scans.
 *
 * @param input - The validated request body (only `source` is stored).
 * @param name - Display name of the scan.
 * @param files - Parsed files; content of binary files is stored as `null`.
 * @param result - Engine result providing verdict, score, counts, checkpoints
 *   and findings.
 * @returns The inserted scan row plus the inserted file and finding rows.
 * @throws If any insert fails (the transaction is rolled back) or the scan
 *   insert returns no row.
 */
async function persistScan(
  input: CreateScanInput,
  name: string,
  files: ParsedFile[],
  result: EngineResult,
): Promise<{ scan: Scan; files: ScanFile[]; findings: Finding[] }> {
  const fingerprint = computeFingerprint(
    files.map((f) => ({ path: f.path, hash: f.hash })),
  );
  const relationInfo = await computeRelation(fingerprint, files);

  return db.transaction(async (tx) => {
    const [scan] = await tx
      .insert(scansTable)
      .values({
        name,
        source: input.source,
        status: "completed",
        verdict: result.verdict,
        riskScore: result.riskScore,
        fileCount: files.length,
        aiUsed: result.aiUsed,
        aiError: result.aiError,
        findingCounts: result.counts,
        checkpoints: result.checkpoints,
        fingerprint,
        relation: relationInfo.relation,
        similarity: relationInfo.similarity,
        comparedScanId: relationInfo.comparedScanId,
      })
      .returning();
    // Fail loudly instead of dereferencing an undefined row below.
    if (!scan) {
      throw new Error("Scan konnte nicht gespeichert werden.");
    }

    let insertedFiles: ScanFile[] = [];
    if (files.length > 0) {
      insertedFiles = await tx
        .insert(scanFilesTable)
        .values(
          files.map((f) => ({
            scanId: scan.id,
            path: f.path,
            kind: f.kind,
            language: f.language,
            size: f.size,
            hash: f.hash,
            // Binary files keep their metadata but no stored content.
            content: f.isBinary ? null : f.content,
          })),
        )
        .returning();
    }

    let insertedFindings: Finding[] = [];
    if (result.findings.length > 0) {
      insertedFindings = await tx
        .insert(findingsTable)
        .values(
          result.findings.map((f) => ({
            scanId: scan.id,
            ruleId: f.ruleId,
            axis: f.axis,
            severity: f.severity,
            title: f.title,
            description: f.description,
            remediation: f.remediation,
            file: f.file,
            line: f.line,
            snippet: f.snippet,
            detectedBy: f.detectedBy,
          })),
        )
        .returning();
    }

    return { scan, files: insertedFiles, findings: insertedFindings };
  });
}

// GET /scans — list all stored scans as summaries, newest first.
router.get("/scans", async (_req, res) => {
  const newestFirst = desc(scansTable.createdAt);
  const scans = await db.select().from(scansTable).orderBy(newestFirst);
  const summaries = scans.map((scan) => serializeScan(scan));
  res.json(ListScansResponse.parse(summaries));
});

// POST /scans — validate the body, parse the skill, analyze it, store the
// result and answer 201 with the full scan detail. Invalid bodies and
// unparsable input are answered with an error status and a message.
router.post("/scans", async (req, res) => {
  const body = CreateScanBody.safeParse(req.body);
  if (!body.success) {
    return res
      .status(400)
      .json({ message: "Ungültige Eingabe", details: body.error.issues });
  }
  const input = body.data;

  const outcome = parseScanInput(input);
  if (!outcome.ok) {
    return res.status(outcome.status).json({ message: outcome.message });
  }
  const { files } = outcome;

  // An explicit non-blank name wins; otherwise one is derived from the files.
  const name = input.name?.trim() || deriveScanName(files, "Unbenanntes Skill");
  const result = await analyzeSkill(files, input.useAi);
  const saved = await persistScan(input, name, files, result);

  return res
    .status(201)
    .json(
      GetScanResponse.parse(
        await buildScanDetail(saved.scan, saved.files, saved.findings),
      ),
    );
});

// Pause in milliseconds awaited after each streamed checkpoint event.
const STREAM_PACING_MS = 80;
// Promise-based sleep: resolves after `ms` milliseconds via setTimeout.
const delay = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

// POST /scans/stream — same validation, parsing, analysis and persistence as
// POST /scans, but progress is reported as newline-delimited JSON events:
// one `start`, optionally `ai-start`, one `checkpoint` per engine checkpoint,
// and finally either `done` or `error`.
router.post("/scans/stream", async (req, res) => {
  // Validation and parse failures happen before streaming starts, so they are
  // still answered as ordinary JSON error responses.
  const parsed = CreateScanBody.safeParse(req.body);
  if (!parsed.success) {
    res
      .status(400)
      .json({ message: "Ungültige Eingabe", details: parsed.error.issues });
    return;
  }
  const input = parsed.data;

  const parseResult = parseScanInput(input);
  if (!parseResult.ok) {
    res.status(parseResult.status).json({ message: parseResult.message });
    return;
  }
  const files = parseResult.files;
  const name = input.name?.trim() || deriveScanName(files, "Unbenanntes Skill");

  // From here on the 200 status and headers are sent; later failures can only
  // be reported in-band as an `error` event.
  res.status(200);
  res.setHeader("Content-Type", "application/x-ndjson; charset=utf-8");
  res.setHeader("Cache-Control", "no-cache, no-transform");
  res.setHeader("X-Accel-Buffering", "no");
  res.setHeader("Connection", "keep-alive");
  res.flushHeaders();

  // Detect a genuine client disconnect. NOTE: do NOT use req.on("close") here —
  // for a POST it fires as soon as the request body is consumed, not on abort.
  // res "close" before writableFinished means the client went away.
  let aborted = false;
  res.on("close", () => {
    if (!res.writableFinished) aborted = true;
  });

  // Emit one event as a single JSON line; silently does nothing once the
  // client is gone or the response has been ended.
  const write = (obj: unknown) => {
    if (aborted || res.writableEnded) return;
    res.write(JSON.stringify(obj) + "\n");
  };

  write({
    type: "start",
    name,
    fileCount: files.length,
    // AI rules only count towards the total when AI analysis was requested.
    totalChecks: STATIC_RULES.length + (input.useAi ? AI_RULES.length : 0),
  });

  // Running sum of checkpoint score deltas; reported capped at 100.
  let cumulative = 0;
  try {
    const result = await analyzeSkill(files, input.useAi, async (event) => {
      if (event.type === "ai-start") {
        write({ type: "ai-start" });
        return;
      }
      cumulative += event.checkpoint.scoreDelta;
      write({
        type: "checkpoint",
        checkpoint: event.checkpoint,
        runningScore: Math.min(100, cumulative),
      });
      // Pace the stream, but do not slow the analysis down for a client that
      // has already disconnected.
      if (!aborted) await delay(STREAM_PACING_MS);
    });

    // NOTE(review): `aborted` is not checked here, so the scan is persisted
    // even when the client disconnected mid-stream — confirm this is intended.
    const { scan } = await persistScan(input, name, files, result);

    write({
      type: "done",
      scanId: scan.id,
      riskScore: result.riskScore,
      verdict: result.verdict,
      findingCounts: result.counts,
      aiUsed: result.aiUsed,
      aiError: result.aiError,
    });
    if (!aborted && !res.writableEnded) res.end();
  } catch (err) {
    // Analysis or persistence failed after streaming began: log it and tell
    // the client with a final `error` event.
    logger.error({ err }, "Streaming-Scan fehlgeschlagen");
    write({ type: "error", message: "Die Analyse ist fehlgeschlagen." });
    if (!aborted && !res.writableEnded) res.end();
  }
});

// GET /scans/:id — return the full detail of one scan. Answers 400 for an
// invalid id and 404 when no scan with that id exists.
router.get("/scans/:id", async (req, res) => {
  const parsedParams = GetScanParams.safeParse(req.params);
  if (!parsedParams.success) {
    return res.status(400).json({ message: "Ungültige ID" });
  }

  const matches = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, parsedParams.data.id));
  const scan = matches[0];
  if (!scan) {
    return res.status(404).json({ message: "Scan nicht gefunden" });
  }

  const belongsToScan = eq(scanFilesTable.scanId, scan.id);
  const files = await db.select().from(scanFilesTable).where(belongsToScan);
  const findings = await db
    .select()
    .from(findingsTable)
    .where(eq(findingsTable.scanId, scan.id))
    .orderBy(findingsTable.id);

  const detail = await buildScanDetail(scan, files, findings);
  return res.json(GetScanResponse.parse(detail));
});

// GET /scans/:id/compare/:otherId — compare the files of scan `id` (current)
// with those of scan `otherId` (previous). Every path present on either side
// is reported with a status, and a line diff is included for modified files
// whose content is stored on both sides. Answers 400 for invalid ids and 404
// when either scan is missing.
router.get("/scans/:id/compare/:otherId", async (req, res) => {
  const parsedParams = CompareScansParams.safeParse(req.params);
  if (!parsedParams.success) {
    return res.status(400).json({ message: "Ungültige ID" });
  }
  const { id, otherId } = parsedParams.data;

  const currentRows = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, id));
  const previousRows = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, otherId));
  const current = currentRows[0];
  const previous = previousRows[0];
  if (!current || !previous) {
    return res.status(404).json({ message: "Scan nicht gefunden" });
  }

  const filesOf = (scanId: typeof id) =>
    db.select().from(scanFilesTable).where(eq(scanFilesTable.scanId, scanId));
  const [currentFiles, previousFiles] = await Promise.all([
    filesOf(id),
    filesOf(otherId),
  ]);

  const currentByPath = new Map(currentFiles.map((file) => [file.path, file]));
  const previousByPath = new Map(previousFiles.map((file) => [file.path, file]));

  // Union of both path sets, in ascending string order.
  const orderedPaths = [
    ...new Set([...currentByPath.keys(), ...previousByPath.keys()]),
  ];
  orderedPaths.sort((left, right) => {
    if (left < right) return -1;
    if (left > right) return 1;
    return 0;
  });

  const files = orderedPaths.map((path) => {
    const cur = currentByPath.get(path) ?? null;
    const prev = previousByPath.get(path) ?? null;

    let status: "unchanged" | "modified" | "added" | "removed";
    if (cur && prev) {
      status = cur.hash === prev.hash ? "unchanged" : "modified";
    } else if (cur) {
      status = "added";
    } else if (prev) {
      status = "removed";
    } else {
      status = "modified";
    }

    // A line diff needs a modified file with stored content on both sides.
    let lineChanges: ReturnType<typeof lineDiff> | null = null;
    if (status === "modified" && cur && prev) {
      const after = cur.content;
      const before = prev.content;
      if (
        after !== null &&
        after !== undefined &&
        before !== null &&
        before !== undefined
      ) {
        lineChanges = lineDiff(before, after);
      }
    }

    return {
      path,
      status,
      previousHash: prev?.hash ?? null,
      currentHash: cur?.hash ?? null,
      previousSize: prev?.size ?? null,
      currentSize: cur?.size ?? null,
      previousHasContent: prev ? prev.content !== null : null,
      currentHasContent: cur ? cur.content !== null : null,
      lineDiff: lineChanges,
    };
  });

  // Header shown for each side of the comparison.
  const summarize = (scan: Scan) => {
    const { name, verdict, riskScore, fileCount, fingerprint } = scan;
    return {
      id: scan.id,
      name,
      verdict,
      riskScore,
      fileCount,
      fingerprint,
      createdAt: scan.createdAt.toISOString(),
    };
  };

  const payload = {
    current: summarize(current),
    previous: summarize(previous),
    files,
  };
  return res.json(CompareScansResponse.parse(payload));
});

// DELETE /scans/:id — delete the scan row with the given id. Answers 204
// whether or not a row existed, and 400 for an invalid id.
router.delete("/scans/:id", async (req, res) => {
  const parsedParams = DeleteScanParams.safeParse(req.params);
  if (parsedParams.success) {
    const { id } = parsedParams.data;
    await db.delete(scansTable).where(eq(scansTable.id, id));
    return res.status(204).send();
  }
  return res.status(400).json({ message: "Ungültige ID" });
});

// Default export: the router carrying all scan routes registered above.
export default router;