// skillguard/artifacts/api-server/src/routes/scans.ts

import { Router, type IRouter } from "express";
import { db } from "@workspace/db";
import {
  scansTable,
  scanFilesTable,
  findingsTable,
  type Scan,
  type ScanFile,
  type Finding,
  type ScanRelation,
  aiProvidersTable,
  promptsTable,
  type Prompt,
} from "@workspace/db";
import { eq, desc, count } from "drizzle-orm";
import {
  ListScansResponse,
  CreateScanBody,
  GetScanParams,
  GetScanResponse,
  DeleteScanParams,
  CompareScansParams,
  CompareScansResponse,
  GetScanLineageResponse,
} from "@workspace/api-zod";
import {
  parseUpload,
  parseText,
  deriveScanName,
} from "../lib/skillParser";
import { analyzeSkill, type EngineResult } from "../lib/scanEngine";
import { STATIC_RULES, AI_RULES, type ParsedFile } from "../lib/ruleCatalog";
import { generateSkillDescription } from "../lib/aiAnalysis";
import { computeFingerprint } from "../lib/skillFingerprint";
import { lineDiff, lineSimilarity } from "../lib/lineDiff";
import { logger } from "../lib/logger";

const router: IRouter = Router();

type CreateScanInput = ReturnType<typeof CreateScanBody.parse>;

export function serializeScan(scan: Scan) {
  return {
    id: scan.id,
    name: scan.name,
    description: scan.description,
    source: scan.source,
    status: scan.status,
    verdict: scan.verdict,
    riskScore: scan.riskScore,
    fileCount: scan.fileCount,
    aiUsed: scan.aiUsed,
    aiError: scan.aiError,
    findingCounts: scan.findingCounts,
    fingerprint: scan.fingerprint,
    relation: scan.relation,
    similarity: scan.similarity,
    comparedScanId: scan.comparedScanId,
    createdAt: scan.createdAt.toISOString(),
  };
}

function serializeFile(f: ScanFile) {
  return {
    path: f.path,
    kind: f.kind,
    language: f.language,
    size: f.size,
    hash: f.hash,
    hasContent: f.content !== null,
    content: f.content,
  };
}

type ComparedScan = {
  id: number;
  name: string;
  verdict: string;
  riskScore: number;
  createdAt: string;
};

async function resolveComparedScan(
  id: number | null,
): Promise<ComparedScan | null> {
  if (id == null) return null;
  const [s] = await db.select().from(scansTable).where(eq(scansTable.id, id));
  if (!s) return null;
  return {
    id: s.id,
    name: s.name,
    verdict: s.verdict,
    riskScore: s.riskScore,
    createdAt: s.createdAt.toISOString(),
  };
}

export async function countFingerprint(fingerprint: string): Promise<number> {
  if (!fingerprint) return 1;
  const [row] = await db
    .select({ c: count() })
    .from(scansTable)
    .where(eq(scansTable.fingerprint, fingerprint));
  return Number(row?.c ?? 1);
}

function serializeFinding(f: Finding) {
  return {
    id: f.id,
    ruleId: f.ruleId,
    axis: f.axis,
    severity: f.severity,
    title: f.title,
    description: f.description,
    remediation: f.remediation,
    file: f.file,
    line: f.line,
    snippet: f.snippet,
    detectedBy: f.detectedBy,
  };
}

function serializeScanDetail(
  scan: Scan,
  files: ScanFile[],
  findings: Finding[],
  checkCount: number,
  comparedScan: ComparedScan | null,
) {
  return {
    ...serializeScan(scan),
    checkpoints: scan.checkpoints ?? [],
    files: files.map(serializeFile),
    findings: [...findings].sort((a, b) => a.id - b.id).map(serializeFinding),
    checkCount,
    comparedScan,
  };
}

async function buildScanDetail(
  scan: Scan,
  files: ScanFile[],
  findings: Finding[],
) {
  const [checkCount, comparedScan] = await Promise.all([
    countFingerprint(scan.fingerprint),
    resolveComparedScan(scan.comparedScanId),
  ]);
  return serializeScanDetail(scan, files, findings, checkCount, comparedScan);
}

type RelationInfo = {
  relation: ScanRelation;
  similarity: number | null;
  comparedScanId: number | null;
};

/**
 * Determine how the freshly parsed skill relates to the scans already stored.
 * Exact fingerprint match -> identical; otherwise the most content-similar prior
 * skill (when it overlaps enough or shares a byte-identical file) -> modified;
 * nothing meaningfully in common -> new.
 */
export async function computeRelation(
  fingerprint: string,
  files: ParsedFile[],
): Promise<RelationInfo> {
  if (fingerprint) {
    const identical = await db
      .select({ id: scansTable.id })
      .from(scansTable)
      .where(eq(scansTable.fingerprint, fingerprint))
      .orderBy(desc(scansTable.createdAt))
      .limit(1);
    if (identical.length > 0) {
      return { relation: "identical", similarity: 100, comparedScanId: identical[0].id };
    }
  }

  // Group every prior scan's files so we can measure how much of the file tree
  // overlaps. We match on file *paths* (so single-file skills whose content
  // changed are still recognised as a modified version of the same skill) and
  // fall back to hash overlap to disambiguate equally-good path matches.
  const priorFiles = await db
    .select({
      scanId: scanFilesTable.scanId,
      path: scanFilesTable.path,
      hash: scanFilesTable.hash,
      content: scanFilesTable.content,
    })
    .from(scanFilesTable);

  const byScan = new Map<number, Map<string, { hash: string; content: string | null }>>();
  for (const row of priorFiles) {
    if (!row.path) continue;
    let map = byScan.get(row.scanId);
    if (!map) {
      map = new Map();
      byScan.set(row.scanId, map);
    }
    map.set(row.path, { hash: row.hash, content: row.content });
  }

  const newPaths = new Set(files.map((f) => f.path));
  const newHashes = new Set(
    files.map((f) => f.hash).filter((h): h is string => Boolean(h)),
  );

  // Score every prior scan by content-aware similarity (not just path overlap).
  // Path overlap alone is misleading: single-file text skills always share the
  // path "SKILL.md", so unrelated pastes would otherwise look related. We pick
  // the most similar prior scan and only call it a modified version when the
  // content actually overlaps enough OR at least one file is byte-identical.
  let bestId: number | null = null;
  let bestSimilarity = -1;
  let bestHasHashOverlap = false;
  for (const [scanId, map] of byScan) {
    const priorHashes = new Set(
      Array.from(map.values())
        .map((v) => v.hash)
        .filter(Boolean),
    );
    const sharesPath = Array.from(map.keys()).some((p) => newPaths.has(p));
    const hashOverlap = Array.from(priorHashes).some((h) => newHashes.has(h));
    // Nothing in common at all -> cannot be a version of this skill.
    if (!sharesPath && !hashOverlap) continue;

    const similarity = computeContentSimilarity(files, map);
    if (
      similarity > bestSimilarity ||
      (similarity === bestSimilarity && hashOverlap && !bestHasHashOverlap)
    ) {
      bestSimilarity = similarity;
      bestId = scanId;
      bestHasHashOverlap = hashOverlap;
    }
  }

  // Treat as a modified version only with a meaningful content overlap or a
  // shared byte-identical file; otherwise it is a genuinely new skill that just
  // happens to reuse a common file path.
  if (
    bestId !== null &&
    (bestHasHashOverlap || bestSimilarity >= MODIFIED_SIMILARITY_THRESHOLD)
  ) {
    return {
      relation: "modified",
      similarity: bestSimilarity,
      comparedScanId: bestId,
    };
  }

  return { relation: "new", similarity: null, comparedScanId: null };
}

/**
 * Minimum content similarity (0-100) for a non-identical upload to count as a
 * modified version of a prior scan rather than a brand-new skill. Keeps
 * unrelated single-file pastes (which always share the "SKILL.md" path) from
 * being falsely linked together.
 */
const MODIFIED_SIMILARITY_THRESHOLD = 40;

/**
 * Content-aware similarity (0-100) between the new files and a matched prior
 * scan. Identical files (same hash) count fully; changed text files use the
 * line-level similarity; added/removed or changed binary files count as 0.
 */
export function computeContentSimilarity(
  newFiles: ParsedFile[],
  prior: Map<string, { hash: string; content: string | null }>,
): number {
  const newByPath = new Map(newFiles.map((f) => [f.path, f]));
  const paths = new Set<string>([...newByPath.keys(), ...prior.keys()]);
  if (paths.size === 0) return 0;

  let total = 0;
  for (const path of paths) {
    const cur = newByPath.get(path);
    const prev = prior.get(path);
    if (!cur || !prev) continue; // added or removed -> 0
    if (cur.hash && cur.hash === prev.hash) {
      total += 1;
      continue;
    }
    if (!cur.isBinary && prev.content !== null) {
      total += lineSimilarity(prev.content, cur.content);
    }
    // changed binary -> 0
  }
  return Math.round((total / paths.size) * 100);
}

type ParseResult =
  | { ok: true; files: ParsedFile[] }
  | { ok: false; status: number; message: string };

function parseScanInput(input: CreateScanInput): ParseResult {
  try {
    let files: ParsedFile[];
    if (input.source === "zip") {
      if (!input.contentBase64)
        return { ok: false, status: 400, message: "ZIP-Inhalt fehlt." };
      files = parseUpload(
        input.filename ?? "archiv.zip",
        Buffer.from(input.contentBase64, "base64"),
      );
    } else if (input.source === "file") {
      if (!input.contentBase64)
        return { ok: false, status: 400, message: "Dateiinhalt fehlt." };
      files = parseUpload(
        input.filename ?? "datei",
        Buffer.from(input.contentBase64, "base64"),
      );
    } else {
      if (!input.text || !input.text.trim())
        return { ok: false, status: 400, message: "Text fehlt." };
      files = [parseText(input.text)];
    }
    if (files.length === 0)
      return {
        ok: false,
        status: 400,
        message: "Keine analysierbaren Dateien gefunden.",
      };
    return { ok: true, files };
  } catch (err) {
    logger.error({ err }, "Skill-Parsing fehlgeschlagen");
    return {
      ok: false,
      status: 400,
      message:
        "Das Skill konnte nicht gelesen werden. Bitte prüfen Sie das Format (gültiges ZIP / Textdatei).",
    };
  }
}

async function persistScan(
  input: CreateScanInput,
  name: string,
  files: ParsedFile[],
  result: EngineResult,
): Promise<{ scan: Scan; files: ScanFile[]; findings: Finding[] }> {
  const fingerprint = computeFingerprint(
    files.map((f) => ({ path: f.path, hash: f.hash })),
  );
  // Determine relation against the existing database BEFORE inserting the new
  // scan so the comparison excludes this scan itself. The skill is always
  // re-scanned; identical uploads are stored as duplicates.
  const relationInfo = await computeRelation(fingerprint, files);

  const [scan] = await db
    .insert(scansTable)
    .values({
      name,
      description: result.aiDescription,
      source: input.source,
      status: "completed",
      verdict: result.verdict,
      riskScore: result.riskScore,
      fileCount: files.length,
      aiUsed: result.aiUsed,
      aiError: result.aiError,
      findingCounts: result.counts,
      checkpoints: result.checkpoints,
      fingerprint,
      relation: relationInfo.relation,
      similarity: relationInfo.similarity,
      comparedScanId: relationInfo.comparedScanId,
    })
    .returning();

  let insertedFiles: ScanFile[] = [];
  if (files.length > 0) {
    insertedFiles = await db
      .insert(scanFilesTable)
      .values(
        files.map((f) => ({
          scanId: scan.id,
          path: f.path,
          kind: f.kind,
          language: f.language,
          size: f.size,
          hash: f.hash,
          content: f.isBinary ? null : f.content,
        })),
      )
      .returning();
  }

  let insertedFindings: Finding[] = [];
  if (result.findings.length > 0) {
    insertedFindings = await db
      .insert(findingsTable)
      .values(
        result.findings.map((f) => ({
          scanId: scan.id,
          ruleId: f.ruleId,
          axis: f.axis,
          severity: f.severity,
          title: f.title,
          description: f.description,
          remediation: f.remediation,
          file: f.file,
          line: f.line,
          snippet: f.snippet,
          detectedBy: f.detectedBy,
        })),
      )
      .returning();
  }

  return { scan, files: insertedFiles, findings: insertedFindings };
}

router.get("/scans", async (_req, res) => {
  const rows = await db
    .select()
    .from(scansTable)
    .orderBy(desc(scansTable.createdAt));
  res.json(ListScansResponse.parse(rows.map(serializeScan)));
});

router.post("/scans", async (req, res) => {
  const parsed = CreateScanBody.safeParse(req.body);
  if (!parsed.success) {
    return res
      .status(400)
      .json({ message: "Ungültige Eingabe", details: parsed.error.issues });
  }
  const input = parsed.data;

  const parseResult = parseScanInput(input);
  if (!parseResult.ok) {
    return res.status(parseResult.status).json({ message: parseResult.message });
  }
  const files = parseResult.files;

  const name = input.name?.trim() || deriveScanName(files, "Unbenanntes Skill");
  const result = await analyzeSkill(files, input.useAi);
  const { scan, files: insertedFiles, findings } = await persistScan(
    input,
    name,
    files,
    result,
  );

  return res
    .status(201)
    .json(GetScanResponse.parse(await buildScanDetail(scan, insertedFiles, findings)));
});

const STREAM_PACING_MS = 80;
const delay = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

router.post("/scans/stream", async (req, res) => {
  const parsed = CreateScanBody.safeParse(req.body);
  if (!parsed.success) {
    res
      .status(400)
      .json({ message: "Ungültige Eingabe", details: parsed.error.issues });
    return;
  }
  const input = parsed.data;

  const parseResult = parseScanInput(input);
  if (!parseResult.ok) {
    res.status(parseResult.status).json({ message: parseResult.message });
    return;
  }
  const files = parseResult.files;
  const name = input.name?.trim() || deriveScanName(files, "Unbenanntes Skill");

  res.status(200);
  res.setHeader("Content-Type", "application/x-ndjson; charset=utf-8");
  res.setHeader("Cache-Control", "no-cache, no-transform");
  res.setHeader("X-Accel-Buffering", "no");
  res.setHeader("Connection", "keep-alive");
  res.flushHeaders();

  // Detect a genuine client disconnect. NOTE: do NOT use req.on("close") here —
  // for a POST it fires as soon as the request body is consumed, not on abort.
  // res "close" before writableFinished means the client went away.
  let aborted = false;
  res.on("close", () => {
    if (!res.writableFinished) aborted = true;
  });

  const write = (obj: unknown) => {
    if (aborted || res.writableEnded) return;
    res.write(JSON.stringify(obj) + "\n");
  };

  write({
    type: "start",
    name,
    fileCount: files.length,
    totalChecks: STATIC_RULES.length + (input.useAi ? AI_RULES.length : 0),
  });

  let cumulative = 0;
  try {
    const result = await analyzeSkill(files, input.useAi, async (event) => {
      if (event.type === "ai-start") {
        write({ type: "ai-start" });
        return;
      }
      cumulative += event.checkpoint.scoreDelta;
      write({
        type: "checkpoint",
        checkpoint: event.checkpoint,
        runningScore: Math.min(100, cumulative),
      });
      if (!aborted) await delay(STREAM_PACING_MS);
    });

    const { scan } = await persistScan(input, name, files, result);

    write({
      type: "done",
      scanId: scan.id,
      riskScore: result.riskScore,
      verdict: result.verdict,
      findingCounts: result.counts,
      aiUsed: result.aiUsed,
      aiError: result.aiError,
    });
    if (!aborted && !res.writableEnded) res.end();
  } catch (err) {
    logger.error({ err }, "Streaming-Scan fehlgeschlagen");
    write({ type: "error", message: "Die Analyse ist fehlgeschlagen." });
    if (!aborted && !res.writableEnded) res.end();
  }
});

router.get("/scans/:id", async (req, res) => {
  const params = GetScanParams.safeParse(req.params);
  if (!params.success)
    return res.status(400).json({ message: "Ungültige ID" });

  const [scan] = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, params.data.id));
  if (!scan) return res.status(404).json({ message: "Scan nicht gefunden" });

  const files = await db
    .select()
    .from(scanFilesTable)
    .where(eq(scanFilesTable.scanId, scan.id));
  const findings = await db
    .select()
    .from(findingsTable)
    .where(eq(findingsTable.scanId, scan.id))
    .orderBy(findingsTable.id);

  return res.json(GetScanResponse.parse(await buildScanDetail(scan, files, findings)));
});

router.get("/scans/:id/compare/:otherId", async (req, res) => {
  const params = CompareScansParams.safeParse(req.params);
  if (!params.success)
    return res.status(400).json({ message: "Ungültige ID" });

  const { id, otherId } = params.data;

  const [current] = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, id));
  const [previous] = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, otherId));

  if (!current || !previous)
    return res.status(404).json({ message: "Scan nicht gefunden" });

  const [currentFiles, previousFiles] = await Promise.all([
    db.select().from(scanFilesTable).where(eq(scanFilesTable.scanId, id)),
    db.select().from(scanFilesTable).where(eq(scanFilesTable.scanId, otherId)),
  ]);

  const currentByPath = new Map(currentFiles.map((f) => [f.path, f]));
  const previousByPath = new Map(previousFiles.map((f) => [f.path, f]));
  const paths = Array.from(
    new Set([...currentByPath.keys(), ...previousByPath.keys()]),
  ).sort((a, b) => (a < b ? -1 : a > b ? 1 : 0));

  const fileDiffs = paths.map((path) => {
    const cur = currentByPath.get(path) ?? null;
    const prev = previousByPath.get(path) ?? null;

    let status: "unchanged" | "modified" | "added" | "removed";
    if (cur && !prev) status = "added";
    else if (!cur && prev) status = "removed";
    else if (cur && prev && cur.hash === prev.hash) status = "unchanged";
    else status = "modified";

    let diff:
      | {
          type: "context" | "add" | "remove";
          text: string;
          previousLine: number | null;
          currentLine: number | null;
        }[]
      | null = null;
    if (
      status === "modified" &&
      cur?.content !== null &&
      cur?.content !== undefined &&
      prev?.content !== null &&
      prev?.content !== undefined
    ) {
      diff = lineDiff(prev.content, cur.content);
    }

    return {
      path,
      status,
      previousHash: prev?.hash ?? null,
      currentHash: cur?.hash ?? null,
      previousSize: prev?.size ?? null,
      currentSize: cur?.size ?? null,
      previousHasContent: prev ? prev.content !== null : null,
      currentHasContent: cur ? cur.content !== null : null,
      lineDiff: diff,
    };
  });

  const side = (s: Scan) => ({
    id: s.id,
    name: s.name,
    verdict: s.verdict,
    riskScore: s.riskScore,
    fileCount: s.fileCount,
    fingerprint: s.fingerprint,
    createdAt: s.createdAt.toISOString(),
  });

  return res.json(
    CompareScansResponse.parse({
      current: side(current),
      previous: side(previous),
      files: fileDiffs,
    }),
  );
});

router.get("/scans/:id/lineage", async (req, res) => {
  const params = GetScanParams.safeParse(req.params);
  if (!params.success)
    return res.status(400).json({ message: "Ungültige ID" });

  const [scan] = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, params.data.id));
  if (!scan) return res.status(404).json({ message: "Scan nicht gefunden" });

  // Load only the columns needed to reconstruct the lineage graph for every
  // stored scan, then walk the connected component containing this scan.
  const all = await db
    .select({
      id: scansTable.id,
      name: scansTable.name,
      verdict: scansTable.verdict,
      riskScore: scansTable.riskScore,
      relation: scansTable.relation,
      similarity: scansTable.similarity,
      comparedScanId: scansTable.comparedScanId,
      fingerprint: scansTable.fingerprint,
      createdAt: scansTable.createdAt,
    })
    .from(scansTable);

  const byId = new Map(all.map((s) => [s.id, s]));

  // Build an undirected graph: scans are linked when one was compared against
  // the other (comparedScanId chain) or when they share an identical
  // fingerprint. The fingerprint family is the connected component.
  const adjacency = new Map<number, Set<number>>();
  const addEdge = (a: number, b: number) => {
    if (!byId.has(a) || !byId.has(b) || a === b) return;
    (adjacency.get(a) ?? adjacency.set(a, new Set()).get(a)!).add(b);
    (adjacency.get(b) ?? adjacency.set(b, new Set()).get(b)!).add(a);
  };

  const byFingerprint = new Map<string, number[]>();
  for (const s of all) {
    if (s.comparedScanId != null) addEdge(s.id, s.comparedScanId);
    if (s.fingerprint) {
      const list = byFingerprint.get(s.fingerprint) ?? [];
      list.push(s.id);
      byFingerprint.set(s.fingerprint, list);
    }
  }
  for (const ids of byFingerprint.values()) {
    for (let i = 1; i < ids.length; i++) addEdge(ids[0], ids[i]);
  }

  const family = new Set<number>([scan.id]);
  const queue: number[] = [scan.id];
  while (queue.length > 0) {
    const cur = queue.shift()!;
    for (const next of adjacency.get(cur) ?? []) {
      if (!family.has(next)) {
        family.add(next);
        queue.push(next);
      }
    }
  }

  const entries = Array.from(family)
    .map((fid) => byId.get(fid)!)
    .sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime())
    .map((s) => ({
      id: s.id,
      name: s.name,
      verdict: s.verdict,
      riskScore: s.riskScore,
      relation: s.relation,
      similarity: s.similarity,
      comparedScanId: s.comparedScanId,
      fingerprint: s.fingerprint,
      createdAt: s.createdAt.toISOString(),
    }));

  return res.json(GetScanLineageResponse.parse(entries));
});

router.delete("/scans/:id", async (req, res) => {
  const params = DeleteScanParams.safeParse(req.params);
  if (!params.success)
    return res.status(400).json({ message: "Ungültige ID" });
  await db.delete(scansTable).where(eq(scansTable.id, params.data.id));
  return res.status(204).send();
});

// Generate the AI description for an existing scan that has none yet (older
// scans were created before description generation existed). Reuses the same
// generateSkillDescription() helper and the configured provider. A failure must
// never alter the stored scan.
router.post("/scans/:id/description", async (req, res) => {
  const params = GetScanParams.safeParse(req.params);
  if (!params.success)
    return res.status(400).json({ error: "Ungültige ID" });

  const [scan] = await db
    .select()
    .from(scansTable)
    .where(eq(scansTable.id, params.data.id));
  if (!scan) return res.status(404).json({ error: "Scan nicht gefunden" });

  const storedFiles = await db
    .select()
    .from(scanFilesTable)
    .where(eq(scanFilesTable.scanId, scan.id));

  const [provider] = await db
    .select()
    .from(aiProvidersTable)
    .where(eq(aiProvidersTable.enabled, true))
    .limit(1);

  if (!provider) {
    return res.status(422).json({
      error:
        "Kein aktiver KI-Provider konfiguriert. Bitte im Admin-Bereich einrichten.",
    });
  }
  if (!provider.apiToken) {
    return res.status(422).json({
      error: `Für den Provider "${provider.name}" ist kein API-Token hinterlegt.`,
    });
  }

  const prompts: Prompt[] = await db.select().from(promptsTable);

  // Reconstruct ParsedFile inputs from the stored scan files. Binary files have
  // no stored content; generateSkillDescription skips empty content anyway.
  const files: ParsedFile[] = storedFiles.map((f) => ({
    path: f.path,
    kind: f.kind as ParsedFile["kind"],
    language: f.language,
    content: f.content ?? "",
    size: f.size,
    hash: f.hash,
    isBinary: f.content === null,
  }));

  const description = await generateSkillDescription(provider, prompts, files);
  if (!description) {
    return res.status(422).json({
      error:
        "Die Beschreibung konnte nicht erzeugt werden. Bitte Provider-Konfiguration und KI-Prompts prüfen.",
    });
  }

  const [updated] = await db
    .update(scansTable)
    .set({ description })
    .where(eq(scansTable.id, scan.id))
    .returning();

  const findings = await db
    .select()
    .from(findingsTable)
    .where(eq(findingsTable.scanId, scan.id))
    .orderBy(findingsTable.id);

  return res.json(
    GetScanResponse.parse(await buildScanDetail(updated, storedFiles, findings)),
  );
});

export default router;