Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions apps/server/src/attachmentStore.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import { randomUUID } from "node:crypto";
import { existsSync } from "node:fs";
import { existsSync, readdirSync } from "node:fs";

import type { ChatAttachment } from "@okcode/contracts";

import {
normalizeAttachmentRelativePath,
resolveAttachmentRelativePath,
} from "./attachmentPaths.ts";
import { inferImageExtension, SAFE_IMAGE_FILE_EXTENSIONS } from "./imageMime.ts";

const ATTACHMENT_FILENAME_EXTENSIONS = [...SAFE_IMAGE_FILE_EXTENSIONS, ".bin"];
import { inferAttachmentExtension } from "./imageMime.ts";
const ATTACHMENT_ID_THREAD_SEGMENT_MAX_CHARS = 80;
const ATTACHMENT_ID_THREAD_SEGMENT_PATTERN = "[a-z0-9_]+(?:-[a-z0-9_]+)*";
const ATTACHMENT_ID_UUID_PATTERN = "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}";
Expand Down Expand Up @@ -55,8 +53,9 @@ export function parseThreadSegmentFromAttachmentId(attachmentId: string): string

export function attachmentRelativePath(attachment: ChatAttachment): string {
switch (attachment.type) {
case "image": {
const extension = inferImageExtension({
case "image":
case "file": {
const extension = inferAttachmentExtension({
mimeType: attachment.mimeType,
fileName: attachment.name,
});
Expand All @@ -83,10 +82,20 @@ export function resolveAttachmentPathById(input: {
if (!normalizedId || normalizedId.includes("/") || normalizedId.includes(".")) {
return null;
}
for (const extension of ATTACHMENT_FILENAME_EXTENSIONS) {
let entries: string[];
try {
entries = readdirSync(input.attachmentsDir);
} catch {
return null;
}
for (const entry of entries) {
const entryId = parseAttachmentIdFromRelativePath(entry);
if (entryId !== normalizedId) {
continue;
}
const maybePath = resolveAttachmentRelativePath({
attachmentsDir: input.attachmentsDir,
relativePath: `${normalizedId}${extension}`,
relativePath: entry,
});
if (maybePath && existsSync(maybePath)) {
return maybePath;
Expand Down
213 changes: 213 additions & 0 deletions apps/server/src/attachmentText.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import {
PROVIDER_SEND_TURN_MAX_INPUT_CHARS,
type ChatFileAttachment,
} from "@okcode/contracts";

// Cap on the combined length of all <attached_file> blocks (wrapper lines included)
// that buildFileAttachmentContextText will add to a single message.
const MAX_FILE_CONTEXT_TOTAL_CHARS = 80_000;
// Cap on the content taken from any one attached file (any truncation note counts
// toward this budget).
const MAX_FILE_CONTEXT_CHARS_PER_FILE = 24_000;
// Shared UTF-8 decoder in non-fatal mode: malformed byte sequences decode to U+FFFD
// instead of throwing, so extractTextAttachmentContents can count them afterwards.
const TEXT_DECODER = new TextDecoder("utf-8", { fatal: false });
// Lowercase fragments that mark a MIME type as textual even when it does not start
// with "text/". looksTextLikeMimeType matches them anywhere inside the normalized
// MIME type string (substring match, not an exact subtype comparison).
const TEXTUAL_MIME_SUBSTRINGS = [
"json",
"xml",
"yaml",
"toml",
"javascript",
"typescript",
"markdown",
"csv",
"graphql",
"sql",
"x-sh",
"x-shellscript",
];
// File extensions (lowercase, without the leading dot) that looksTextLikeFileName
// treats as indicating text content. Entries must stay lowercase because
// attachmentExtension lowercases the extension before the lookup.
const TEXTUAL_FILE_EXTENSIONS = new Set([
"c",
"cc",
"cfg",
"conf",
"cpp",
"cs",
"css",
"csv",
"env",
"go",
"graphql",
"h",
"hpp",
"html",
"ini",
"java",
"js",
"json",
"jsx",
"kt",
"log",
"lua",
"md",
"mjs",
"php",
"pl",
"py",
"rb",
"rs",
"scss",
"sh",
"sql",
"svg",
"swift",
"toml",
"ts",
"tsx",
"txt",
"vue",
"xml",
"yaml",
"yml",
"zsh",
]);

/**
 * Extracts the trailing extension of a file name, lowercased and without the dot.
 *
 * Only a final `.` followed by 1–12 ASCII letters or digits counts as an extension;
 * whitespace around the name is ignored.
 *
 * @param fileName - File name to inspect.
 * @returns The lowercased extension, or `""` when the name has no such suffix.
 */
function attachmentExtension(fileName: string): string {
  const suffix = fileName.trim().match(/\.([a-z0-9]{1,12})$/i);
  if (suffix === null) {
    return "";
  }
  const [, extension] = suffix;
  return extension === undefined ? "" : extension.toLowerCase();
}

/**
 * Decides whether a MIME type suggests textual content.
 *
 * The value is trimmed and lowercased first. Any `text/*` type qualifies, as does
 * any type containing one of the fragments in `TEXTUAL_MIME_SUBSTRINGS`.
 *
 * @param mimeType - MIME type as reported for the attachment.
 * @returns `true` when the type looks textual.
 */
function looksTextLikeMimeType(mimeType: string): boolean {
  const candidate = mimeType.trim().toLowerCase();
  if (candidate.startsWith("text/")) {
    return true;
  }
  for (const marker of TEXTUAL_MIME_SUBSTRINGS) {
    if (candidate.includes(marker)) {
      return true;
    }
  }
  return false;
}

/**
 * Decides whether a file name's extension is one of the known textual extensions.
 *
 * @param fileName - Attachment file name.
 * @returns `true` when the extension is listed in `TEXTUAL_FILE_EXTENSIONS`.
 */
function looksTextLikeFileName(fileName: string): boolean {
  const extension = attachmentExtension(fileName);
  return TEXTUAL_FILE_EXTENSIONS.has(extension);
}

/**
 * Heuristic check for binary-looking content in already-decoded text.
 *
 * Scans UTF-16 code units. A NUL anywhere is an immediate hit. Otherwise the text
 * is suspicious when control characters below U+0020 (tab, LF and CR excepted)
 * exceed 2% of the remaining code units, or when the text consists solely of such
 * control characters.
 *
 * @param text - Decoded text to inspect.
 * @returns `true` when the text looks like binary data.
 */
function hasSuspiciousControlBytes(text: string): boolean {
  let controlUnits = 0;
  let ordinaryUnits = 0;
  let position = 0;

  while (position < text.length) {
    const unit = text.charCodeAt(position);
    position += 1;

    if (unit === 0) {
      return true;
    }

    const isWhitespaceControl = unit === 9 || unit === 10 || unit === 13;
    if (unit >= 32 || isWhitespaceControl) {
      ordinaryUnits += 1;
    } else {
      controlUnits += 1;
    }
  }

  if (ordinaryUnits === 0) {
    // Nothing but control characters (or an empty string, which is not suspicious).
    return controlUnits > 0;
  }
  return controlUnits / ordinaryUnits > 0.02;
}

/**
 * Attempts to interpret an attachment's bytes as UTF-8 text.
 *
 * The bytes are decoded leniently, then rejected when the result looks binary:
 * suspicious control characters, or too many U+FFFD replacement characters
 * (more than 2% when the MIME type or file name suggests text, 0.5% otherwise).
 * Whitespace-only content is also rejected unless text was expected.
 *
 * @param input.mimeType - MIME type reported for the attachment.
 * @param input.fileName - Attachment file name.
 * @param input.bytes - Raw attachment bytes.
 * @returns The text with CRLF/CR normalized to LF, `""` for an empty attachment,
 *   or `null` when the content does not look like text.
 */
export function extractTextAttachmentContents(input: {
  readonly mimeType: string;
  readonly fileName: string;
  readonly bytes: Uint8Array;
}): string | null {
  const { bytes, mimeType, fileName } = input;
  if (bytes.byteLength === 0) {
    return "";
  }

  const text = TEXT_DECODER.decode(bytes);
  if (hasSuspiciousControlBytes(text)) {
    return null;
  }

  // Count U+FFFD occurrences, i.e. byte sequences that were not valid UTF-8.
  let replacements = 0;
  for (
    let at = text.indexOf("\uFFFD");
    at !== -1;
    at = text.indexOf("\uFFFD", at + 1)
  ) {
    replacements += 1;
  }
  const ratio = replacements / Math.max(text.length, 1);

  const declaredTextual = looksTextLikeMimeType(mimeType) || looksTextLikeFileName(fileName);
  const toleratedRatio = declaredTextual ? 0.02 : 0.005;
  if (ratio > toleratedRatio) {
    return null;
  }

  const isBlank = text.trim().length === 0;
  if (!declaredTextual && isBlank) {
    return null;
  }

  return text.replace(/\r\n?/g, "\n");
}

/**
 * Appends the text of attached files to a message as `<attached_file>` blocks.
 *
 * After an "Attached file context:" header, each attachment is rendered as a block
 * carrying its name, MIME type, size and content. Three budgets apply: the overall
 * `maxChars` limit for the returned string, `MAX_FILE_CONTEXT_TOTAL_CHARS` across
 * all blocks, and `MAX_FILE_CONTEXT_CHARS_PER_FILE` per file. Content that does not
 * fit is truncated with a note. Once a file cannot be included at all, it and all
 * later files are omitted and a single omission note is appended (the note itself
 * is dropped when it does not fit either).
 *
 * NOTE(review): attachment name and MIME type are interpolated verbatim into the
 * block header; confirm upstream validation rules out newlines in those fields.
 *
 * @param input.baseText - Message text the file context is appended to.
 * @param input.attachments - Attachments paired with their extracted text, in order.
 * @param input.maxChars - Limit for the returned string's length; defaults to
 *   `PROVIDER_SEND_TURN_MAX_INPUT_CHARS`. Floored and clamped to at least 1. A NaN
 *   value falls back to the default.
 * @returns `baseText` followed by as much file context as the budgets allow. When
 *   there are no attachments, or not even the header fits, `baseText` is returned
 *   unchanged.
 */
export function buildFileAttachmentContextText(input: {
  readonly baseText: string;
  readonly attachments: ReadonlyArray<{
    readonly attachment: ChatFileAttachment;
    readonly text: string;
  }>;
  readonly maxChars?: number;
}): string {
  if (input.attachments.length === 0) {
    return input.baseText;
  }

  // A NaN limit would make every budget comparison below evaluate to false, which
  // both disables the limits and turns each content slice into an empty string.
  const requestedMaxChars = input.maxChars ?? PROVIDER_SEND_TURN_MAX_INPUT_CHARS;
  const maxChars = Math.max(
    1,
    Math.floor(
      Number.isNaN(requestedMaxChars) ? PROVIDER_SEND_TURN_MAX_INPUT_CHARS : requestedMaxChars,
    ),
  );
  let result = input.baseText;
  let usedFileContextChars = 0;
  let omittedCount = 0;

  // Appends a chunk only when the whole chunk fits within maxChars.
  const append = (chunk: string): boolean => {
    if (chunk.length === 0) {
      return true;
    }
    if (result.length + chunk.length > maxChars) {
      return false;
    }
    result += chunk;
    return true;
  };

  const header = `${result.length > 0 ? "\n\n" : ""}Attached file context:`;
  if (!append(header)) {
    return result;
  }

  for (const [index, entry] of input.attachments.entries()) {
    const openBlock =
      "\n\n<attached_file>\n" +
      `name: ${entry.attachment.name}\n` +
      `mime_type: ${entry.attachment.mimeType}\n` +
      `size_bytes: ${entry.attachment.sizeBytes}\n` +
      "content:\n";
    const closeBlock = "\n</attached_file>";
    // Both budgets must also pay for the block's wrapper lines.
    const remainingContextBudget =
      MAX_FILE_CONTEXT_TOTAL_CHARS - usedFileContextChars - openBlock.length - closeBlock.length;
    const remainingTotalBudget = maxChars - result.length - openBlock.length - closeBlock.length;
    const maxContentChars = Math.min(
      MAX_FILE_CONTEXT_CHARS_PER_FILE,
      remainingContextBudget,
      remainingTotalBudget,
    );

    if (maxContentChars <= 0) {
      omittedCount = input.attachments.length - index;
      break;
    }

    const truncationNote = "\n[content truncated to fit input limits]";
    const needsTruncation = entry.text.length > maxContentChars;
    // The truncation note is paid for out of the content budget.
    const availableContentChars = needsTruncation
      ? Math.max(0, maxContentChars - truncationNote.length)
      : maxContentChars;
    if (availableContentChars <= 0) {
      omittedCount = input.attachments.length - index;
      break;
    }

    let blockBody = entry.text.slice(0, availableContentChars);
    if (needsTruncation) {
      // slice() counts UTF-16 code units, so the cut can land between the two halves
      // of a surrogate pair. Drop a dangling high surrogate rather than emit it.
      const lastUnit = blockBody.charCodeAt(blockBody.length - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        blockBody = blockBody.slice(0, -1);
      }
    }
    const block = `${openBlock}${blockBody}${needsTruncation ? truncationNote : ""}${closeBlock}`;
    if (!append(block)) {
      omittedCount = input.attachments.length - index;
      break;
    }
    usedFileContextChars += block.length;
  }

  if (omittedCount > 0) {
    // Best effort: the note is skipped when it would exceed maxChars.
    append(`\n\n[${omittedCount} attached file(s) omitted due to input size limits.]`);
  }

  return result;
}
28 changes: 28 additions & 0 deletions apps/server/src/imageMime.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import Mime from "@effect/platform-node/Mime";

// Shape of an acceptable extension WITHOUT its leading dot: 1–12 ASCII letters or
// digits, case-insensitive. inferAttachmentExtension uses it to vet MIME-derived
// extensions before they become part of a stored file name.
const SAFE_ATTACHMENT_FILE_EXTENSION_PATTERN = /^[a-z0-9]{1,12}$/i;

export const IMAGE_EXTENSION_BY_MIME_TYPE: Record<string, string> = {
"image/avif": ".avif",
"image/bmp": ".bmp",
Expand Down Expand Up @@ -29,6 +31,10 @@ export const SAFE_IMAGE_FILE_EXTENSIONS = new Set([
".webp",
]);

/**
 * Reports whether a MIME type belongs to the `image/*` family.
 *
 * The comparison ignores surrounding whitespace and letter case.
 *
 * @param mimeType - MIME type to classify.
 * @returns `true` when the normalized type begins with `image/`.
 */
export function isImageMimeType(mimeType: string): boolean {
  const normalized = mimeType.trim().toLowerCase();
  return normalized.indexOf("image/") === 0;
}

export function parseBase64DataUrl(
dataUrl: string,
): { readonly mimeType: string; readonly base64: string } | null {
Expand Down Expand Up @@ -77,3 +83,25 @@ export function inferImageExtension(input: { mimeType: string; fileName?: string

return ".bin";
}

/**
 * Picks the file extension (with leading dot) for storing an attachment.
 *
 * Resolution order:
 * 1. Image MIME types are delegated to `inferImageExtension`.
 * 2. The extension `Mime.getExtension` reports for the MIME type, provided it
 *    matches `SAFE_ATTACHMENT_FILE_EXTENSION_PATTERN`.
 * 3. The lowercased extension of the trimmed file name (1–12 ASCII alphanumerics).
 * 4. `".bin"` when nothing else applies.
 *
 * @param input.mimeType - MIME type reported for the attachment.
 * @param input.fileName - Optional original file name.
 * @returns The extension including its leading dot.
 */
export function inferAttachmentExtension(input: { mimeType: string; fileName?: string }): string {
  const { mimeType, fileName } = input;
  if (isImageMimeType(mimeType)) {
    return inferImageExtension(input);
  }

  const fromMime = Mime.getExtension(mimeType);
  if (typeof fromMime === "string") {
    // Validate without a leading dot, then return the dotted form.
    const bare = fromMime.replace(/^\./, "");
    if (SAFE_ATTACHMENT_FILE_EXTENSION_PATTERN.test(bare)) {
      return `.${bare}`;
    }
  }

  const fromName = (fileName ?? "").trim().match(/\.([a-z0-9]{1,12})$/i)?.[1];
  return fromName ? `.${fromName.toLowerCase()}` : ".bin";
}
3 changes: 0 additions & 3 deletions apps/server/src/orchestration/Layers/ProjectionPipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,6 @@ function collectThreadAttachmentRelativePaths(
const relativePaths = new Set<string>();
for (const message of messages) {
for (const attachment of message.attachments ?? []) {
if (attachment.type !== "image") {
continue;
}
const attachmentThreadSegment = parseThreadSegmentFromAttachmentId(attachment.id);
if (!attachmentThreadSegment || attachmentThreadSegment !== threadSegment) {
continue;
Expand Down
Loading
Loading