icebear0828 Claude Opus 4.6 committed on
Commit
142c9c4
·
1 Parent(s): 2df0167

feat: add image input support for all API routes

Browse files

Previously all image content was silently discarded by the translation
layer. Now image_url (OpenAI), image/base64 (Anthropic), and inlineData
(Gemini) content parts are translated to Codex input_image format and
passed through to the backend.

Closes #25

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

CHANGELOG.md CHANGED
@@ -8,6 +8,7 @@
8
 
9
  ### Added
10
 
 
11
  - 每窗口使用量计数器:Dashboard 主显示当前窗口内的请求数和 Token 用量,累计总量降为次要灰色小字;窗口过期时自动归零(时间驱动,零 API 开销),后端同步作为双保险校正
12
  - 窗口时长显示:从后端同步 `limit_window_seconds`,AccountCard header 显示窗口时长 badge(如 `3h`),重置时间行追加窗口时长文字
13
  - Dashboard 账号列表新增手动刷新按钮:点击重新拉取额度数据,刷新中按钮旋转并禁用;独立 `refreshing` 状态确保刷新时列表不清空;标题行右侧显示"更新于 HH:MM:SS"时间戳(桌面端可见)
 
8
 
9
  ### Added
10
 
11
+ - 图片输入支持:OpenAI、Anthropic、Gemini 三种格式的图片内容现在可以正确透传到 Codex 后端(`input_image` + data URI),此前图片被静默丢弃
12
  - 每窗口使用量计数器:Dashboard 主显示当前窗口内的请求数和 Token 用量,累计总量降为次要灰色小字;窗口过期时自动归零(时间驱动,零 API 开销),后端同步作为双保险校正
13
  - 窗口时长显示:从后端同步 `limit_window_seconds`,AccountCard header 显示窗口时长 badge(如 `3h`),重置时间行追加窗口时长文字
14
  - Dashboard 账号列表新增手动刷新按钮:点击重新拉取额度数据,刷新中按钮旋转并禁用;独立 `refreshing` 状态确保刷新时列表不清空;标题行右侧显示"更新于 HH:MM:SS"时间戳(桌面端可见)
src/proxy/codex-api.ts CHANGED
@@ -36,8 +36,13 @@ export interface CodexResponsesRequest {
36
  previous_response_id?: string | null;
37
  }
38
 
 
 
 
 
 
39
  export type CodexInputItem =
40
- | { role: "user"; content: string }
41
  | { role: "assistant"; content: string }
42
  | { role: "system"; content: string }
43
  | { type: "function_call"; id?: string; call_id: string; name: string; arguments: string }
 
36
  previous_response_id?: string | null;
37
  }
38
 
39
/**
 * Structured content part for multimodal Codex input.
 *
 * - `input_text`: a plain-text segment of the user message.
 * - `input_image`: an image given as a URL string — the translation layers
 *   build `data:<media_type>;base64,<data>` URIs for inline image bytes.
 */
export type CodexContentPart =
  | { type: "input_text"; text: string }
  | { type: "input_image"; image_url: string };
43
+
44
  export type CodexInputItem =
45
+ | { role: "user"; content: string | CodexContentPart[] }
46
  | { role: "assistant"; content: string }
47
  | { role: "system"; content: string }
48
  | { type: "function_call"; id?: string; call_id: string; name: string; arguments: string }
src/translation/anthropic-to-codex.ts CHANGED
@@ -6,6 +6,7 @@ import type { AnthropicMessagesRequest } from "../types/anthropic.js";
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
 
9
  } from "../proxy/codex-api.js";
10
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
11
  import { getConfig } from "../config.js";
@@ -39,9 +40,39 @@ function extractTextContent(
39
  .join("\n");
40
  }
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  /**
43
  * Convert Anthropic message content blocks into native Codex input items.
44
- * Handles text, tool_use, and tool_result blocks.
45
  */
46
  function contentToInputItems(
47
  role: "user" | "assistant",
@@ -53,10 +84,19 @@ function contentToInputItems(
53
 
54
  const items: CodexInputItem[] = [];
55
 
56
- // Collect text blocks first
57
- const text = extractTextContent(content);
58
- if (text || !content.some((b) => b.type === "tool_use" || b.type === "tool_result")) {
59
- items.push({ role, content: text });
 
 
 
 
 
 
 
 
 
60
  }
61
 
62
  for (const block of content) {
 
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
9
+ CodexContentPart,
10
  } from "../proxy/codex-api.js";
11
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
12
  import { getConfig } from "../config.js";
 
40
  .join("\n");
41
  }
42
 
43
+ /**
44
+ * Build multimodal content (text + images) from Anthropic blocks.
45
+ * Returns plain string if text-only, or CodexContentPart[] if images present.
46
+ */
47
+ function extractMultimodalContent(
48
+ content: Array<Record<string, unknown>>,
49
+ ): string | CodexContentPart[] {
50
+ const hasImage = content.some((b) => b.type === "image");
51
+ if (!hasImage) return extractTextContent(content);
52
+
53
+ const parts: CodexContentPart[] = [];
54
+ for (const block of content) {
55
+ if (block.type === "text" && typeof block.text === "string") {
56
+ parts.push({ type: "input_text", text: block.text });
57
+ } else if (block.type === "image") {
58
+ // Anthropic format: source: { type: "base64", media_type: "image/png", data: "..." }
59
+ const source = block.source as
60
+ | { type: string; media_type: string; data: string }
61
+ | undefined;
62
+ if (source?.type === "base64" && source.media_type && source.data) {
63
+ parts.push({
64
+ type: "input_image",
65
+ image_url: `data:${source.media_type};base64,${source.data}`,
66
+ });
67
+ }
68
+ }
69
+ }
70
+ return parts.length > 0 ? parts : "";
71
+ }
72
+
73
  /**
74
  * Convert Anthropic message content blocks into native Codex input items.
75
+ * Handles text, image, tool_use, and tool_result blocks.
76
  */
77
  function contentToInputItems(
78
  role: "user" | "assistant",
 
84
 
85
  const items: CodexInputItem[] = [];
86
 
87
+ // Build content (text or multimodal) for the message itself
88
+ const hasToolBlocks = content.some((b) => b.type === "tool_use" || b.type === "tool_result");
89
+ if (role === "user") {
90
+ const extracted = extractMultimodalContent(content);
91
+ if (extracted || !hasToolBlocks) {
92
+ items.push({ role: "user", content: extracted || "" });
93
+ }
94
+ } else {
95
+ // Assistant messages: text-only (Codex doesn't support structured assistant content)
96
+ const text = extractTextContent(content);
97
+ if (text || !hasToolBlocks) {
98
+ items.push({ role: "assistant", content: text });
99
+ }
100
  }
101
 
102
  for (const block of content) {
src/translation/gemini-to-codex.ts CHANGED
@@ -10,6 +10,7 @@ import type {
10
  import type {
11
  CodexResponsesRequest,
12
  CodexInputItem,
 
13
  } from "../proxy/codex-api.js";
14
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
15
  import { getConfig } from "../config.js";
@@ -26,6 +27,30 @@ function extractTextFromParts(parts: GeminiPart[]): string {
26
  .join("\n");
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  /**
30
  * Convert Gemini content parts into native Codex input items.
31
  */
@@ -36,10 +61,17 @@ function partsToInputItems(
36
  const items: CodexInputItem[] = [];
37
  const hasFunctionParts = parts.some((p) => p.functionCall || p.functionResponse);
38
 
39
- // Collect text content
40
- const text = extractTextFromParts(parts);
41
- if (text || !hasFunctionParts) {
42
- items.push({ role, content: text });
 
 
 
 
 
 
 
43
  }
44
 
45
  // Track call_ids by function name to correlate functionCall → functionResponse
 
10
  import type {
11
  CodexResponsesRequest,
12
  CodexInputItem,
13
+ CodexContentPart,
14
  } from "../proxy/codex-api.js";
15
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
16
  import { getConfig } from "../config.js";
 
27
  .join("\n");
28
  }
29
 
30
+ /**
31
+ * Build multimodal content (text + images) from Gemini parts.
32
+ * Returns plain string if text-only, or CodexContentPart[] if images present.
33
+ */
34
+ function extractMultimodalFromParts(
35
+ parts: GeminiPart[],
36
+ ): string | CodexContentPart[] {
37
+ const hasImage = parts.some((p) => p.inlineData);
38
+ if (!hasImage) return extractTextFromParts(parts);
39
+
40
+ const codexParts: CodexContentPart[] = [];
41
+ for (const p of parts) {
42
+ if (!p.thought && p.text) {
43
+ codexParts.push({ type: "input_text", text: p.text });
44
+ } else if (p.inlineData) {
45
+ codexParts.push({
46
+ type: "input_image",
47
+ image_url: `data:${p.inlineData.mimeType};base64,${p.inlineData.data}`,
48
+ });
49
+ }
50
+ }
51
+ return codexParts.length > 0 ? codexParts : "";
52
+ }
53
+
54
  /**
55
  * Convert Gemini content parts into native Codex input items.
56
  */
 
61
  const items: CodexInputItem[] = [];
62
  const hasFunctionParts = parts.some((p) => p.functionCall || p.functionResponse);
63
 
64
+ // Build content — multimodal for user, text-only for assistant
65
+ if (role === "user") {
66
+ const content = extractMultimodalFromParts(parts);
67
+ if (content || !hasFunctionParts) {
68
+ items.push({ role: "user", content: content || "" });
69
+ }
70
+ } else {
71
+ const text = extractTextFromParts(parts);
72
+ if (text || !hasFunctionParts) {
73
+ items.push({ role: "assistant", content: text });
74
+ }
75
  }
76
 
77
  // Track call_ids by function name to correlate functionCall → functionResponse
src/translation/openai-to-codex.ts CHANGED
@@ -6,6 +6,7 @@ import type { ChatCompletionRequest, ChatMessage } from "../types/openai.js";
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
 
9
  } from "../proxy/codex-api.js";
10
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
11
  import { getConfig } from "../config.js";
@@ -26,6 +27,47 @@ function extractText(content: ChatMessage["content"]): string {
26
  .join("\n");
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  /**
31
  * Convert a ChatCompletionRequest to a CodexResponsesRequest.
@@ -95,7 +137,7 @@ export function translateToCodexRequest(
95
  output: extractText(msg.content),
96
  });
97
  } else {
98
- input.push({ role: "user", content: extractText(msg.content) });
99
  }
100
  }
101
 
 
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
9
+ CodexContentPart,
10
  } from "../proxy/codex-api.js";
11
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
12
  import { getConfig } from "../config.js";
 
27
  .join("\n");
28
  }
29
 
30
+ /**
31
+ * Extract content from a message, preserving images as structured content parts.
32
+ * Returns a plain string if text-only, or CodexContentPart[] if images are present.
33
+ */
34
+ function extractContent(
35
+ content: ChatMessage["content"],
36
+ ): string | CodexContentPart[] {
37
+ if (content == null) return "";
38
+ if (typeof content === "string") return content;
39
+
40
+ const hasImage = content.some((p) => p.type === "image_url");
41
+ if (!hasImage) {
42
+ // Text-only: return plain string (preserves existing behavior)
43
+ return content
44
+ .filter((p) => p.type === "text" && p.text)
45
+ .map((p) => p.text!)
46
+ .join("\n");
47
+ }
48
+
49
+ // Multimodal: convert to Codex content parts
50
+ const parts: CodexContentPart[] = [];
51
+ for (const p of content) {
52
+ if (p.type === "text" && p.text) {
53
+ parts.push({ type: "input_text", text: p.text });
54
+ } else if (p.type === "image_url") {
55
+ // OpenAI format: image_url: { url: "data:..." } or image_url: "string"
56
+ const imageUrl = p.image_url as
57
+ | string
58
+ | { url: string; detail?: string }
59
+ | undefined;
60
+ if (!imageUrl) continue;
61
+ const url = typeof imageUrl === "string" ? imageUrl : imageUrl.url;
62
+ if (url) {
63
+ parts.push({ type: "input_image", image_url: url });
64
+ }
65
+ }
66
+ }
67
+
68
+ return parts.length > 0 ? parts : "";
69
+ }
70
+
71
 
72
  /**
73
  * Convert a ChatCompletionRequest to a CodexResponsesRequest.
 
137
  output: extractText(msg.content),
138
  });
139
  } else {
140
+ input.push({ role: "user", content: extractContent(msg.content) });
141
  }
142
  }
143
 
src/types/gemini.ts CHANGED
@@ -8,6 +8,11 @@ import { z } from "zod";
8
  const GeminiPartSchema = z.object({
9
  text: z.string().optional(),
10
  thought: z.boolean().optional(),
 
 
 
 
 
11
  // Function calling fields (accepted for compatibility, not forwarded to Codex)
12
  functionCall: z.object({
13
  name: z.string(),
@@ -74,9 +79,15 @@ export interface GeminiFunctionResponse {
74
  response?: Record<string, unknown>;
75
  }
76
 
 
 
 
 
 
77
  export interface GeminiPart {
78
  text?: string;
79
  thought?: boolean;
 
80
  functionCall?: GeminiFunctionCall;
81
  functionResponse?: GeminiFunctionResponse;
82
  }
 
8
  const GeminiPartSchema = z.object({
9
  text: z.string().optional(),
10
  thought: z.boolean().optional(),
11
+ // Inline image data
12
+ inlineData: z.object({
13
+ mimeType: z.string(),
14
+ data: z.string(),
15
+ }).optional(),
16
  // Function calling fields (accepted for compatibility, not forwarded to Codex)
17
  functionCall: z.object({
18
  name: z.string(),
 
79
  response?: Record<string, unknown>;
80
  }
81
 
82
/**
 * Inline binary payload for a Gemini content part.
 * Carries base64-encoded bytes plus their MIME type (e.g. "image/png").
 */
export interface GeminiInlineData {
  mimeType: string;
  data: string;
}

/**
 * One part of a Gemini request content entry. Parts are effectively
 * variants: a given part typically carries only one of these fields.
 */
export interface GeminiPart {
  text?: string;
  // Marks model "thought" text; thought parts are excluded from translation.
  thought?: boolean;
  // Inline image bytes — translated to a Codex input_image data URI.
  inlineData?: GeminiInlineData;
  functionCall?: GeminiFunctionCall;
  functionResponse?: GeminiFunctionResponse;
}