icebear0828 Claude Opus 4.6 committed on
Commit
142c9c4
·
1 Parent(s): 2df0167

feat: add image input support for all API routes

Browse files

Previously all image content was silently discarded by the translation
layer. Now image_url (OpenAI), image/base64 (Anthropic), and inlineData
(Gemini) content parts are translated to Codex input_image format and
passed through to the backend.

Closes #25

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

CHANGELOG.md CHANGED
@@ -8,6 +8,7 @@
8
 
9
  ### Added
10
 
 
11
  - 每窗口使用量计数器:Dashboard 主显示当前窗口内的请求数和 Token 用量,累计总量降为次要灰色小字;窗口过期时自动归零(时间驱动,零 API 开销),后端同步作为双保险校正
12
  - 窗口时长显示:从后端同步 `limit_window_seconds`,AccountCard header 显示窗口时长 badge(如 `3h`),重置时间行追加窗口时长文字
13
  - Dashboard 账号列表新增手动刷新按钮:点击重新拉取额度数据,刷新中按钮旋转并禁用;独立 `refreshing` 状态确保刷新时列表不清空;标题行右侧显示"更新于 HH:MM:SS"时间戳(桌面端可见)
 
8
 
9
  ### Added
10
 
11
+ - 图片输入支持:OpenAI、Anthropic、Gemini 三种格式的图片内容现在可以正确透传到 Codex 后端(`input_image` + data URI),此前图片被静默丢弃
12
  - 每窗口使用量计数器:Dashboard 主显示当前窗口内的请求数和 Token 用量,累计总量降为次要灰色小字;窗口过期时自动归零(时间驱动,零 API 开销),后端同步作为双保险校正
13
  - 窗口时长显示:从后端同步 `limit_window_seconds`,AccountCard header 显示窗口时长 badge(如 `3h`),重置时间行追加窗口时长文字
14
  - Dashboard 账号列表新增手动刷新按钮:点击重新拉取额度数据,刷新中按钮旋转并禁用;独立 `refreshing` 状态确保刷新时列表不清空;标题行右侧显示"更新于 HH:MM:SS"时间戳(桌面端可见)
src/proxy/codex-api.ts CHANGED
@@ -36,8 +36,13 @@ export interface CodexResponsesRequest {
36
  previous_response_id?: string | null;
37
  }
38
 
 
 
 
 
 
39
  export type CodexInputItem =
40
- | { role: "user"; content: string }
41
  | { role: "assistant"; content: string }
42
  | { role: "system"; content: string }
43
  | { type: "function_call"; id?: string; call_id: string; name: string; arguments: string }
 
36
  previous_response_id?: string | null;
37
  }
38
 
39
/**
 * Structured content part for multimodal Codex input.
 *
 * - `input_text`: a plain-text segment of the user message.
 * - `input_image`: an image given as a URL string — the translation layers
 *   build `data:<media_type>;base64,<data>` URIs for inline image bytes.
 */
export type CodexContentPart =
  | { type: "input_text"; text: string }
  | { type: "input_image"; image_url: string };
43
+
44
  export type CodexInputItem =
45
+ | { role: "user"; content: string | CodexContentPart[] }
46
  | { role: "assistant"; content: string }
47
  | { role: "system"; content: string }
48
  | { type: "function_call"; id?: string; call_id: string; name: string; arguments: string }
src/translation/anthropic-to-codex.ts CHANGED
@@ -6,6 +6,7 @@ import type { AnthropicMessagesRequest } from "../types/anthropic.js";
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
 
9
  } from "../proxy/codex-api.js";
10
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
11
  import { getConfig } from "../config.js";
@@ -39,9 +40,39 @@ function extractTextContent(
39
  .join("\n");
40
  }
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  /**
43
  * Convert Anthropic message content blocks into native Codex input items.
44
- * Handles text, tool_use, and tool_result blocks.
45
  */
46
  function contentToInputItems(
47
  role: "user" | "assistant",
@@ -53,10 +84,19 @@ function contentToInputItems(
53
 
54
  const items: CodexInputItem[] = [];
55
 
56
- // Collect text blocks first
57
- const text = extractTextContent(content);
58
- if (text || !content.some((b) => b.type === "tool_use" || b.type === "tool_result")) {
59
- items.push({ role, content: text });
 
 
 
 
 
 
 
 
 
60
  }
61
 
62
  for (const block of content) {
 
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
9
+ CodexContentPart,
10
  } from "../proxy/codex-api.js";
11
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
12
  import { getConfig } from "../config.js";
 
40
  .join("\n");
41
  }
42
 
43
+ /**
44
+ * Build multimodal content (text + images) from Anthropic blocks.
45
+ * Returns plain string if text-only, or CodexContentPart[] if images present.
46
+ */
47
+ function extractMultimodalContent(
48
+ content: Array<Record<string, unknown>>,
49
+ ): string | CodexContentPart[] {
50
+ const hasImage = content.some((b) => b.type === "image");
51
+ if (!hasImage) return extractTextContent(content);
52
+
53
+ const parts: CodexContentPart[] = [];
54
+ for (const block of content) {
55
+ if (block.type === "text" && typeof block.text === "string") {
56
+ parts.push({ type: "input_text", text: block.text });
57
+ } else if (block.type === "image") {
58
+ // Anthropic format: source: { type: "base64", media_type: "image/png", data: "..." }
59
+ const source = block.source as
60
+ | { type: string; media_type: string; data: string }
61
+ | undefined;
62
+ if (source?.type === "base64" && source.media_type && source.data) {
63
+ parts.push({
64
+ type: "input_image",
65
+ image_url: `data:${source.media_type};base64,${source.data}`,
66
+ });
67
+ }
68
+ }
69
+ }
70
+ return parts.length > 0 ? parts : "";
71
+ }
72
+
73
  /**
74
  * Convert Anthropic message content blocks into native Codex input items.
75
+ * Handles text, image, tool_use, and tool_result blocks.
76
  */
77
  function contentToInputItems(
78
  role: "user" | "assistant",
 
84
 
85
  const items: CodexInputItem[] = [];
86
 
87
+ // Build content (text or multimodal) for the message itself
88
+ const hasToolBlocks = content.some((b) => b.type === "tool_use" || b.type === "tool_result");
89
+ if (role === "user") {
90
+ const extracted = extractMultimodalContent(content);
91
+ if (extracted || !hasToolBlocks) {
92
+ items.push({ role: "user", content: extracted || "" });
93
+ }
94
+ } else {
95
+ // Assistant messages: text-only (Codex doesn't support structured assistant content)
96
+ const text = extractTextContent(content);
97
+ if (text || !hasToolBlocks) {
98
+ items.push({ role: "assistant", content: text });
99
+ }
100
  }
101
 
102
  for (const block of content) {
src/translation/gemini-to-codex.ts CHANGED
@@ -10,6 +10,7 @@ import type {
10
  import type {
11
  CodexResponsesRequest,
12
  CodexInputItem,
 
13
  } from "../proxy/codex-api.js";
14
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
15
  import { getConfig } from "../config.js";
@@ -26,6 +27,30 @@ function extractTextFromParts(parts: GeminiPart[]): string {
26
  .join("\n");
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  /**
30
  * Convert Gemini content parts into native Codex input items.
31
  */
@@ -36,10 +61,17 @@ function partsToInputItems(
36
  const items: CodexInputItem[] = [];
37
  const hasFunctionParts = parts.some((p) => p.functionCall || p.functionResponse);
38
 
39
- // Collect text content
40
- const text = extractTextFromParts(parts);
41
- if (text || !hasFunctionParts) {
42
- items.push({ role, content: text });
 
 
 
 
 
 
 
43
  }
44
 
45
  // Track call_ids by function name to correlate functionCall → functionResponse
 
10
  import type {
11
  CodexResponsesRequest,
12
  CodexInputItem,
13
+ CodexContentPart,
14
  } from "../proxy/codex-api.js";
15
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
16
  import { getConfig } from "../config.js";
 
27
  .join("\n");
28
  }
29
 
30
+ /**
31
+ * Build multimodal content (text + images) from Gemini parts.
32
+ * Returns plain string if text-only, or CodexContentPart[] if images present.
33
+ */
34
+ function extractMultimodalFromParts(
35
+ parts: GeminiPart[],
36
+ ): string | CodexContentPart[] {
37
+ const hasImage = parts.some((p) => p.inlineData);
38
+ if (!hasImage) return extractTextFromParts(parts);
39
+
40
+ const codexParts: CodexContentPart[] = [];
41
+ for (const p of parts) {
42
+ if (!p.thought && p.text) {
43
+ codexParts.push({ type: "input_text", text: p.text });
44
+ } else if (p.inlineData) {
45
+ codexParts.push({
46
+ type: "input_image",
47
+ image_url: `data:${p.inlineData.mimeType};base64,${p.inlineData.data}`,
48
+ });
49
+ }
50
+ }
51
+ return codexParts.length > 0 ? codexParts : "";
52
+ }
53
+
54
  /**
55
  * Convert Gemini content parts into native Codex input items.
56
  */
 
61
  const items: CodexInputItem[] = [];
62
  const hasFunctionParts = parts.some((p) => p.functionCall || p.functionResponse);
63
 
64
+ // Build content — multimodal for user, text-only for assistant
65
+ if (role === "user") {
66
+ const content = extractMultimodalFromParts(parts);
67
+ if (content || !hasFunctionParts) {
68
+ items.push({ role: "user", content: content || "" });
69
+ }
70
+ } else {
71
+ const text = extractTextFromParts(parts);
72
+ if (text || !hasFunctionParts) {
73
+ items.push({ role: "assistant", content: text });
74
+ }
75
  }
76
 
77
  // Track call_ids by function name to correlate functionCall → functionResponse
src/translation/openai-to-codex.ts CHANGED
@@ -6,6 +6,7 @@ import type { ChatCompletionRequest, ChatMessage } from "../types/openai.js";
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
 
9
  } from "../proxy/codex-api.js";
10
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
11
  import { getConfig } from "../config.js";
@@ -26,6 +27,47 @@ function extractText(content: ChatMessage["content"]): string {
26
  .join("\n");
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  /**
31
  * Convert a ChatCompletionRequest to a CodexResponsesRequest.
@@ -95,7 +137,7 @@ export function translateToCodexRequest(
95
  output: extractText(msg.content),
96
  });
97
  } else {
98
- input.push({ role: "user", content: extractText(msg.content) });
99
  }
100
  }
101
 
 
6
  import type {
7
  CodexResponsesRequest,
8
  CodexInputItem,
9
+ CodexContentPart,
10
  } from "../proxy/codex-api.js";
11
  import { resolveModelId, getModelInfo } from "../models/model-store.js";
12
  import { getConfig } from "../config.js";
 
27
  .join("\n");
28
  }
29
 
30
+ /**
31
+ * Extract content from a message, preserving images as structured content parts.
32
+ * Returns a plain string if text-only, or CodexContentPart[] if images are present.
33
+ */
34
+ function extractContent(
35
+ content: ChatMessage["content"],
36
+ ): string | CodexContentPart[] {
37
+ if (content == null) return "";
38
+ if (typeof content === "string") return content;
39
+
40
+ const hasImage = content.some((p) => p.type === "image_url");
41
+ if (!hasImage) {
42
+ // Text-only: return plain string (preserves existing behavior)
43
+ return content
44
+ .filter((p) => p.type === "text" && p.text)
45
+ .map((p) => p.text!)
46
+ .join("\n");
47
+ }
48
+
49
+ // Multimodal: convert to Codex content parts
50
+ const parts: CodexContentPart[] = [];
51
+ for (const p of content) {
52
+ if (p.type === "text" && p.text) {
53
+ parts.push({ type: "input_text", text: p.text });
54
+ } else if (p.type === "image_url") {
55
+ // OpenAI format: image_url: { url: "data:..." } or image_url: "string"
56
+ const imageUrl = p.image_url as
57
+ | string
58
+ | { url: string; detail?: string }
59
+ | undefined;
60
+ if (!imageUrl) continue;
61
+ const url = typeof imageUrl === "string" ? imageUrl : imageUrl.url;
62
+ if (url) {
63
+ parts.push({ type: "input_image", image_url: url });
64
+ }
65
+ }
66
+ }
67
+
68
+ return parts.length > 0 ? parts : "";
69
+ }
70
+
71
 
72
  /**
73
  * Convert a ChatCompletionRequest to a CodexResponsesRequest.
 
137
  output: extractText(msg.content),
138
  });
139
  } else {
140
+ input.push({ role: "user", content: extractContent(msg.content) });
141
  }
142
  }
143
 
src/types/gemini.ts CHANGED
@@ -8,6 +8,11 @@ import { z } from "zod";
8
  const GeminiPartSchema = z.object({
9
  text: z.string().optional(),
10
  thought: z.boolean().optional(),
 
 
 
 
 
11
  // Function calling fields (accepted for compatibility, not forwarded to Codex)
12
  functionCall: z.object({
13
  name: z.string(),
@@ -74,9 +79,15 @@ export interface GeminiFunctionResponse {
74
  response?: Record<string, unknown>;
75
  }
76
 
 
 
 
 
 
77
  export interface GeminiPart {
78
  text?: string;
79
  thought?: boolean;
 
80
  functionCall?: GeminiFunctionCall;
81
  functionResponse?: GeminiFunctionResponse;
82
  }
 
8
  const GeminiPartSchema = z.object({
9
  text: z.string().optional(),
10
  thought: z.boolean().optional(),
11
+ // Inline image data
12
+ inlineData: z.object({
13
+ mimeType: z.string(),
14
+ data: z.string(),
15
+ }).optional(),
16
  // Function calling fields (accepted for compatibility, not forwarded to Codex)
17
  functionCall: z.object({
18
  name: z.string(),
 
79
  response?: Record<string, unknown>;
80
  }
81
 
82
/**
 * Inline binary payload for a Gemini content part.
 * Carries base64-encoded bytes plus their MIME type (e.g. "image/png").
 */
export interface GeminiInlineData {
  mimeType: string;
  data: string;
}

/**
 * One part of a Gemini request content entry. Parts are effectively
 * variants: a given part typically carries only one of these fields.
 */
export interface GeminiPart {
  text?: string;
  // Marks model "thought" text; thought parts are excluded from translation.
  thought?: boolean;
  // Inline image bytes — translated to a Codex input_image data URI.
  inlineData?: GeminiInlineData;
  functionCall?: GeminiFunctionCall;
  functionResponse?: GeminiFunctionResponse;
}