Spaces:
Paused
Paused
icebear0828 Claude Opus 4.6 committed on
Commit ·
142c9c4
1
Parent(s): 2df0167
feat: add image input support for all API routes
Browse files
Previously all image content was silently discarded by the translation
layer. Now image_url (OpenAI), image/base64 (Anthropic), and inlineData
(Gemini) content parts are translated to Codex input_image format and
passed through to the backend.
Closes #25
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- CHANGELOG.md +1 -0
- src/proxy/codex-api.ts +6 -1
- src/translation/anthropic-to-codex.ts +45 -5
- src/translation/gemini-to-codex.ts +36 -4
- src/translation/openai-to-codex.ts +43 -1
- src/types/gemini.ts +11 -0
CHANGELOG.md
CHANGED
|
@@ -8,6 +8,7 @@
|
|
| 8 |
|
| 9 |
### Added
|
| 10 |
|
|
|
|
| 11 |
- 每窗口使用量计数器:Dashboard 主显示当前窗口内的请求数和 Token 用量,累计总量降为次要灰色小字;窗口过期时自动归零(时间驱动,零 API 开销),后端同步作为双保险校正
|
| 12 |
- 窗口时长显示:从后端同步 `limit_window_seconds`,AccountCard header 显示窗口时长 badge(如 `3h`),重置时间行追加窗口时长文字
|
| 13 |
- Dashboard 账号列表新增手动刷新按钮:点击重新拉取额度数据,刷新中按钮旋转并禁用;独立 `refreshing` 状态确保刷新时列表不清空;标题行右侧显示"更新于 HH:MM:SS"时间戳(桌面端可见)
|
|
|
|
| 8 |
|
| 9 |
### Added
|
| 10 |
|
| 11 |
+
- 图片输入支持:OpenAI、Anthropic、Gemini 三种格式的图片内容现在可以正确透传到 Codex 后端(`input_image` + data URI),此前图片被静默丢弃
|
| 12 |
- 每窗口使用量计数器:Dashboard 主显示当前窗口内的请求数和 Token 用量,累计总量降为次要灰色小字;窗口过期时自动归零(时间驱动,零 API 开销),后端同步作为双保险校正
|
| 13 |
- 窗口时长显示:从后端同步 `limit_window_seconds`,AccountCard header 显示窗口时长 badge(如 `3h`),重置时间行追加窗口时长文字
|
| 14 |
- Dashboard 账号列表新增手动刷新按钮:点击重新拉取额度数据,刷新中按钮旋转并禁用;独立 `refreshing` 状态确保刷新时列表不清空;标题行右侧显示"更新于 HH:MM:SS"时间戳(桌面端可见)
|
src/proxy/codex-api.ts
CHANGED
|
@@ -36,8 +36,13 @@ export interface CodexResponsesRequest {
|
|
| 36 |
previous_response_id?: string | null;
|
| 37 |
}
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
export type CodexInputItem =
|
| 40 |
-
| { role: "user"; content: string }
|
| 41 |
| { role: "assistant"; content: string }
|
| 42 |
| { role: "system"; content: string }
|
| 43 |
| { type: "function_call"; id?: string; call_id: string; name: string; arguments: string }
|
|
|
|
| 36 |
previous_response_id?: string | null;
|
| 37 |
}
|
| 38 |
|
| 39 |
+
/** Structured content part for multimodal Codex input. */
|
| 40 |
+
export type CodexContentPart =
|
| 41 |
+
| { type: "input_text"; text: string }
|
| 42 |
+
| { type: "input_image"; image_url: string };
|
| 43 |
+
|
| 44 |
export type CodexInputItem =
|
| 45 |
+
| { role: "user"; content: string | CodexContentPart[] }
|
| 46 |
| { role: "assistant"; content: string }
|
| 47 |
| { role: "system"; content: string }
|
| 48 |
| { type: "function_call"; id?: string; call_id: string; name: string; arguments: string }
|
src/translation/anthropic-to-codex.ts
CHANGED
|
@@ -6,6 +6,7 @@ import type { AnthropicMessagesRequest } from "../types/anthropic.js";
|
|
| 6 |
import type {
|
| 7 |
CodexResponsesRequest,
|
| 8 |
CodexInputItem,
|
|
|
|
| 9 |
} from "../proxy/codex-api.js";
|
| 10 |
import { resolveModelId, getModelInfo } from "../models/model-store.js";
|
| 11 |
import { getConfig } from "../config.js";
|
|
@@ -39,9 +40,39 @@ function extractTextContent(
|
|
| 39 |
.join("\n");
|
| 40 |
}
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
/**
|
| 43 |
* Convert Anthropic message content blocks into native Codex input items.
|
| 44 |
-
* Handles text, tool_use, and tool_result blocks.
|
| 45 |
*/
|
| 46 |
function contentToInputItems(
|
| 47 |
role: "user" | "assistant",
|
|
@@ -53,10 +84,19 @@ function contentToInputItems(
|
|
| 53 |
|
| 54 |
const items: CodexInputItem[] = [];
|
| 55 |
|
| 56 |
-
//
|
| 57 |
-
const
|
| 58 |
-
if (
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
for (const block of content) {
|
|
|
|
| 6 |
import type {
|
| 7 |
CodexResponsesRequest,
|
| 8 |
CodexInputItem,
|
| 9 |
+
CodexContentPart,
|
| 10 |
} from "../proxy/codex-api.js";
|
| 11 |
import { resolveModelId, getModelInfo } from "../models/model-store.js";
|
| 12 |
import { getConfig } from "../config.js";
|
|
|
|
| 40 |
.join("\n");
|
| 41 |
}
|
| 42 |
|
| 43 |
+
/**
 * Build the Codex `content` value for an Anthropic message, keeping images.
 *
 * When no block has `type === "image"`, the result is the plain string
 * produced by `extractTextContent`. Otherwise the result is a list of
 * `input_text` / `input_image` parts in the same order as the source blocks.
 *
 * Image blocks are translated from their `source` object:
 * - `{ type: "base64", media_type, data }` becomes a `data:` URI.
 * - `{ type: "url", url }` is forwarded unchanged.
 * Image blocks with any other or incomplete source are skipped.
 *
 * @param content Anthropic content blocks of a single message.
 * @returns A plain string for text-only content, an array of Codex content
 *   parts when at least one part could be built, or `""` when image blocks
 *   were present but nothing usable could be extracted.
 */
function extractMultimodalContent(
  content: Array<Record<string, unknown>>,
): string | CodexContentPart[] {
  if (!content.some((block) => block.type === "image")) {
    return extractTextContent(content);
  }

  const parts: CodexContentPart[] = [];
  for (const block of content) {
    if (block.type === "text") {
      if (typeof block.text === "string") {
        parts.push({ type: "input_text", text: block.text });
      }
      continue;
    }
    if (block.type !== "image") continue;

    const imageUrl = anthropicImageSourceToUrl(block.source);
    if (imageUrl !== undefined) {
      parts.push({ type: "input_image", image_url: imageUrl });
    }
  }
  return parts.length > 0 ? parts : "";
}

/**
 * Turn an Anthropic image `source` object into a URL usable as `image_url`.
 *
 * The value arrives as `unknown`, so every field is checked at runtime rather
 * than trusted through a type assertion.
 *
 * @returns The data URI or remote URL, or `undefined` when the source is
 *   missing, malformed, or of an unsupported type.
 */
function anthropicImageSourceToUrl(source: unknown): string | undefined {
  if (typeof source !== "object" || source === null) return undefined;
  const { type, media_type: mediaType, data, url } = source as Record<string, unknown>;

  if (type === "base64") {
    const complete =
      typeof mediaType === "string" && mediaType !== "" &&
      typeof data === "string" && data !== "";
    return complete ? `data:${mediaType};base64,${data}` : undefined;
  }

  if (type === "url") {
    // NOTE(review): assumes the backend can fetch remote image URLs — confirm.
    return typeof url === "string" && url !== "" ? url : undefined;
  }

  return undefined;
}
|
| 72 |
+
|
| 73 |
/**
|
| 74 |
* Convert Anthropic message content blocks into native Codex input items.
|
| 75 |
+
* Handles text, image, tool_use, and tool_result blocks.
|
| 76 |
*/
|
| 77 |
function contentToInputItems(
|
| 78 |
role: "user" | "assistant",
|
|
|
|
| 84 |
|
| 85 |
const items: CodexInputItem[] = [];
|
| 86 |
|
| 87 |
+
// Build content (text or multimodal) for the message itself
|
| 88 |
+
const hasToolBlocks = content.some((b) => b.type === "tool_use" || b.type === "tool_result");
|
| 89 |
+
if (role === "user") {
|
| 90 |
+
const extracted = extractMultimodalContent(content);
|
| 91 |
+
if (extracted || !hasToolBlocks) {
|
| 92 |
+
items.push({ role: "user", content: extracted || "" });
|
| 93 |
+
}
|
| 94 |
+
} else {
|
| 95 |
+
// Assistant messages: text-only (Codex doesn't support structured assistant content)
|
| 96 |
+
const text = extractTextContent(content);
|
| 97 |
+
if (text || !hasToolBlocks) {
|
| 98 |
+
items.push({ role: "assistant", content: text });
|
| 99 |
+
}
|
| 100 |
}
|
| 101 |
|
| 102 |
for (const block of content) {
|
src/translation/gemini-to-codex.ts
CHANGED
|
@@ -10,6 +10,7 @@ import type {
|
|
| 10 |
import type {
|
| 11 |
CodexResponsesRequest,
|
| 12 |
CodexInputItem,
|
|
|
|
| 13 |
} from "../proxy/codex-api.js";
|
| 14 |
import { resolveModelId, getModelInfo } from "../models/model-store.js";
|
| 15 |
import { getConfig } from "../config.js";
|
|
@@ -26,6 +27,30 @@ function extractTextFromParts(parts: GeminiPart[]): string {
|
|
| 26 |
.join("\n");
|
| 27 |
}
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
/**
|
| 30 |
* Convert Gemini content parts into native Codex input items.
|
| 31 |
*/
|
|
@@ -36,10 +61,17 @@ function partsToInputItems(
|
|
| 36 |
const items: CodexInputItem[] = [];
|
| 37 |
const hasFunctionParts = parts.some((p) => p.functionCall || p.functionResponse);
|
| 38 |
|
| 39 |
-
//
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
// Track call_ids by function name to correlate functionCall → functionResponse
|
|
|
|
| 10 |
import type {
|
| 11 |
CodexResponsesRequest,
|
| 12 |
CodexInputItem,
|
| 13 |
+
CodexContentPart,
|
| 14 |
} from "../proxy/codex-api.js";
|
| 15 |
import { resolveModelId, getModelInfo } from "../models/model-store.js";
|
| 16 |
import { getConfig } from "../config.js";
|
|
|
|
| 27 |
.join("\n");
|
| 28 |
}
|
| 29 |
|
| 30 |
+
/**
 * Build the Codex `content` value for a Gemini message, keeping inline images.
 *
 * When no part carries `inlineData`, the result is the plain string produced
 * by `extractTextFromParts`. Otherwise each part contributes, in order:
 * - an `input_text` part for its `text`, unless the part is flagged `thought`
 *   or the text is empty;
 * - an `input_image` part for its `inlineData`, encoded as a `data:` URI.
 *
 * Text and inline data are checked independently, so a part that carries both
 * keeps both. Inline data with an empty `mimeType` or `data` is skipped
 * instead of producing a malformed data URI.
 *
 * @param parts Gemini parts of a single content entry.
 * @returns A plain string for text-only input, an array of Codex content
 *   parts when at least one part could be built, or `""` otherwise.
 */
function extractMultimodalFromParts(
  parts: GeminiPart[],
): string | CodexContentPart[] {
  if (!parts.some((part) => part.inlineData)) {
    return extractTextFromParts(parts);
  }

  const codexParts: CodexContentPart[] = [];
  for (const part of parts) {
    if (!part.thought && part.text) {
      codexParts.push({ type: "input_text", text: part.text });
    }

    const inline = part.inlineData;
    // NOTE(review): mimeType is not restricted to image/* here; non-image
    // inline data is still sent as input_image — confirm the backend tolerates it.
    if (inline && inline.mimeType && inline.data) {
      codexParts.push({
        type: "input_image",
        image_url: `data:${inline.mimeType};base64,${inline.data}`,
      });
    }
  }
  return codexParts.length > 0 ? codexParts : "";
}
|
| 53 |
+
|
| 54 |
/**
|
| 55 |
* Convert Gemini content parts into native Codex input items.
|
| 56 |
*/
|
|
|
|
| 61 |
const items: CodexInputItem[] = [];
|
| 62 |
const hasFunctionParts = parts.some((p) => p.functionCall || p.functionResponse);
|
| 63 |
|
| 64 |
+
// Build content — multimodal for user, text-only for assistant
|
| 65 |
+
if (role === "user") {
|
| 66 |
+
const content = extractMultimodalFromParts(parts);
|
| 67 |
+
if (content || !hasFunctionParts) {
|
| 68 |
+
items.push({ role: "user", content: content || "" });
|
| 69 |
+
}
|
| 70 |
+
} else {
|
| 71 |
+
const text = extractTextFromParts(parts);
|
| 72 |
+
if (text || !hasFunctionParts) {
|
| 73 |
+
items.push({ role: "assistant", content: text });
|
| 74 |
+
}
|
| 75 |
}
|
| 76 |
|
| 77 |
// Track call_ids by function name to correlate functionCall → functionResponse
|
src/translation/openai-to-codex.ts
CHANGED
|
@@ -6,6 +6,7 @@ import type { ChatCompletionRequest, ChatMessage } from "../types/openai.js";
|
|
| 6 |
import type {
|
| 7 |
CodexResponsesRequest,
|
| 8 |
CodexInputItem,
|
|
|
|
| 9 |
} from "../proxy/codex-api.js";
|
| 10 |
import { resolveModelId, getModelInfo } from "../models/model-store.js";
|
| 11 |
import { getConfig } from "../config.js";
|
|
@@ -26,6 +27,47 @@ function extractText(content: ChatMessage["content"]): string {
|
|
| 26 |
.join("\n");
|
| 27 |
}
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
/**
|
| 31 |
* Convert a ChatCompletionRequest to a CodexResponsesRequest.
|
|
@@ -95,7 +137,7 @@ export function translateToCodexRequest(
|
|
| 95 |
output: extractText(msg.content),
|
| 96 |
});
|
| 97 |
} else {
|
| 98 |
-
input.push({ role: "user", content:
|
| 99 |
}
|
| 100 |
}
|
| 101 |
|
|
|
|
| 6 |
import type {
|
| 7 |
CodexResponsesRequest,
|
| 8 |
CodexInputItem,
|
| 9 |
+
CodexContentPart,
|
| 10 |
} from "../proxy/codex-api.js";
|
| 11 |
import { resolveModelId, getModelInfo } from "../models/model-store.js";
|
| 12 |
import { getConfig } from "../config.js";
|
|
|
|
| 27 |
.join("\n");
|
| 28 |
}
|
| 29 |
|
| 30 |
+
/**
 * Translate OpenAI chat message content into Codex user content.
 *
 * - `null` / `undefined` yields `""`; a string is returned unchanged.
 * - An array without any `image_url` part is flattened to its non-empty text
 *   parts joined by newlines.
 * - An array with at least one `image_url` part becomes a list of
 *   `input_text` / `input_image` parts in source order. `image_url` may be a
 *   bare string or an object with a `url` field; parts without a usable URL
 *   are skipped.
 *
 * @returns A plain string for text-only content, an array of Codex content
 *   parts for multimodal content, or `""` when nothing usable was found.
 */
function extractContent(
  content: ChatMessage["content"],
): string | CodexContentPart[] {
  if (content == null) return "";
  if (typeof content === "string") return content;

  let sawImagePart = false;
  for (const part of content) {
    if (part.type === "image_url") {
      sawImagePart = true;
      break;
    }
  }

  if (!sawImagePart) {
    // Text-only array: collapse to a single newline-separated string.
    const lines: string[] = [];
    for (const part of content) {
      if (part.type === "text" && part.text) {
        lines.push(part.text);
      }
    }
    return lines.join("\n");
  }

  // At least one image part: emit structured Codex parts.
  const converted: CodexContentPart[] = [];
  for (const part of content) {
    if (part.type === "text" && part.text) {
      converted.push({ type: "input_text", text: part.text });
      continue;
    }
    if (part.type !== "image_url") continue;

    // Either a bare URL string or the object form `{ url, detail? }`.
    const raw = part.image_url as
      | string
      | { url: string; detail?: string }
      | undefined;
    if (!raw) continue;

    const resolved = typeof raw === "string" ? raw : raw.url;
    if (resolved) {
      converted.push({ type: "input_image", image_url: resolved });
    }
  }

  if (converted.length === 0) return "";
  return converted;
}
|
| 70 |
+
|
| 71 |
|
| 72 |
/**
|
| 73 |
* Convert a ChatCompletionRequest to a CodexResponsesRequest.
|
|
|
|
| 137 |
output: extractText(msg.content),
|
| 138 |
});
|
| 139 |
} else {
|
| 140 |
+
input.push({ role: "user", content: extractContent(msg.content) });
|
| 141 |
}
|
| 142 |
}
|
| 143 |
|
src/types/gemini.ts
CHANGED
|
@@ -8,6 +8,11 @@ import { z } from "zod";
|
|
| 8 |
const GeminiPartSchema = z.object({
|
| 9 |
text: z.string().optional(),
|
| 10 |
thought: z.boolean().optional(),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
// Function calling fields (accepted for compatibility, not forwarded to Codex)
|
| 12 |
functionCall: z.object({
|
| 13 |
name: z.string(),
|
|
@@ -74,9 +79,15 @@ export interface GeminiFunctionResponse {
|
|
| 74 |
response?: Record<string, unknown>;
|
| 75 |
}
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
export interface GeminiPart {
|
| 78 |
text?: string;
|
| 79 |
thought?: boolean;
|
|
|
|
| 80 |
functionCall?: GeminiFunctionCall;
|
| 81 |
functionResponse?: GeminiFunctionResponse;
|
| 82 |
}
|
|
|
|
| 8 |
const GeminiPartSchema = z.object({
|
| 9 |
text: z.string().optional(),
|
| 10 |
thought: z.boolean().optional(),
|
| 11 |
+
// Inline image data
|
| 12 |
+
inlineData: z.object({
|
| 13 |
+
mimeType: z.string(),
|
| 14 |
+
data: z.string(),
|
| 15 |
+
}).optional(),
|
| 16 |
// Function calling fields (accepted for compatibility, not forwarded to Codex)
|
| 17 |
functionCall: z.object({
|
| 18 |
name: z.string(),
|
|
|
|
| 79 |
response?: Record<string, unknown>;
|
| 80 |
}
|
| 81 |
|
| 82 |
+
export interface GeminiInlineData {
|
| 83 |
+
mimeType: string;
|
| 84 |
+
data: string;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
export interface GeminiPart {
|
| 88 |
text?: string;
|
| 89 |
thought?: boolean;
|
| 90 |
+
inlineData?: GeminiInlineData;
|
| 91 |
functionCall?: GeminiFunctionCall;
|
| 92 |
functionResponse?: GeminiFunctionResponse;
|
| 93 |
}
|