Commit ·
48b03a2
1
Parent(s): 9d849dd
perf: enable Gradio queue concurrency (fix "long queue" serialization)
Browse files
The Space had no demo.queue() call, so Gradio's default_concurrency_limit of 1
serialised every run_ui research session — every user from the second
concurrent one onward just piled up behind a single worker, producing the
"long queue of requests pending" warning.
run_ui is I/O-bound (it waits on the OSC vLLM endpoint, Serper, Jina and
Azure over HTTP), so it parallelises well. Add demo.queue() with
default_concurrency_limit=12 (tunable via QUEST_CONCURRENCY) and max_size=80;
set launch max_threads to max(40, QUEST_CONCURRENCY * 3) so it scales with the limit. The real ceiling is the 8 OSC vLLM instances
behind the nginx load balancer.
app.py
CHANGED
|
@@ -2532,5 +2532,20 @@ with gr.Blocks(
|
|
| 2532 |
)
|
| 2533 |
|
| 2534 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2535 |
if __name__ == "__main__":
|
| 2536 |
-
demo.launch()
|
|
|
|
| 2532 |
)
|
| 2533 |
|
| 2534 |
|
| 2535 |
+
# The research agent is almost entirely I/O-bound (it waits on the OSC vLLM
|
| 2536 |
+
# endpoint, Serper, Jina and Azure over HTTP), so many runs can proceed in
|
| 2537 |
+
# parallel on even a small CPU box. Gradio's default_concurrency_limit is 1,
|
| 2538 |
+
# which serialises every run_ui call and is what produces the "long queue of
|
| 2539 |
+
# requests pending" warning. Lift it; the real ceiling is the 8 OSC vLLM
|
| 2540 |
+
# instances behind the nginx load balancer. Tunable via the QUEST_CONCURRENCY
|
| 2541 |
+
# Space variable without a code change.
|
| 2542 |
+
QUEST_CONCURRENCY = int(os.getenv("QUEST_CONCURRENCY", "12"))
|
| 2543 |
+
QUEST_QUEUE_MAX = int(os.getenv("QUEST_QUEUE_MAX", "80"))
|
| 2544 |
+
|
| 2545 |
+
demo.queue(
|
| 2546 |
+
default_concurrency_limit=QUEST_CONCURRENCY,
|
| 2547 |
+
max_size=QUEST_QUEUE_MAX,
|
| 2548 |
+
)
|
| 2549 |
+
|
| 2550 |
if __name__ == "__main__":
|
| 2551 |
+
demo.launch(max_threads=max(40, QUEST_CONCURRENCY * 3))
|