latency
Hi,
I tried the following example on an H100 GPU, but the latency is 4.4 seconds. Any idea why?
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
import json
import time
import onnxruntime_genai as og
from common import apply_chat_template, get_config, get_generator_params_args, get_guidance, get_guidance_args, get_search_options, register_ep, set_logger
def main(args):
    """Run one non-streaming chat turn and print the output and its latency.

    Loads the model found at ``args.model_path``, appends the system prompt
    and the user prompt to a generator, generates tokens until the generator
    reports it is done or ``args.max_new_tokens`` tokens were produced, and
    finally decodes and prints everything at once.

    Args:
        args: Parsed command-line namespace. Reads ``debug``, ``verbose``,
            ``timings``, ``execution_provider``, ``ep_path``, ``use_winml``,
            ``model_path``, ``system_prompt``, ``user_prompt``,
            ``max_new_tokens``, ``response_format`` and, when a response
            format is set, ``tools_file``, ``text_output``, ``tool_output``,
            ``tool_call_start`` and ``tool_call_end``.

    Raises:
        ValueError: If ``args.user_prompt`` is empty.
    """
    # Fail fast: reject a missing prompt before paying for the model load.
    if not args.user_prompt:
        raise ValueError("--user_prompt is required")

    if args.debug:
        set_logger()
    register_ep(args.execution_provider, args.ep_path, args.use_winml)

    if args.verbose:
        print("Loading model...")
    config = get_config(args.model_path, args.execution_provider)
    model = og.Model(config)
    if args.verbose:
        print("Model loaded")

    tokenizer = og.Tokenizer(model)
    params = og.GeneratorParams(model)
    search_options = get_search_options(args)
    params.set_search_options(**search_options)

    guidance_type, guidance_data, tools = "", "", ""
    message = [{"role": "system", "content": args.system_prompt}]
    if args.response_format != "":
        guidance_type, guidance_data, tools = get_guidance(
            response_format=args.response_format,
            filepath=args.tools_file,
            text_output=args.text_output,
            tool_output=args.tool_output,
            tool_call_start=args.tool_call_start,
            tool_call_end=args.tool_call_end,
        )
        # NOTE(review): the pasted source lost its indentation; these two
        # statements are assumed to belong to the guidance branch - confirm.
        message[0]["tools"] = tools
        params.set_guidance(guidance_type, guidance_data)

    generator = og.Generator(model, params)

    # Apply system prompt; fall back to the raw text when templating fails.
    try:
        system_prompt = apply_chat_template(
            model_path=args.model_path,
            tokenizer=tokenizer,
            messages=json.dumps(message),
            tools=tools,
            add_generation_prompt=False,
        )
    except Exception:
        system_prompt = args.system_prompt
    system_tokens = tokenizer.encode(system_prompt)
    generator.append_tokens(system_tokens)

    # Apply user prompt; same best-effort fallback as above.
    user_message = [{"role": "user", "content": args.user_prompt}]
    try:
        user_prompt = apply_chat_template(
            model_path=args.model_path,
            tokenizer=tokenizer,
            messages=json.dumps(user_message),
            add_generation_prompt=True,
        )
    except Exception:
        user_prompt = args.user_prompt
    user_tokens = tokenizer.encode(user_prompt)
    generator.append_tokens(user_tokens)

    # Generation (no streaming).
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    # NOTE(review): the timer starts after both append_tokens() calls, so any
    # prompt processing they perform is not part of the reported latency.
    start_time = time.perf_counter()
    output_token_ids = []
    try:
        while not generator.is_done():
            # A max_new_tokens of 0 is falsy and therefore means "no limit".
            if args.max_new_tokens and len(output_token_ids) >= args.max_new_tokens:
                break
            generator.generate_next_token()
            output_token_ids.append(generator.get_next_tokens()[0])
    except KeyboardInterrupt:
        print("Generation interrupted.")
    end_time = time.perf_counter()
    latency = end_time - start_time

    # Decode all tokens at once
    decode_stream = tokenizer.create_stream()
    output_text = "".join(decode_stream.decode(tid) for tid in output_token_ids)

    print("Output:")
    print(output_text)
    print(f"\nLatency (until all output tokens received): {latency:.3f} seconds")

    if args.timings:
        # Both the system and the user tokens were appended to the generator,
        # so both count towards the prompt length.
        print(f"Prompt tokens: {len(system_tokens) + len(user_tokens)}")
        print(f"Generated tokens: {len(output_token_ids)}")
        print(f"Total tokens (model): {generator.token_count()}")
        # Latency alone is hard to judge; throughput puts it in context.
        if latency > 0:
            print(f"Generated tokens per second: {len(output_token_ids) / latency:.2f}")
if __name__ == "__main__":
    # Command-line entry point: declare the script's own options, let the
    # shared helpers register theirs, then hand the parsed namespace to main().
    parser = argparse.ArgumentParser(
        argument_default=argparse.SUPPRESS,
        description="End-to-end AI chat example for ORT GenAI",
    )
    parser.add_argument(
        "-m",
        "--model_path",
        type=str,
        required=True,
        help="ONNX model folder path (must contain genai_config.json and model.onnx)",
    )
    parser.add_argument(
        "-e",
        "--execution_provider",
        type=str,
        required=False,
        default="follow_config",
        choices=["cuda", "cpu", "dml", "follow_config"],
        help="Execution provider to run the ONNX Runtime session with.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        help="Verbose output",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        default=False,
        help="Debug mode",
    )
    parser.add_argument(
        "-g",
        "--timings",
        action="store_true",
        default=False,
        help="Print timing information",
    )
    parser.add_argument(
        "-sp",
        "--system_prompt",
        type=str,
        default="You are a helpful AI assistant.",
        help="System prompt",
    )
    parser.add_argument(
        "-rw",
        "--rewind",
        action="store_true",
        default=False,
        help="Rewind after each generation",
    )
    parser.add_argument(
        "--ep_path",
        type=str,
        required=False,
        default="",
        help="Path to execution provider DLL/SO",
    )
    parser.add_argument(
        "--use_winml",
        action=argparse.BooleanOptionalAction,
        required=False,
        default=False,
        help="Use WinML",
    )
    parser.add_argument(
        "--user_prompt",
        type=str,
        default="tell me how the stock market works",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=1000,
        help="Maximum number of new tokens to generate (0 = unlimited)",
    )

    # Options contributed by the shared helper module.
    get_generator_params_args(parser)
    get_guidance_args(parser)

    args = parser.parse_args()
    main(args)
Command and output:
python model-chat.py -m cuda/cuda-int4-kquant-block-32-mixed -e follow_config
Output:
<|channel|>analysis<|message|>The user asked: "tell me how the stock market works". We need to provide a concise yet somewhat thorough explanation of how the stock market operates, including the basics: what is a stock, how trading occurs, primary vs secondary markets, market participants, order types, exchanges, regulations, indices, short-term vs long-term, role of broker, market makers, etc. Must be helpful, accurate, and maybe give overview. Also possibly ask what level of depth. But a straightforward explanation should be fine.
We can explain: Stock is a security representing ownership in a corporation. The market is a place where stocks are bought/sold, enabling companies to raise capital, and investors to gain exposure to that equity.
Explain key players: Companies (issuers), Investors (retail and institutional), Exchanges (NASDAQ, NYSE, etc.), Intermediaries (brokers, dealers, market makers). Provide a description of the Primary Market: initial public offering (IPO) where firms sell shares to the public for first time. Secondary Market: shares traded between investors, where price determined by supply/demand.
Explain order flow: When an investor wants to buy, places a buy order; if limit price meets or better than any sell order, the exchange matches orders. Execution via automated systems or floor trading. Stock price fluctuations: The price at which the last trade occurred is the closing price; indices track group.
Explain regulation: Regulated by Securities and Exchange Commission (SEC), requires disclosure, reporting, preventing manipulations.
Explain margin, shorting, options, derivatives.
Explain the role of a stockbroker: individuals trade through brokers, may be full-service or discount; brokerage companies maintain custodial accounts and provide market access.
Explain the concept of bid-ask spread, liquidity.
Explain how companies can issue new shares or retire them, effect on ownership.
Explain about dividends, earnings, growth, the fundamentals.
Explain the significance of a ticker symbol.
Perhaps answer can be broken in sections. The question is general, we can give a concise overview. Possibly ask for more specific interest like "Are you interested in primary vs secondary" but maybe not.
Thus deliver answer.<|end|><|start|>assistant<|channel|>final<|message|>Below is a “big‑picture” tour of the stock market – what it is, who plays there, and how the actual buying and selling takes place. It skips the most technical trading‑algorithm details but focuses on the key concepts that matter to any investor, company, or student of finance.
---
## 1. What Is a Stock?
* **Ownership slice** – A stock (or share) represents a fraction of ownership in a corporate entity.
* **Equity vs. debt** – Unlike a bond (which is a loan), a share gives you a claim on the company’s assets and earnings.
* **Types of common securities**
* **Primary** – First sale from a company (IPO, new stock issue).
* **Secondary** – Purchases between investors; what most people think of when they say “the market.”
---
## 2. How Can a Company Raise Capital?
1. **Initial Public Offering (IPO)**
*The first time a private company sells stock to the public.*
* A company hires an investment bank to underwrite the sale, price the shares, and then sells them to institutional and retail investors through a regulated exchange (NYSE, Nasdaq, etc.).
2. **Follow‑On Offer (FTO)**
*Additional shares issued by the company after the IPO.*
* Often sold to the same buyers as the IPO to minimize market disruption.*
---
## 2. How Can a Company Raise Capital?
1. **Initial Public Offering (IPO)**
*The first time a private company sells stock to the public.*
* A company hires an investment bank to underwrite the sale, price the shares, and then sells them to institutional and retail investors through a regulated exchange (NYSE, Nasdaq, etc.).
2. **Follow‑On Offer (FTO)**
*Additional shares issued by the company after the IPO.*
* Often sold to the same buyers as the IPO to minimize market disruption.*
3. **Secondary Offer**
*Shares previously sold by the company that are now being re‑sold.*
* No new capital is raised; existing shareholders exit or increase holdings.*
---
## 3. Primary vs. Secondary Markets
| Feature | Primary | Secondary |
|---------|---------|-----------|
| **Seller** | Company (or its agent) | Investors (retail, institutional, hedge funds) |
| **Buyer** | Investors | Investors |
| **Price** | Fixed by underwriter | Determined by market demand & supply |
| **Where** | Issuance via securities filings (S‑1, etc.) | Exchange (floor/floorless) or OTC |
| **Purpose** | Raise capital | Liquidity, speculation, arbitrage, hedging |
---
## 4. Main Actors Today
| Actor | Role |
|-------|------|
| **Issuers** | Corporations raising money (via IPO or secondary). |
| **Investors** | Individuals, pension funds, mutual funds, ETFs, hedge funds. |
| **Stock Exchanges** | NYSE, Nasdaq, CBOE, others: regulated venues for trade matching. |
| **Broker/Dealer** | Full‑service, discount, or electronic. Facilitate
Latency (until all output tokens received): 4.429 seconds
Here is the same run with verbose logging enabled:
ORTGENAI_ORT_VERBOSE_LOGGING=1 python model-chat.py \
-m cuda/cuda-int4-kquant-block-32-mixed \
-e follow_config \
> log.txt 2>&1
log.txt:
Loading model...
2026-02-28 03:54:51.367167804 [I:onnxruntime:, device_discovery_common.cc:34 operator()] Discovered OrtHardwareDevice {vendor_id:0x10de, device_id:0x2330, vendor:, type:1, metadata: [Discrete=1, card_idx=0, pci_bus_id=0000:8d:00.0, ]}
2026-02-28 03:54:51.367191306 [I:onnxruntime:, device_discovery_common.cc:34 operator()] Discovered OrtHardwareDevice {vendor_id:0x8086, device_id:0x0, vendor:Intel, type:0, metadata: []}
2026-02-28 03:54:51.367518254 [V:onnxruntime:, env.cc:566 GetRuntimePath] Getting runtime path as parent directory of binary: /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/onnxruntime/capi/libonnxruntime.so.1.24.2
2026-02-28 03:54:51.367600917 [V:onnxruntime:, env.cc:566 GetRuntimePath] Getting runtime path as parent directory of binary: /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/onnxruntime/capi/libonnxruntime.so.1.24.2
2026-02-28 03:54:51.722258941 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206922, index: 1, mask: {4, 5, }
2026-02-28 03:54:51.722332623 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206921, index: 0, mask: {2, 3, }
2026-02-28 03:54:51.722320171 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206923, index: 2, mask: {6, 7, }
2026-02-28 03:54:51.722384617 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206926, index: 5, mask: {12, 13, }
2026-02-28 03:54:51.722369712 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206924, index: 3, mask: {8, 9, }
2026-02-28 03:54:51.722429306 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206927, index: 6, mask: {14, 15, }
2026-02-28 03:54:51.722321186 [V:onnxruntime:, env.cc:221 ThreadMain] pthread_setaffinity_np succeed for thread: 206925, index: 4, mask: {10, 11, }
2026-02-28 03:54:51.724131272 [I:onnxruntime:, bfc_arena.cc:27 BFCArena] Creating BFCArena for Cuda with following configs: initial_chunk_size_bytes: 1048576 max_dead_bytes_per_chunk: 134217728 initial_growth_chunk_size_bytes: 2097152 max_power_of_two_extend_bytes: 1073741824 memory limit: 18446744073709551615 arena_extend_strategy: 0
2026-02-28 03:54:51.724150784 [V:onnxruntime:, bfc_arena.cc:64 BFCArena] Creating 21 bins of max chunk size 256 to 268435456
2026-02-28 03:54:51.724158807 [I:onnxruntime:, bfc_arena.cc:27 BFCArena] Creating BFCArena for CudaPinned with following configs: initial_chunk_size_bytes: 1048576 max_dead_bytes_per_chunk: 134217728 initial_growth_chunk_size_bytes: 2097152 max_power_of_two_extend_bytes: 1073741824 memory limit: 18446744073709551615 arena_extend_strategy: 0
2026-02-28 03:54:51.724164815 [V:onnxruntime:, bfc_arena.cc:64 BFCArena] Creating 21 bins of max chunk size 256 to 268435456
2026-02-28 03:54:51.724172355 [I:onnxruntime:, bfc_arena.cc:27 BFCArena] Creating BFCArena for Cpu with following configs: initial_chunk_size_bytes: 1048576 max_dead_bytes_per_chunk: 134217728 initial_growth_chunk_size_bytes: 2097152 max_power_of_two_extend_bytes: 1073741824 memory limit: 18446744073709551615 arena_extend_strategy: 0
2026-02-28 03:54:51.724178219 [V:onnxruntime:, bfc_arena.cc:64 BFCArena] Creating 21 bins of max chunk size 256 to 268435456
2026-02-28 03:54:51.753893270 [I:onnxruntime:, cuda_execution_provider.cc:234 PerThreadContext] cuDNN version: 90501
2026-02-28 03:54:51.753957920 [I:onnxruntime:, bfc_arena.cc:280 Reserve] Reserving memory in BFCArena for Cpu size: 4
2026-02-28 03:54:51.754015492 [I:onnxruntime:onnxruntime-genai, inference_session.cc:606 TraceSessionOptions] Session Options { execution_mode:0 execution_order:DEFAULT enable_profiling:0 optimized_model_filepath:"" enable_mem_pattern:1 enable_mem_reuse:1 enable_cpu_mem_arena:1 profile_file_prefix:onnxruntime_profile_ session_logid:onnxruntime-genai session_log_severity_level:-1 session_log_verbosity_level:0 max_num_graph_transformation_steps:10 graph_optimization_level:4 intra_op_param:OrtThreadPoolParams { thread_pool_size: 8 auto_set_affinity: 0 allow_spinning: 1 dynamic_block_base_: 0 stack_size: 0 affinity_str: set_denormal_as_zero: 0 } inter_op_param:OrtThreadPoolParams { thread_pool_size: 0 auto_set_affinity: 0 allow_spinning: 1 dynamic_block_base_: 0 stack_size: 0 affinity_str: set_denormal_as_zero: 0 } use_per_session_threads:1 thread_pool_allow_spinning:1 use_deterministic_compute:0 ep_selection_policy:0 config_options: { } }
2026-02-28 03:54:51.754036816 [I:onnxruntime:onnxruntime-genai, inference_session.cc:422 ConstructorCommon] Creating and using per session threadpools since use_per_session_threads_ is true
2026-02-28 03:54:51.754044708 [I:onnxruntime:onnxruntime-genai, inference_session.cc:446 ConstructorCommon] Dynamic block base set to 0
2026-02-28 03:54:51.768185220 [I:onnxruntime:onnxruntime-genai, inference_session.cc:2085 Initialize] Initializing session.
2026-02-28 03:54:51.768302264 [I:onnxruntime:onnxruntime-genai, inference_session.cc:2123 Initialize] Adding default CPU execution provider.
2026-02-28 03:54:51.768356929 [I:onnxruntime:, bfc_arena.cc:27 BFCArena] Creating BFCArena for Cuda with following configs: initial_chunk_size_bytes: 1048576 max_dead_bytes_per_chunk: 134217728 initial_growth_chunk_size_bytes: 2097152 max_power_of_two_extend_bytes: 1073741824 memory limit: 18446744073709551615 arena_extend_strategy: 0
2026-02-28 03:54:51.768403136 [V:onnxruntime:, bfc_arena.cc:64 BFCArena] Creating 21 bins of max chunk size 256 to 268435456
2026-02-28 03:54:51.768448478 [I:onnxruntime:, bfc_arena.cc:27 BFCArena] Creating BFCArena for CudaPinned with following configs: initial_chunk_size_bytes: 1048576 max_dead_bytes_per_chunk: 134217728 initial_growth_chunk_size_bytes: 2097152 max_power_of_two_extend_bytes: 1073741824 memory limit: 18446744073709551615 arena_extend_strategy: 0
2026-02-28 03:54:51.768498601 [V:onnxruntime:, bfc_arena.cc:64 BFCArena] Creating 21 bins of max chunk size 256 to 268435456
2026-02-28 03:54:51.768544860 [I:onnxruntime:, bfc_arena.cc:27 BFCArena] Creating BFCArena for Cpu with following configs: initial_chunk_size_bytes: 1048576 max_dead_bytes_per_chunk: 134217728 initial_growth_chunk_size_bytes: 2097152 max_power_of_two_extend_bytes: 1073741824 memory limit: 18446744073709551615 arena_extend_strategy: 0
2026-02-28 03:54:51.768588558 [V:onnxruntime:, bfc_arena.cc:64 BFCArena] Creating 21 bins of max chunk size 256 to 268435456
2026-02-28 03:54:51.768729738 [I:onnxruntime:onnxruntime-genai, graph_partitioner.cc:1220 InlineFunctionsAOT] This model does not have any local functions defined. AOT Inlining is not performed
2026-02-28 03:54:51.776319395 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer EnsureUniqueDQForNodeUnit modified: 0 with status: OK
2026-02-28 03:54:51.776572330 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer Level1_RuleBasedTransformer modified: 0 with status: OK
2026-02-28 03:54:51.776661891 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer DoubleQDQPairsRemover modified: 0 with status: OK
2026-02-28 03:54:51.776960357 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer ConstantSharing modified: 0 with status: OK
2026-02-28 03:54:51.777824328 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer CommonSubexpressionElimination modified: 0 with status: OK
2026-02-28 03:54:51.777988525 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer ConstantFolding modified: 0 with status: OK
2026-02-28 03:54:51.778081212 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatMulAddFusion modified: 0 with status: OK
2026-02-28 03:54:51.778169345 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer ReshapeFusion modified: 0 with status: OK
2026-02-28 03:54:51.778236717 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer FreeDimensionOverrideTransformer modified: 0 with status: OK
2026-02-28 03:54:51.778323948 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer GeluFusionL1 modified: 0 with status: OK
2026-02-28 03:54:51.778417992 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer LayerNormFusionL1 modified: 0 with status: OK
2026-02-28 03:54:51.778507267 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer QDQPropagationTransformer modified: 0 with status: OK
2026-02-28 03:54:51.778592580 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer WeightBiasQuantization modified: 0 with status: OK
2026-02-28 03:54:51.778677894 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer EnsureUniqueDQForNodeUnit modified: 0 with status: OK
2026-02-28 03:54:51.778762128 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer WhereDummyDq modified: 0 with status: OK
2026-02-28 03:54:51.779148160 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer TransposeOptimizer modified: 0 with status: OK
2026-02-28 03:54:51.783683041 [I:onnxruntime:onnxruntime-genai, fallback_cpu_capability.cc:89 operator()] Candidate for fallback CPU execution: /model/attn_mask_reformat/attn_mask_subgraph/Gather
2026-02-28 03:54:51.787829016 [I:onnxruntime:onnxruntime-genai, fallback_cpu_capability.cc:167 GetCpuPreferredNodes] ORT optimization- Force fallback to CPU execution for node: /model/attn_mask_reformat/attn_mask_subgraph/Gather because the CPU execution path is deemed faster than overhead involved with execution on other EPs capable of executing this node
2026-02-28 03:54:51.787889408 [I:onnxruntime:onnxruntime-genai, fallback_cpu_capability.cc:167 GetCpuPreferredNodes] ORT optimization- Force fallback to CPU execution for node: /model/attn_mask_reformat/attn_mask_subgraph/Gather/Cast because the CPU execution path is deemed faster than overhead involved with execution on other EPs capable of executing this node
2026-02-28 03:54:51.788210493 [I:onnxruntime:onnxruntime-genai, inference_session.cc:1508 TransformGraph] Running graph optimizations in loop 1 time/s (Graph Optimizations Loop Level : 1)
2026-02-28 03:54:51.788304292 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer Level2_RuleBasedTransformer modified: 0 with status: OK
2026-02-28 03:54:51.788499856 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer TransposeOptimizer_CPUExecutionProvider modified: 0 with status: OK
2026-02-28 03:54:51.788585990 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer QDQS8ToU8Transformer modified: 0 with status: OK
2026-02-28 03:54:51.788717901 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer QDQSelectorActionTransformer modified: 0 with status: OK
2026-02-28 03:54:51.788804233 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer GemmActivationFusion modified: 0 with status: OK
2026-02-28 03:54:51.788890031 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatMulIntegerToFloatFusion modified: 0 with status: OK
2026-02-28 03:54:51.788974724 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer DynamicQuantizeMatMulFusion modified: 0 with status: OK
2026-02-28 03:54:51.789062964 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer ConvActivationFusion modified: 0 with status: OK
2026-02-28 03:54:51.789108906 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer GeluFusionL2 modified: 0 with status: OK
2026-02-28 03:54:51.789154019 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer LayerNormFusionL2 modified: 0 with status: OK
2026-02-28 03:54:51.789247977 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer SimplifiedLayerNormFusion modified: 0 with status: OK
2026-02-28 03:54:51.789335997 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer AttentionFusion modified: 0 with status: OK
2026-02-28 03:54:51.789438331 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer EmbedLayerNormFusion modified: 0 with status: OK
2026-02-28 03:54:51.789671612 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer GatherSliceToSplitFusion modified: 0 with status: OK
2026-02-28 03:54:51.789761362 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer GatherToSliceFusion modified: 0 with status: OK
2026-02-28 03:54:51.789849693 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatmulTransposeFusion modified: 0 with status: OK
2026-02-28 03:54:51.789957353 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer BiasGeluFusion modified: 0 with status: OK
2026-02-28 03:54:51.790051310 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer GroupQueryAttentionFusion modified: 0 with status: OK
2026-02-28 03:54:51.790137096 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatMulAddFusion modified: 0 with status: OK
2026-02-28 03:54:51.790222009 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer SkipLayerNormFusion modified: 0 with status: OK
2026-02-28 03:54:51.790309070 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer FastGeluFusion modified: 0 with status: OK
2026-02-28 03:54:51.790396387 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer QuickGeluFusion modified: 0 with status: OK
2026-02-28 03:54:51.790487848 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer BiasSoftmaxFusion modified: 0 with status: OK
2026-02-28 03:54:51.790580458 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer BiasDropoutFusion modified: 0 with status: OK
2026-02-28 03:54:51.790673410 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatMulScaleFusion modified: 0 with status: OK
2026-02-28 03:54:51.790760777 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatMulActivationFusion modified: 0 with status: OK
2026-02-28 03:54:51.790846040 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MatMulNBitsFusion modified: 0 with status: OK
2026-02-28 03:54:51.790934648 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer QDQFinalCleanupTransformer modified: 0 with status: OK
2026-02-28 03:54:51.791019158 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer NchwcTransformer modified: 0 with status: OK
2026-02-28 03:54:51.791170509 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer NhwcTransformer modified: 0 with status: OK
2026-02-28 03:54:51.791260304 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer ConvAddActivationFusion modified: 0 with status: OK
2026-02-28 03:54:51.791385738 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer RemoveDuplicateCastTransformer modified: 0 with status: OK
2026-02-28 03:54:51.791432085 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer CastFloat16Transformer modified: 0 with status: OK
2026-02-28 03:54:51.791515942 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer FuseFp16InitializerToFp32NodeTransformer modified: 0 with status: OK
2026-02-28 03:54:51.801010641 [I:onnxruntime:onnxruntime-genai, graph_transformer.cc:15 Apply] GraphTransformer MemcpyTransformer modified: 1 with status: OK
2026-02-28 03:54:51.804145588 [V:onnxruntime:onnxruntime-genai, session_state.cc:1307 VerifyEachNodeIsAssignedToAnEp] Node placements
2026-02-28 03:54:51.804236378 [V:onnxruntime:onnxruntime-genai, session_state.cc:1313 VerifyEachNodeIsAssignedToAnEp] Node(s) placed on [CPUExecutionProvider]. Number of nodes: 2
2026-02-28 03:54:51.804285061 [V:onnxruntime:onnxruntime-genai, session_state.cc:1315 VerifyEachNodeIsAssignedToAnEp] Gather (/model/attn_mask_reformat/attn_mask_subgraph/Gather)
2026-02-28 03:54:51.804337310 [V:onnxruntime:onnxruntime-genai, session_state.cc:1315 VerifyEachNodeIsAssignedToAnEp] Cast (/model/attn_mask_reformat/attn_mask_subgraph/Gather/Cast)
2026-02-28 03:54:51.804380899 [V:onnxruntime:onnxruntime-genai, session_state.cc:1313 VerifyEachNodeIsAssignedToAnEp] Node(s) placed on [CUDAExecutionProvider]. Number of nodes: 369
...
I tested your prompt ("tell me how the stock market works") on an A100 GPU.
python model-chat.py -m cuda/cuda-int4-kquant-block-32-mixed -e follow_config --verbose --timings
Here are the results I see.
Prompt length: 74, New tokens: 1919, Total tokens: 2067, Time to first: 0.42s, Prompt tokens per second: 177.25 tps, New tokens per second: 171.49 tps
Note that the new tokens being measured here include all of the reasoning and thinking tokens in addition to the actual output tokens.