diff --git "a/data/chunks/2603.10722_semantic.json" "b/data/chunks/2603.10722_semantic.json"
new file mode 100644
--- /dev/null
+++ "b/data/chunks/2603.10722_semantic.json"
@@ -0,0 +1,1052 @@ +[ + { + "chunk_id": "be7c4c0b-137a-4929-a4e7-d743864f0d9a", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a\nunified benchmark Yu Zhang (a,b), Zhicheng Zhao (c,d,*), Ze Luo (a,b,*), Chenglong Li (c,d) and Jin Tang (c,d) (a) Computer Network Information Center, Chinese Academy of Sciences, Beijing 100190, China\n(b) University of Chinese Academy of Sciences, Beijing 100190, China\n(c) Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, Anhui University, Hefei 230601, China\n(d) Information Materials and Intelligent Sensing Laboratory of Anhui Province, Anhui University, Hefei 230601, China",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 0,
    "total_chunks": 50,
    "char_count": 583,
    "word_count": 89,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "7b3b4637-e065-45ac-9a9d-98a35b73511e",
    "text": "Keywords: Visual Question Answering; UAV Traffic Scene Understanding; Optical-Thermal; Cross-Spectral Fusion; Cognitive Reasoning\nAbstract: Traffic scene understanding from unmanned aerial vehicle (UAV) platforms is crucial for intelligent transportation systems due to its flexible deployment and wide-area monitoring capabilities. However, existing methods face significant challenges in real-world surveillance, as their heavy reliance on optical imagery leads to severe performance degradation under adverse illumination conditions like nighttime and fog. Furthermore, current Visual Question Answering (VQA) models are restricted to elementary perception tasks, lacking the domain-specific regulatory knowledge required to assess complex traffic behaviors. To address these limitations, we propose a novel Cross-spectral Traffic Cognition Network (CTCNet) for robust UAV traffic scene understanding. Specifically, we design a Prototype-Guided Knowledge Embedding (PGKE) module that leverages high-level semantic prototypes from an external Traffic Regulation Memory (TRM) to anchor domain-specific knowledge into visual representations, enabling the model to comprehend complex behaviors and distinguish fine-grained traffic violations. Moreover, we develop a Quality-Aware Spectral Compensation (QASC) module that exploits the complementary characteristics of optical and thermal modalities to perform bidirectional context exchange, effectively compensating for degraded features to ensure robust representation in complex environments. In addition, we construct Traffic-VQA, the first large-scale optical-thermal infrared benchmark for cognitive UAV traffic understanding, comprising 8,180 aligned image pairs and 1.3 million question-answer pairs across 31 diverse types. Extensive experiments demonstrate that CTCNet significantly outperforms state-of-the-art methods in both cognition and perception scenarios. The dataset is available at https://github.com/YuZhang-2004/UAV-traffic-scene-understanding.\n1. 
Introduction\nWith the continuous advancement of remote sensing observation technology, Unmanned Aerial Vehicles (UAVs) have become essential components of Intelligent Transportation Systems (ITS). Compared to fixed ground-level surveillance cameras, UAVs offer flexible deployment and wide-area monitoring capabilities, enabling comprehensive observation of dynamic traffic flows. Consequently, UAV-based Visual Question Answering (UAV-VQA) [1] has emerged as a transformative technology. Unlike conventional visual approaches constrained to fundamental object detection and counting, UAV-VQA enables operators to interact with traffic scenes through ... -tial visual feature corruption and semantic ambiguity, severely bottlenecking the perception capabilities of traditional visual models and limiting the practical deployment of UAV-VQA systems. While existing VQA methods and Multimodal Large Language Models (MLLMs) designed for aerial platforms have driven significant progress in generalized spatial comprehension, they exhibit notable limitations. Representative methods such as GeoChat [5] and EarthGPT [6, 7] leverage pre-trained large models as unified interfaces to handle diverse interpretation tasks. However, a major vulnerability of these approaches is their predominant reliance on high-quality, single-modality optical (OPT) imagery.",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 1,
    "total_chunks": 50,
    "char_count": 3375,
    "word_count": 408,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "d3b53cbc-01c4-4a10-8e0f-f275c6b36b39",
    "text": "Since extracted feature representations are highly sensitive to input quality, optical signals are easily disrupted by environmental noise [8]. Consequently, these models often suffer severe perception failures when applied to challenging, all-weather traffic scenarios where the optical signal is significantly degraded. To mitigate the limitations of optical sensors, thermal infrared (TIR) imagery provides a complementary modality that captures heat signatures independent of ambient illumination, enabling effective performance in darkness and fog [2, 4, 9, 10]. Fusing optical and TIR data represents a highly promising strategy for robust, all-weather perception. Mainstream multimodal fusion strategies have evolved from simple static concatenation to sophisticated dynamic attention and correlation-driven mechanisms. Despite their success in low-level per- ... natural language queries (e.g., \"Is the white vehicle at the intersection violating traffic rules?\"), thereby providing actionable insights for dynamic traffic control and safety regulation. However, deploying UAV-VQA in real-world traffic surveillance presents significant scientific challenges. Beyond controlled ideal laboratory settings, practical UAV imagery is captured under unconstrained open-world conditions. These scenes are characterized by extreme viewpoint variations, dense distributions of tiny objects (e.g., miniaturized vehicles in wide-angle views), and severe vulnerability to environmental degradation such as nighttime darkness, glare, or dense fog [2, 3, 4]. Furthermore, interpreting complex traffic behaviors requires domain-specific regulatory knowledge, exposing a critical cognitive deficiency in current visual systems.",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 2,
    "total_chunks": 50,
    "char_count": 1743,
    "word_count": 221,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "30609aee-b2e1-4051-84ca-3250449c4944",
    "text": "As illustrated in Fig. 1, these factors lead to substan- ... -ception tasks such as object detection, these advanced cross-spectral fusion techniques have rarely been integrated into the high-level cognitive context of UAV-VQA, leaving a signifi- (Corresponding authors: zhaozhicheng@ahu.edu.cn (Z. Zhao); luoze@cnic.cn (Z. Luo)) [Figure 1 panel labels] (a) Existing Datasets (b) Scenario 1: Domain Knowledge Gap (Cognitive Missing)",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 3,
    "total_chunks": 50,
    "char_count": 494,
    "word_count": 71,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "bb740f69-099d-4658-bcbc-7f7c6f91ba3a",
    "text": "[Figure 1 diagram text] Existing datasets: satellite view, optical only, simple counting (\"How many cars in the image?\"); real-world Traffic-VQA: UAV view, aligned OPT-TIR, cognitive reasoning (\"Is there any traffic violation occurring here?\"). Scenario 1: a shallow answer (\"A white car is turning left.\") lacks traffic rules, versus the expected CTCNet answer (\"Illegal U-turn across double yellow lines.\"), a huge gap. Scenario 2: Multi Modal Fusion Gap (Static Fusion): traditional static fusion under noise interference answers \"No cars are parked.\", whereas CTCNet answers \"One car is parked in the lower right.\"",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 4,
    "total_chunks": 50,
    "char_count": 495,
    "word_count": 80,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "fdd14c9b-5664-456b-8cff-0b3ccdc6cbc9",
    "text": "[Figure 1 panel (c) labels] Data Gap, Knowledge Gap, Fusion Gap; Traffic-VQA Dataset, Prototype-Guided Knowledge Embedding (PGKE), Quality-Aware Spectral Compensation (QASC). Figure 1: Current challenges in UAV-based traffic VQA. (a) Data Gap. Existing datasets (top) rely on single-modal optical imagery for elementary perception, whereas practical surveillance (bottom) demands aligned OPT-TIR data for complex cognitive understanding.\n(b) Methodological Bottlenecks. 
General MLLMs struggle with the Domain Knowledge Gap, failing to interpret specific traffic rules\n(e.g., missing the \"illegal\" attribute of a U-turn), and the Multi-Modal Fusion Gap, where static fusion allows degraded optical noise\nto corrupt robust thermal features under adverse conditions. (c) Our Solution. The proposed CTCNet systematically bridges these\ngaps through the Traffic-VQA dataset, the PGKE module, and the QASC module. cant gap in robust multimodal feature representation for traffic Guided Knowledge Embedding (PGKE) module that leverages\nbehavior understanding. an external Traffic Regulation Memory (TRM) constructed\nRegarding cognitive capabilities, state-of-the-art MLLMs from expert knowledge. By retrieving and aligning high-level\nhave made remarkable strides in generalized visual reasoning semantic prototypes with current visual features, the PGKE\nand complex deductive tasks [11, 12, 13, 14]. However, when module injects domain-specific cognitive capabilities into the\ndeployed in the highly dynamic and regulation-intensive traffic network. Simultaneously, we develop a Quality-Aware Specscenarios typical of UAV traffic surveillance, these models en- tral Compensation (QASC) module that orchestrates a bidireccounter a fundamental cognitive bottleneck. Current MLLMs tional context exchange between optical and thermal modalipredominantly rely on broad statistical priors and surface-level ties via a dynamic attention mechanism, allowing the network\nvisual-semantic alignment, critically lacking the capacity to to selectively transfer discriminative features from the reliable\nground complex, evolving visual states into domain-specific modality to compensate for the degraded one, ensuring roregulatory frameworks. For instance, as illustrated in Fig. 1, bust representation in complex environments. Beyond these\nwhen a vehicle crosses double yellow lines, a general-purpose methodological contributions, the research field also lacks a\nMLLM may provide only a shallow spatial description (e.g., large-scale benchmark dataset with aligned multi-spectral im-\n\"A white car is turning left\"), fundamentally missing the \"il- agery and cognitive annotations, which is crucial for trainlegal\" attribute of the U-turn. This failure occurs because the ing and evaluating robust traffic understanding models. To\nmodel cannot map the dynamic trajectory to the abstract traffic fill this gap, we construct Traffic-VQA, a large-scale OPTrule. 
Without explicit regulatory memory to anchor such spe- TIR benchmark for UAV traffic understanding.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 5, + "total_chunks": 50, + "char_count": 3006, + "word_count": 397, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "3a413bb8-68e3-429a-abb4-c3962b15e70c", + "text": "It contains\ncialized rules, these models struggle to maintain logical con- 8,180 meticulously aligned image pairs with over 1.3 million\nsistency in unconstrained traffic environments, often resorting question-answer pairs, covering diverse environmental condito spurious visual correlations that result in semantic halluci- tions (sunny, night, fog) and a comprehensive taxonomy of 31\nnations and a critical perception-cognition disconnect. question types, spanning from basic perception to complex viTo address these combined challenges of multi-spectral olation understanding. Extensive experiments conducted on\ndata fusion and domain-specific cognitive depth, we propose the proposed dataset demonstrate the superiority and effectivea novel Cross-spectral Traffic Cognition Network (CTCNet). ness of CTCNet compared to existing state-of-the-art methods. We decouple the problem into two complementary objectives: The main contributions of this work are summarized as folenhancing low-level perceptual robustness and injecting high- lows:\nlevel cognitive context. Specifically, we design a PrototypeYu Zhang et al.: Page 2 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark • We introduce Traffic-VQA, the first large-scale OPT-TIR mutual attention for bidirectional guidance, whereas SHRbenchmark dedicated to cognitive traffic understanding. Net [17] and MQVQA [19] incorporated spatial hierarchical\nThis dataset contains a substantial number of complex reasoning and multistep attention to model high-order intratraffic scenarios and cognition-oriented QA pairs, pro- group object relations [17, 19]. As the field progressed toviding a critical foundation for advancing all-weather ward practical deployments, researchers developed domainUAV perception and cognition tasks. specific frameworks. FloodNet [20] provided evaluations exclusively designed for post-disaster damage assessment, while\n• Recognizing the vulnerability of single optical sig- CDVQA [21] introduced multitemporal analytical reasoning\nnals and the lack of domain-specific cognitive depth in to address semantic change detection. Furthermore, RSIVQA\ncurrent UAV traffic scene understanding, we propose [16] and TextRS [22] integrated multiple external sources to\nthe CTCNet framework to effectively harness comple- curate human-annotated answers that more closely mirror natmentary optical-thermal features and integrate domain- uralistic human queries.\nspecific regulatory knowledge. Despite these successes in fundamental perception tasks,\nearly VQA methods were constrained by rigid answer spaces • To bridge the domain knowledge gap, we design a PGKE\nand limited scalability across varied interpretation tasks. 
They module, which retrieves and embeds domain-specific\nprimarily focused on basic perceptual queries such as object regulatory semantics via a constructed traffic regulapresence and spatial counting, lacking the cognitive capacity tion memory bank. Furthermore, to enhance low-level\nto address complex, open-ended interpretation tasks in diverse perceptual robustness under adverse conditions, we furoperational environments. ther introduce a QASC module to guide the network in\nperforming dynamic bidirectional context exchange be-\n2.2. Multimodal VQA tween optical and thermal modalities. Multimodal VQA aims to leverage the complementary\n• Extensive experiments on the Traffic-VQA dataset val- physics of heterogeneous sensors—such as optical and thermal\nidate the effectiveness of the proposed CTCNet.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 6, + "total_chunks": 50, + "char_count": 3541, + "word_count": 462, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "507cfbf9-be1a-4543-b549-ab3bb3728f91", + "text": "Com- or synthetic aperture radar (SAR) configurations—to increase\npared with existing leading open-source MLLMs (e.g., informational density and analytical precision across diverse\nQwen2.5-VL [12], GeoChat [5]) and closed-source com- environmental contexts [6, 23]. Within this area, researchers\nmercial models (e.g., GPT-4o [14]), our method suc- have systematically addressed representational discrepancies\ncessfully anchors domain-specific regulatory knowl- originating from distinct imaging modalities and environmenedge, achieving substantial improvements in compre- tal degradation. To mitigate basic modality disparities, Li et\nhension accuracy for complex traffic behaviors across al. [24] introduced a multiscale feature fusion and enhanceall-weather scenarios. ment network designed to amplify foreground semantic objects, thereby improving the parsing of urban road networks\nunder suboptimal illumination. To address the intrinsic modal-\n2. Related Work\nity gap, Zhou et al. [25] developed M-SpecGene, a generalized\nThe proposed CTCNet framework builds upon and ad- foundation model that uses a cross-modality structural sparsity\nvances three interrelated research areas: visual question an- metric to quantify information density and extract modalityswering, multimodal integration, and the cognitive capabilities invariant representations. Concurrently, Zhao et al. [26] proof large language models. In what follows, we review repre- posed CDDFuse, a correlation-driven architecture that decomsentative works in each area and discuss their limitations in the poses features into modality-shared and modality-specific subcontext of UAV traffic understanding. components to improve cross-modal consistency. Moving beyond static integration approaches, Zhang et\n2.1. Visual Question Answering al. [27] formulated M2FNet, which dynamically aggregates\nVQA for aerial imagery has evolved significantly, pro- multi-spectral features via union-modal and cross-modal atgressing from modular specialist architectures to integrated an- tention mechanisms to ensure robust object detection regardalytical systems. Early methods predominantly utilized dual- less of illumination variance. 
Furthermore, Xu et al. [28] prostream architectures, employing Convolutional Neural Net- posed a unified unsupervised framework, U2Fusion, which auworks (CNNs) for visual feature extraction and Recurrent Neu- tonomously estimates the informational saliency of source imral Networks (RNNs) for question encoding [15, 16]. These agery to guide adaptive feature preservation. Generative and\nfoundational models, alongside early large-scale benchmarks prompt-guided strategies have also shown promise in bridgsuch as RSVQA-LR and RSVQA-HR [15], typically aggre- ing representational gaps. Advanced architectures such as\ngated features via basic element-wise operations. However, AT-GAN [29] and denoising diffusion models [30] have been\nsuch approaches often struggled to capture complex geospa- adapted to synthesize high-fidelity fused imagery while pretial interactions [17].", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 7, + "total_chunks": 50, + "char_count": 3063, + "word_count": 389, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b2e203dc-b8f4-4b31-aab1-527d892d8030", + "text": "The scope of these initial methods was serving intricate structural details. Specifically targeting unbroadened by datasets like RSVQAxBEN [18], which intro- constrained aerial environments, Yang et al. [31] introduced\nduced logical connectors into queries to incrementally increase GDNet for the disentanglement of optical guidance features,\nthe complexity of structural linguistics. while PromptFusion [32] harmonized multi-modality images\nTo address the misalignment between spatial visual layouts guided by explicit semantic prompts through bi-level optimizaand linguistic tokens, subsequent research introduced refined tion.\nattention mechanisms. For instance, MAIN [16] leveraged Despite these promising results across various applicaYu Zhang et al.: Page 3 of 16", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 8, + "total_chunks": 50, + "char_count": 769, + "word_count": 100, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d2ffc1c9-f292-4411-a47b-cb772c35dd83", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark tions, the potential of advanced integration techniques within integrates specialized domain rules into the visual interpretaexplicit cognitive contexts remains largely unexplored. Most tion process.\nexisting multimodal methods rely on rigid integration strategies that lack the dynamic, non-destructive context exchange\n3. Methodologyrequired to handle the extreme variance characteristic of allweather traffic scenarios. In this section, we present the proposed CTCNet framework, a cognitive prototype-anchored architecture designed to\n2.3. Multimodal Large Language Models augment MLLMs for robust traffic reasoning from UAV imRecent developments in MLLMs for aerial scenarios have agery. As illustrated in Fig. 
2, our approach addresses two funtransitioned toward open-set generalization by adopting pre- damental challenges in RSVQA: the domain knowledge gap\ntrained Large Language Models (LLMs) as unified cognitive inherent in interpreting specialized traffic behaviors, and the\ninterfaces. Early efforts, such as EarthGPT [6] and EarthGPT- robust fusion of heterogeneous sensor modalities (OPT and\nX [7], integrated various multi-sensor interpretation tasks TIR).\nthrough cross-modal comprehension and visual prompting. Specifically, we first describe the overall Gated Parallel\nTo incorporate spatial-temporal and geo-context clues, Sky- Residual Architecture (Section 3.1). We then detail the conSense [33] introduced a factorized multi-modal spatiotempo- struction of the offline TRM (Section 3.2), which serves as\nral encoder pre-trained on a large scale. Similarly, Ring- an external knowledge repository. Subsequently, we elaboMoGPT [34] unified vision, language, and localization tasks rate on the two core trainable modules: the PGKE module\nusing a location- and instruction-aware querying transformer. (Section 3.3) for explicit domain knowledge injection, and the\nWhile these foundational architectures established versatile QASC module (Section 3.4) for dynamic, context-aware multibaselines for multi-sensor data, their holistic scene interpre- spectral integration.\ntation often lacked the fine-grained spatial awareness required\nfor small, densely distributed objects. 3.1. Overall Architecture\nTo address unique spatial complexities, subsequent re- The overall architecture of CTCNet is illustrated in Fig. 2.\nsearch focused on region-level reasoning and high-resolution The framework consists of a frozen MLLM backbone couprocessing. GeoChat [5] advanced local perception by ac- pled with two parallel, task-specific enhancement branches:\ncepting region inputs for region-specific dialogues and vi- the PGKE module and the QASC module.\nsual grounding. Extending this to dynamic scenarios, Earth- Given a well-aligned pair of optical and TIR images, deDial [23] enabled multi-sensory interactive dialogues, while noted as 𝐼𝑜𝑝𝑡∈ℝ𝐻×𝑊×3 and 𝐼𝑡ℎ∈ℝ𝐻×𝑊×3, alongside a\nAirSpatialBot [35] specifically targeted fine-grained vehicle natural language query 𝑄, CTCNet aims to generate a comattribute recognition and retrieval using a 3D visual ground- prehensive textual answer 𝐴.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 9, + "total_chunks": 50, + "char_count": 3182, + "word_count": 420, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2b617536-0413-4bdf-9a16-ec9f565495f7", + "text": "To harness the generalized ining approach. Furthermore, handling ultra-high-resolution im- ferential capabilities of large-scale pre-training while adapting\nagery poses significant token explosion challenges. To address to the specificities of UAV traffic environments, we employ a\nthis, GeoLLaVA-8K [36] utilized background token pruning, frozen MLLM backbone (e.g., Qwen-VL) augmented with a\nand LRS-VQA [37] proposed a coarse-to-fine text-guided to- non-invasive residual enhancement mechanism.\nken pruning strategy. Despite improving spatial grounding, Let Φenc(⋅) denote the frozen visual encoder. 
Multi-scale\nthese methods predominantly rely on static feature extraction visual features are first extracted from both modalities, then\npathways that struggle with complex deductive logic. flattened and projected into a unified semantic embedding\nTo further enhance interpretive depth, inference-centric space, yielding the optical feature sequence 𝐅𝑜𝑝𝑡∈ℝ𝐿×𝐷and\napproaches utilizing reinforcement learning have recently the thermal feature sequence 𝐅𝑡ℎ∈ℝ𝐿×𝐷, where 𝐿is the segained traction. Frameworks such as Geo-R1 [38] and Vision- quence length of visual tokens and 𝐷is the latent feature diR1 [39] utilize verifiable rewards and group relative policy mensionality. Concurrently, the query 𝑄is tokenized and emoptimization to incentivize genuine geospatial reasoning. To bedded into a textual feature vector 𝐪∈ℝ𝐷.\novercome human annotation biases, GeoZero [40] attempts To prevent catastrophic forgetting of pre-trained knowlto elicit reasoning from scratch without predefined templates, edge, the backbone parameters are kept frozen while taskwhile RS-EoT [41] employs an iterative evidence-seeking ap- specific contextual information is injected via the parallel\nproach to mitigate pseudo-reasoning and the glance effect. trainable branches. The refined visual representation 𝐅res𝑚for\nHowever, comprehensive evaluations indicate that these mod- modality 𝑚∈{𝑜𝑝𝑡, 𝑡ℎ} is formulated as:\nels still struggle with dense, complex imagery and remain susceptible to semantic hallucinations during highly specialized 𝐅res𝑚= 𝐅𝑚+ 𝛼⋅Δ𝐅PGKE𝑚 + 𝛽⋅Δ𝐅QASC𝑚 , (1)\ninterpretation tasks. Despite these notable advancements, existing MLLMs ex- where Δ𝐅PGKE𝑚 and Δ𝐅QASC𝑚 denote the residual feature tensors\nhibit a critical perception-cognition gap when deployed in spe- generated by the PGKE and QASC modules, respectively. The\ncialized domains like UAV traffic surveillance. While profi- scalars 𝛼and 𝛽are learnable gating parameters initialized at\ncient at detecting general entities, they consistently fail to de- zero, enabling the network to autonomously regulate the injeccode complex traffic violations that require implicit regulatory tion intensity of cognitive prototypes and cross-modal context.\nknowledge. Furthermore, the absence of robust mechanisms to The fused representations from both modalities are concatealign multi-spectral observations with expert situational mem- nated and fed into the frozen LLM decoder for autoregressive\nory underscores the need for a novel architecture that explicitly answer generation. Yu Zhang et al.: Page 4 of 16", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 10, + "total_chunks": 50, + "char_count": 3152, + "word_count": 419, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "39357aa3-c8b9-4a39-baba-cb173a13967d", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark Gated Parallel Enhancement Architecture Fusion & Answering Inputs & Feature Extraction\n(a) TRM (b) PGKE\n푷ퟏ 푷 No illegal\nIs the illegal parking in the\nparking in the top … Top-K 푷ퟐ top area. Answer area? 
× Retrieval … … … Embedding Inputs 푷ퟑ Question … ∆푭풎푷푲푮푬\n풒 푭풐풑풕/푭풕풉 … 휶 휷 푭풎풓풆풔 풒 MLLM + + MLLM (c) QASC\n푭풊풎품\nOptical UAV Image … 푭풐풑풕 Add\n& Scale Softmax MatMul MatMul Decoder Encoder 푭풐풑풕 C 푭풊풎품 푭풕풉 Norm FeedForward … … …푭풐풑풕 … Weighted +\nSum\n… 푭풕풉 … 푸푨푺푪 …\n푭풐풑풕 ∆푭풎\n푭풕풉\nThermal UAV Image Normalize Linear LayerNorm Add C Concat … + Figure 2: Overall framework of CTCNet for multi-spectral UAV traffic VQA. The architecture adopts a Gated Parallel Residual\nparadigm in which the frozen, pre-trained MLLM visual features are adaptively augmented by domain-specific residual knowledge\ngenerated by the PGKE and QASC modules. The learnable gating parameters 𝛼and 𝛽regulate the intensity of cognitive and\nmultimodal context injection. Traffic Regulation Memory Construction grounding strategy across diverse scenarios, successfully idenExisting MLLMs frequently generate imprecise descrip- tifying static infrastructure such as \"linear walkways\" and\ntions when interpreting complex traffic violations, primarily \"parking areas,\" as well as dynamic anomalies like \"illegal\ndue to the absence of domain-specific episodic memory. To parking\" or \"vehicle turning.\" Notably, the grounding prompts\nbridge this gap, we establish an offline TRM, denoted as ∈ retain high efficacy even when the dominant visual features are\nℝ𝑁×𝐷𝑝, which stores 𝑁high-level semantic prototypes de- defined primarily by thermal signatures. To further improve\nrived from the training distribution.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 11, + "total_chunks": 50, + "char_count": 1760, + "word_count": 271, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "09224065-bb48-4e87-b447-adefe0820207", + "text": "As illustrated in Fig. 3, the spatial robustness against single-modality occlusions (e.g., a\nconstruction of the TRM is carried out through three phases: vehicle clearly visible in TIR imagery but obscured in the opsemantic distillation, multi-modal visual grounding, and situ- tical view by shadows or foliage), we compute the union of\nation feature aggregation. the modality-specific bounding boxes to define a unified event\nRegion of Interest (RoI), denoted as 𝐵𝑢𝑛𝑖𝑜𝑛:\n3.2.1. Semantic Phrase Generation\nFor each training triplet (𝐼𝑜𝑝𝑡, 𝐼𝑡ℎ, 𝑄, 𝐴), a text-only LLM 𝐵𝑢𝑛𝑖𝑜𝑛= 𝐵𝑜𝑝𝑡∪𝐵𝑡ℎ. (2)\nis used to distill the core visual scenario into a concise, specific\n3.2.3. Situation Feature Aggregation\nsemantic phrase, designated as 𝑃𝑠𝑒𝑚. Unlike unconstrained We propose an Epicenter Query mechanism to extract\ncaptions that tend to contain redundant information, the genera representative prototype vector from within the localized\nation prompt explicitly instructs the model to identify critical\n𝐵𝑢𝑛𝑖𝑜𝑛region. Let 𝐅𝑐𝑜𝑛𝑐𝑎𝑡= [𝐅𝑜𝑝𝑡; 𝐅𝑡ℎ] ∈ℝ𝐿×2𝐷denote thetraffic entities and their concurrent behaviors (e.g., \"a white\nconcatenated multi-modal feature map. The epicenter query\nSUV executing an illegal U-turn across double yellow lines\").\n𝐪𝑒𝑝𝑖is derived by mean-pooling over the features within theThis targeted distillation decouples the foundational visualspatial center neighborhood of 𝐵𝑢𝑛𝑖𝑜𝑛. 
The cognitive prototypesemantic elements from the grammatical structure of the raw\n𝐬∈ℝ2𝐷is then computed via a weighted aggregation:\nQA pair, yielding a clean textual anchor for the subsequent localization phase. ∑𝐿 exp(𝐅(𝑖)𝑐𝑜𝑛𝑐𝑎𝑡⋅𝐪⊤𝑒𝑝𝑖∕𝜏)\n𝐬= 𝐅(𝑖)𝑐𝑜𝑛𝑐𝑎𝑡, (3) ∑𝐿3.2.2. Multi-Modal Visual Grounding 𝑖=1 𝑗=1 exp(𝐅(𝑗)𝑐𝑜𝑛𝑐𝑎𝑡⋅𝐪⊤𝑒𝑝𝑖∕𝜏)\nTo spatially isolate the specified traffic event from the\ncomplex background, an open-set visual grounding framework where 𝜏is a temperature scaling factor. This aggregation yields\n(e.g., Qwen-VL or Grounding-DINO) is applied. Using the a refined prototype 𝐬that encapsulates the core visual-semantic\ndistilled 𝑃𝑠𝑒𝑚as the referential cue, the model predicts bound- essence of the localized traffic situation. The collection of\ning boxes for both the optical (𝐵𝑜𝑝𝑡) and thermal (𝐵𝑡ℎ) visual these vectors forms the TRM , which serves as an external\nplanes. knowledge base for subsequent complex cognitive tasks. As shown in Fig. 3, this process effectively translates ab-\n3.3.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 12, + "total_chunks": 50, + "char_count": 2393, + "word_count": 336, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "39d7a060-5f73-4046-b008-8b7b7c0bdbb8", + "text": "Prototype-Guided Knowledge Embeddingstract textual descriptors into precise, localized spatial coThe PGKE module bridges the gap between the currentordinates. The visualizations confirm the robustness of our\nvisual input and the external domain knowledge stored in the Yu Zhang et al.: Page 5 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark How many linear walkways are there ?", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 13, + "total_chunks": 50, + "char_count": 425, + "word_count": 62, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "cccc4a09-433c-4fad-b9f9-c0fafaccac21", + "text": "Is exactly one case of illegal parking there ? How many vehicles are observed turning ? Is there a car turning around in the image ? A. yes\nGrounding Prompt: one linear walkway present Grounding Prompt: an orange car illegally parked Grounding Prompt: one vehicle turning Grounding Prompt: one vehicle turning around What type of vehicle is predominantly seen in the Q. Is the number of illegal parking instances one ? Can a gridline be seen on the right side ? 
What type of road facility is located in the upper\nlower left ?", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 14, + "total_chunks": 50, + "char_count": 525, + "word_count": 94, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "be315e1a-2461-4a3e-b786-bd185edd3a85", + "text": "A. cars A. yes A. yes right ? A. motor vehicle parking area\nGrounding Prompt: cars predominately seen in the Grounding Prompt: one instance of illegal parking Grounding Prompt: a gridline visible on the right side Grounding Prompt: a motor vehicle parking area\nlower left located in the upper right Figure 3: Multi-modal visual grounding in the TRM construction pipeline. Grounding prompts generated from semantic phrase\ndistillation are used to localize traffic entities and behaviors (e.g., linear walkways, vehicle turning). Red boxes indicate the extracted\nregions of interest, demonstrating accurate text-to-region alignment in both optical and thermal imagery. It operates via a retrieve-and-align strategy designed to\ngenerate the cognitive residual feature Δ𝐅PGKE𝑚 . 퓜 Similarity & Top-K 퓟풓풆풕풓풊풆풗풆풅\nPrototype Retrieval. The current textual question embed- 풔ퟏ 풔ퟐ 풔ퟑ 풔ퟒ Retrieval\nding 𝐪is first linearly projected into the prototype space using … 푭풒\na transformation matrix 𝐖𝑞. The system computes the cosine … … … … …\nsimilarity between this projected question vector and all pro- Qwen3-VL\ntotypes in , retrieving the top-𝐾most semantically relevant Text Encoder\nprototypes to form a support set 𝑟𝑒𝑡∈ℝ𝐾×𝐷: 푸풖풆풔풕풊풐풏 Is the illegal parking in the top area?\n( )\n(𝐪𝐖𝑞)⊤ (a) Prototype Retrieval 𝑟𝑒𝑡= TopK . (4)\n‖𝐪𝐖𝑞‖‖‖ 푭풐풑풕 푪푷푨 푭풐풑풕\nThese retrieved prototypes serve as reference anchors, supply- Linear 푸ing critical feature signatures for recognizing abstract traffic …\n… Multi-Headanomalies (e.g., recalling the visual pattern associated with a LN Linear + Attention\n\"rear-end collision\"). 푭풕풊풓 Linear 푲, 푽 Cross\nCognitive Prototype Alignment. The detailed mecha- 푪푷푨\nnism of the PGKE module, including the retrieve-and-align 퓟풓풆풕풓풊풆풗풆풅 푭풕풊풓\nphase via multi-head cross-attention, is illustrated in Fig. 4. … CPA Module …\nWe employ a Multi-Head Cross-Attention framework to inject 푭풕풊풓\nthe retrieved regulatory knowledge into the visual feature hi-\n(b) Cognitive Prototype Alignment\nerarchy. For modality 𝑚, the visual features 𝐅𝑚serve as the\nQuery (𝐐𝑝𝑔𝑘𝑒), while the retrieved prototypes 𝑟𝑒𝑡serve as Figure 4: Internal architecture of the PGKE module. The module\nboth the Key (𝐊𝑝𝑔𝑘𝑒) and Value (𝐕𝑝𝑔𝑘𝑒): performs question-guided similarity retrieval to identify the top-𝐾\nmost relevant prototypes from the TRM. These prototypes serve\nas keys and values in a Multi-Head Cross-Attention mechanism, , (5) 𝐐𝑝𝑔𝑘𝑒= LN(𝐅𝑚)𝐖𝑝𝑔𝑘𝑒𝑄 injecting situational domain knowledge into the visual feature\n. (6) streams as an optimized residual increment Δ𝐅PGKE. 𝐊𝑝𝑔𝑘𝑒, 𝐕𝑝𝑔𝑘𝑒= LN(𝑟𝑒𝑡)𝐖𝑝𝑔𝑘𝑒𝐾 , LN(𝑟𝑒𝑡)𝐖𝑝𝑔𝑘𝑒𝑉 The cognitive residual is then computed as:\n(𝐐𝑝𝑔𝑘𝑒𝐊⊤𝑝𝑔𝑘𝑒 ) Throughhigh-levelthisdomainmechanism,knowledge,the visualsubstantiallyfeaturesenhancingare alignedthewithnet-\n. 
(7) Δ𝐅PGKE𝑚 = softmax √ 𝐕𝑝𝑔𝑘𝑒𝐖𝑝𝑔𝑘𝑒𝑂 work's discriminative ability in fine-grained cognitive evalua- 𝑑𝑘 Yu Zhang et al.: Page 6 of 16", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 15, + "total_chunks": 50, + "char_count": 2900, + "word_count": 419, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "48631acb-ea43-479d-9efc-e07b47527c15", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark tions. interpretation of complex traffic environments. We categorize\nthe existing dataset landscape into three distinct groups:\n3.4.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 16, + "total_chunks": 50, + "char_count": 222, + "word_count": 29, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e81a943a-2b66-4909-b4ec-277c4a8dbaac", + "text": "Quality-Aware Spectral Compensation Foundational Perception Benchmarks. Early datasets estabTo overcome the limitations of basic feature concatenation, lished the groundwork for fundamental remote sensing percepwe design the QASC module (depicted in Fig. 2 (c)), which fa- tion. The seminal RSVQA [15] and RSVQAxBEN [18] precilitates bidirectional, dynamically balanced context exchange dominantly employed template-based questions programmatbetween optical and thermal features. ically generated from OpenStreetMap (OSM) data overlaying\nIn contrast to simple concatenation or element-wise ad- low-resolution Sentinel-2 or aerial imagery. Although large\ndition, QASC leverages a symmetric bidirectional atten- in scale, the generated queries are structurally rigid (e.g., \"Is\ntion mechanism to exchange complementary context between there a building?\") and lack semantic diversity. RSIVQA [16]\nmodalities. For the fusion direction from thermal to optical and the Open-Ended dataset [22] introduced human-annotated\n(compensating 𝐅𝑜𝑝𝑡using information from 𝐅𝑡ℎ), the query is questions to increase natural language variety, yet they remain\nderived from the optical features, while the key and value pair fundamentally constrained to static object recognition within\nare derived from the thermal features: optimal optical imagery. (8) Δ𝐅QASC𝑜𝑝𝑡 = MHCA(𝐐= 𝐅𝑜𝑝𝑡, 𝐊= 𝐅𝑡ℎ, 𝐕= 𝐅𝑡ℎ).", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 17, + "total_chunks": 50, + "char_count": 1374, + "word_count": 182, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "4778d217-6b7d-4939-acc1-2a118966c55e", + "text": "Task-Specific and Disaster Response Benchmarks. A sec- ond group focuses on specific high-stakes operational scenarios. 
FloodNet [20] and SAM-VQA [42] exclusively addressConversely, the thermal features are simultaneously enriched\npost-disaster damage assessment, requiring models to evalu-by the optical context:\nate flood impacts or structural building damage. Similarly,\nΔ𝐅QASC = MHCA(𝐐= 𝐅𝑡ℎ, 𝐊= 𝐅𝑜𝑝𝑡, 𝐕= 𝐅𝑜𝑝𝑡). (9) CDVQA [21] introduces VQA specific to change detection on 𝑡ℎ\nmultitemporal images. While these datasets, alongside methStandard Layer Normalization (LN) and Feed-Forward Net- ods like TGFNet [3], advance domain-specific understanding,\nworks (FFNs) are applied following the standard Transformer their exclusive reliance on RGB sensors renders them unsuitblocks. This symmetric architecture ensures that under ad- able for continuous traffic monitoring, where nighttime and adverse conditions (e.g., deep darkness or dense fog), the modal- verse weather capabilities are essential.\nity with superior signal quality (predominantly thermal) effec- Cognition-Oriented and Large-Scale Benchmarks. Recent\ntively guides the refinement of the degraded modality (optical), efforts have aimed to align datasets with the capabilities of\nthereby maximizing the resilience and fidelity of the final joint MLLMs.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 18, + "total_chunks": 50, + "char_count": 1317, + "word_count": 175, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "96cb3801-df86-4740-9d6c-a4024a4f320b", + "text": "CRSVQA [19] introduces complex, multi-tiered\nrepresentation. questions to benchmark advanced question-driven systems. RSGPT [43] establishes a benchmark for simultaneous im-\n3.5. Loss Function age captioning and VQA. EarthVQA [44] emphasizes complex\nFollowing standard MLLM training protocols, CTCNet relational analysis for urban planning applications, whereas\nis optimized via an auto-regressive language modeling ob- LRS-VQA [37] targets gigapixel-level ultra-large imagery injective. Given the sequence of ground-truth answer tokens terpretation. EarthGPT [6] integrates a diverse set of tasks\n𝐲= {𝑦1, 𝑦2, ..., 𝑦𝑇}, the training objective is to minimize the but still relies fundamentally on optical data. Critically, none\nnegative log-likelihood: of these benchmarks provide well-aligned multimodal (optical\nand TIR) data for robust all-weather perception, nor do they\n𝑇∑ target the fine-grained cognitive tasks (e.g., traffic violation de-\n= − log 𝑃(𝑦𝑡|𝑦<𝑡, 𝐅res𝑜𝑝𝑡, 𝐅res𝑡ℎ, 𝐪; Θ), (10) tection, vehicle behavior analysis) that are central to intelligent\n𝑡=1\ntransportation systems.\nwhere Θ encompasses all trainable parameters within the In contrast, Traffic-VQA addresses these limitations by inPGKE and QASC modules, including the gating scalars. The troducing the aligned TIR modality alongside a hierarchical\nparameters of the pre-trained visual encoder and LLM back- cognitive structure tailored for dynamic, real-time traffic cogbone remain frozen throughout optimization. nition. Table 1 presents a comprehensive comparison between\nTraffic-VQA and existing datasets. Traffic-VQA Dataset 4.2. 
Data Construction Pipeline\nTo bridge the gap between elementary perception and com- Constructing a large-scale, cognitively rich, and robustly\nplex cognitive interpretation in traffic surveillance, we con- multimodal benchmark requires rigorous protocols to ensure\nstruct Traffic-VQA, the first large-scale OPT-TIR benchmark both the visual fidelity of sensor data and the semantic indedicated to cognitive traffic understanding. Unlike existing tegrity of linguistic annotations. To this end, we designed a\ndatasets that predominantly focus on rudimentary object detec- systematic Human-in-the-Loop LLM-Empowered Construction or counting in well-lit environments, Traffic-VQA is de- tion Pipeline that minimizes manual annotation effort while\nsigned to challenge models with intricate cognitive tasks under expanding the structural diversity and cognitive complexity\ndiverse and adverse environmental conditions. of the resulting queries. The construction workflow proceeds\nthrough three stages: Stage 1: Hardware-Synchronized Data\n4.1. Review of Existing Datasets Acquisition, Stage 2: Structured Attribute Annotation by ExWhile RSVQA has evolved considerably, existing bench- perts, and Stage 3: LLM-Empowered QA Generation.\nmarks exhibit clear limitations when applied to the cognitive", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 19, + "total_chunks": 50, + "char_count": 2896, + "word_count": 381, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "cc48d1ca-910f-4148-bb14-0d6028ec3b9d", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark (a) Aligned Optical-Thermal Infrared Image Pairs (c) Cognitive QA pairs for Realistic Traffic Is there a vehicle in the image that is changing A. Center / Central / Middle / Middle of the\nlanes and where is it ? (vehicle traffic behavior) image Is it possible to have a stopped bus in the image ? Yes / Correct / Right\n(deduce) / Obviously, yes", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 21, + "total_chunks": 50, + "char_count": 434, + "word_count": 76, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "1b9145d4-00dc-48f4-b539-8df2d7222d97", + "text": "Is there a possible vehicle blockage in the image A. Lower right / Bottom\nand where is it ? (traffic flow) right At the left of the image how many cars are A. 5 / Total five / Five\nlegally parked ? (vehicle traffic behavior) were sighted How many vehicles are parked illegally in the A. 3 / Three in the middle\nmiddle of the image ? (vehicle traffic violation) / Three illegal parking\nQ. Are there any vehicles occupying the non- A. Yes, 4 / Three on top,\nmotor lane to the left of the image and how many one on bottom, total 4\nin total ? 
(vehicle traffic violation) (b) Two-stage Annotation car behavior: change lane location: center + Prompt for Realistic Complex MLLM for free- number: 1 Gemini Agent form QA Traffic Scenarios quality: thermal Figure 5: Illustrative examples from the Traffic-VQA dataset. (a) Synchronized and co-registered optical and TIR UAV image\npairs across diverse urban traffic settings. (b) Examples of challenging cognitive question-answer pairs that require deep situational\nunderstanding, such as identifying traffic violations and inferring latent behavioral risks. Table 1\nComparison of Traffic-VQA with leading VQA datasets utilizing overhead imagery. (OPT:\nOptical, SAR: Synthetic Aperture Radar, TIR: Thermal Infrared) Num Num Num Qst Main QA Incl. Dataset Modal Key Tasks\nImg QA Type Generation Cognitive\nRSVQA-LR [15] 772 77k 5 Template ✗ OPT Counting, Presence\nRSVQA-HR [15] 10,659 1,066k 5 Template ✗ OPT Counting, Presence\nVQA-TextRS [22] 2,144 6.2k 4 Manual ✗ OPT Object Recognition\nRSIVQA [16] 37,264 111k 9 Template ✗ OPT Object Recognition\nCRSVQA [19] 4,639 4.6k 3 Manual ✗ OPT Multistep Analysis\nFloodNet-VQA [20] 2,188 7.4k 4 Manual ✗ OPT Disaster Assessment\nSAM-VQA [42] 2,348 10.5k 7 Template ✗ OPT Damage Analysis\nCDVQA [21] 4,662 >122k 5 Template ✓ OPT Change Detection\nRSIEval [43] 100 0.9k 4 Manual ✓ OPT Captioning & VQA\nEarthVQA [44] 6,000 209k 3 Manual + Template ✓ OPT Relational Analysis\nOSVQA [3] 6,008 pairs 1,037k 16 Manual + Template ✓ OPT & SAR Multimodal Perception\nTraffic-VQA (Ours) 8,180 pairs 1,301k 31 Manual + LLM ✓ OPT & TIR Traffic Cognition Stage 1: Hardware-Synchronized Data Acquisition. To en- ceptual robustness, flights were conducted under a wide range\nsure spatial and temporal consistency, we deployed the DJI of illumination conditions (daylight, dusk/dawn, and nightM300 RTK drone platform equipped with the Zenmuse H20T time) as well as adverse weather (dense fog). Post-acquisition,\nintegrated payload. This payload houses a calibrated wide- a manual screening protocol was enforced.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 22, + "total_chunks": 50, + "char_count": 2568, + "word_count": 416, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "bb05591e-e4ea-4e21-9e44-2da73d993534", + "text": "Image pairs exangle optical sensor alongside a radiometric thermal camera, hibiting motion blur, excessive occlusion, or corrupted sensor\nenabling hardware-synchronized capture of optical and TIR data were discarded, yielding a curated dataset of 8,180 highimagery. This integrated hardware approach eliminates the quality, well-aligned image pairs.\ntemporal desynchronization and spatial misalignment artifacts Stage 2: Structured Attribute Annotation by Experts. 
Recommon in conventional multi-sensor arrays.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 23, + "total_chunks": 50, + "char_count": 510, + "word_count": 63, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "668eb02c-fa63-4556-9957-8729601d6d27", + "text": "Data collection lying solely on direct manual annotation for large-scale QA\nwas carried out across diverse urban environments, includ- datasets frequently results in simplistic and repetitive linguising high-density arterial roads, complex signalized intersec- tic constructs. To address this, we adopted an attribute-centric\ntions, and highway on/off ramps. To ensure all-weather per- annotation paradigm. We first defined a comprehensive TrafYu Zhang et al.: Page 8 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark light\nNormal\nimages\nThermal Figure 6: Sample aligned image pairs from the proposed Traffic-VQA dataset. The dataset spans diverse illumination conditions (e.g.,\ndaylight, low-light, and nighttime) alongside adverse weather scenarios (e.g., fog), providing well-aligned OPT and TIR images to\nsupport research in all-weather traffic understanding. traffic road 12.00\n142,631 Cloud1,260Cover 10.00\npedestrian 32,475 Low Light\nmodality 3,710 8.00\nroad 6.00 condition 119,929 Overcast65 Traffic 35,266\nScene 7,805\ntheme 36,288 4.00\ndeduce 110,202\nroad facility 2.00\nSunny 36,848\n3,145 0.00 location 100,819 Normal\nvehicle behavior Light 4,470\n39,432\nmatch\n83,198 SenceGeneral375 vehicle violation compare vehicle\n42,674 80,862 82,583 Light Weathe Scene Type\nCondition Condition\n(a) (b) Figure 7: Statistical distribution of the Traffic-VQA dataset. Left: Breakdown of the 8,180 image pairs by illumination conditions,\nweather, and scene type. Right: Distribution of the 31 question types, highlighting the significant proportion of high-level cognitive\ntasks (20.7%) and specialized multimodal queries. fic Cognition Ontology covering granular object categoriza- tions or behavioral anomalies (e.g., \"vehicle illegally potions (e.g., sedan, heavy truck, pedestrian), dynamic behav- sitioned on pedestrian sidewalk\").\nioral classifications (e.g., executing a turn, exceeding speed\nThis structured metadata repository serves as the factual foun-limits, queuing), and environmental state attributes (e.g., visidation for the subsequent generative phase.bility metrics, road surface conditions). Certified domain experts were then tasked with annotating the imagery according Stage 3: LLM-Empowered QA Generation and Verificato this ontology. 
Rather than drafting free-form questions, the tion.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 24, + "total_chunks": 50, + "char_count": 2349, + "word_count": 311, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "8ebe5677-8e0b-4e8e-bd96-b4db5d66656a", + "text": "Leveraging the capabilities of LLMs, we automatically\nexperts focused on extracting structured, ground-truth meta- synthesized the structured expert annotations into complex,\ndata: natural language QA pairs. This generation phase operates\nthrough two parallel processes:\n• Object-Level.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 25, + "total_chunks": 50, + "char_count": 286, + "word_count": 36, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e9e78da4-582b-4e75-b6c7-7533f167dc91", + "text": "Precise bounding box annotations paired\nwith attribute tags for individual traffic participants. 1. Programmatic Generation for Precision. For objective queries requiring exact numerical counting or bi-\n• Scene-Level. Global tags classifying weather condi- nary existence verification (e.g., \"How many standard\ntions, ambient lighting, road type, and traffic density. passenger cars are currently visible?\"), we applied rule-\n• Event-Level. Identification of specific vehicular viola- based templates populated directly from the verified anYu Zhang et al.: Page 9 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark Table 2 Table 3\nCategorization of the 31 question types in the Traffic-VQA dataset, divided into cognitive, Statistical overview of the Traffic-VQA dataset.\nperceptual, and multimodal domains. Statistics Numbers\nQuestion Type Object Type Specific Question Type Total QAs 1,301,466\nTotal OPT-TIR pairs 8,180 Abnormal Traffic Condition, Vehicle Violations, Small VehiAvg QA on img pair 159.1\ncle Violation, Pedestrian Violation, Vehicle Behavior, Small\nCognitive Traffic Participant\nVehicle Behavior, Deduce, Pedestrian Behavior, Traffic Par- Max question length 28\nticipant Interaction, Road Condition Avg question length 8.5\nQuestion vocab size 24,357\nTraffic Participant Traffic Road, Vehicle, Pedestrian, Road Facility Total question types 31\nPerceptual Compare, Presence, Location, Number, Shape, Most, Dis- Max answer length 16\nTotal Object tribution, Residential, Agricultural, Industrial, Fog, Dark, Avg answer length 1.2\nUrban, Theme, UAV Answer vocab size 4,418\nTotal unique answers 7,644\nModal Total Object Match, Modality deduce\nurban 12 locationcompare functional categories. The most frequent types include Trafmist modality fic Road (142,631 queries) and Modality (119,929 queries),\nnight 9 small vehicle violation reflecting the dataset's focus on structural road understanding\nnumber residential 6 and cross-modal perception. 
Crucially, the dataset emphasizes higher-order cognition: Deduce alone includes 110,202 samples, and Location comprehension contributes 100,819 samples. Two other high-impact categories are Compare Vehicle (approximately 80k) and Vehicle Violation (42,674), both directly relevant to automated traffic surveillance. This distribution pushes evaluation beyond basic object presence toward situation understanding and rule-aware interpretation.\nFigure 8: Distribution of average question length (in words) across varying question types within the Traffic-VQA dataset.\nEnvironmental Diversity. To evaluate algorithmic robustness, Traffic-VQA incorporates a wide range of challenging environmental conditions. The dataset includes 4,470 image pairs captured under normal light, contrasted with 3,710 pairs acquired under low light, alongside dedicated subsets for Sunny (3,145) and Cloud Cover (1,260) scenarios.",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 26,
    "total_chunks": 50,
    "char_count": 3172,
    "word_count": 414,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "bf34a827-0d27-490e-9983-e1285d9a250b",
    "text": "...notation database, ensuring accuracy. This structured variety prevents models from overfitting to idealized\n2.",
    "paper_id": "2603.10722",
    "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark",
    "authors": [
      "Yu Zhang",
      "Zhicheng Zhao",
      "Ze Luo",
      "Chenglong Li",
      "Jin Tang"
    ],
    "published_date": "2026-03-11",
    "primary_category": "",
    "arxiv_url": "http://arxiv.org/abs/2603.10722v1",
    "chunk_index": 27,
    "total_chunks": 50,
    "char_count": 110,
    "word_count": 14,
    "chunking_strategy": "semantic"
  },
  {
    "chunk_id": "3a402e21-8b17-4267-90a2-64b1340f1f66",
    "text": "LLM-Based Cognitive Context Expansion. For advanced cognitive tasks (e.g., \"Is the current traffic configuration indicative of a dangerous anomaly?\"), the structured attributes were fed into GPT-4 using specialized, constrained prompts. These prompts guided the LLM to formulate multi-hop comprehension questions, logically connecting cause-and-effect relationships and demanding comparative analysis of visual features across both optical and TIR modalities.\n...conditions, instead requiring the learning of invariant feature ...tioning from optical-dominant features during daylight to TIR-dominant features at night).\nLinguistic Complexity and Length. We quantified linguistic complexity by computing the average question length across different question types. The distribution shows a natural variance, with average lengths ranging from 6 to 12 words. As
expected, cognition-heavy queries tend to be longer; for example, questions related to Mist exhibit the highest average length\nFinally, a Human-in-the-Loop Verification mechanism was ap-\n(11.10 words), followed by Compare tasks (10.45 words) and\nplied. An independent team of verification experts sampled\nDeduce queries (7.36 words). Simpler perception tasks, such\nthe generated QA pairs, correcting minor logical inconsistenas Urban classification, average approximately 6.76 words.\ncies and filtering out ambiguous or poorly structured samples. This correlation between task difficulty and question length\nThis pipeline ultimately produced over 1.3 million high-quality\nconfirms that Traffic-VQA challenges MLLMs to process comQA pairs with diverse linguistic structures, far exceeding the\nplex, natural language queries that closely reflect the nuanced\nscale and cognitive depth achievable through purely manual\ninteractions of real-world operators.\nannotation. Dataset Challenges\n4.3. Dataset Analysis\nThe Traffic-VQA dataset introduces several key challenges\nThe Traffic-VQA dataset is characterized by its large scale,\nto contemporary VQA architectures and MLLM methodolodiversity, and cognitive depth. We present a detailed statistical\ngies, highlighting important directions for future research.\nanalysis of the distribution of question types, environmental\nconditions, and linguistic complexity. 1) Cross-Modality Semantic Alignment. Effectively fusing\nStatistical Distribution of Question Types. The finalized information from optical and TIR modalities remains a sigdataset comprises 1,300,620 verified QA pairs derived from nificant challenge. TIR images lack color information but\n8,180 image pairs.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 28, + "total_chunks": 50, + "char_count": 2641, + "word_count": 344, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "977dec4b-86d5-4984-b466-3f8dd90d7dc9", + "text": "The queries are distributed across distinct provide strong spatial contrast for heat-emitting objects, Yu Zhang et al.: Page 10 of 16", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 29, + "total_chunks": 50, + "char_count": 133, + "word_count": 21, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "72aaa0e3-d5f2-476b-8308-ae73d9320f8d", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark (a) Optical Images (b) Thermal Images Questions Answers How many cars are moving in the image ? (vehicle behavior) A. 1\nQ. What is the total number of cars in the image ? (vehicle) A. 5\nQ. Is there a vehicle on the road here ? (vehicle violation) A. Is this a residential area ? (theme) A. Which location in the image might be used for growing crops ? (presence) A. Is an intersection present in the central area ? (traffic road)\nA. What distribution pattern do the wide roads follow ? (distribution)\ndistribution Q. 
Are there more than 5 road dividers ? (road facility)\nA. Does the image primarily show a non-residential area ? (theme)\nA. Can any low-rise non-residential buildings be identified ? (presence) A.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 30, + "total_chunks": 50, + "char_count": 802, + "word_count": 139, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a1140b73-411d-4929-bbff-c9cee915c9b1", + "text": "Is the athletic track described as having an irregular shape ? (shape) A. How many streets are annotated ? (traffic road) A. 1\nQ. Is vehicle queuing observed in the image ? (vehicle behavior) A. Can any mist be observed in the image ? (mist) A. What type of roads are visible in the top section ? (traffic road) A. What is the estimated number of cars in the lower right area ? (match) A. Where is the large vehicle situated ? (vehicle) A. What type of environment is depicted in the image ? (urban) A. Is the primary setting a residential area ? (theme) A.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 31, + "total_chunks": 50, + "char_count": 557, + "word_count": 106, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "bb03617c-9991-4873-b66d-a3e6f0ecd794", + "text": "How many vehicles are making a U-turn ? (vehicle behavior) A. 1\nQ. Are vehicles turning in the scene ? (vehicle behavior) A. What is the relative quality between modalities for the turning vehicle ? Almost same\n(modality)\nQ. How many distinct woodland areas are annotated ? (number) A. Where is the intersection located ? (traffic road) 10 areas\nQ. Is a bus lane present ? (traffic road) A. Where is the large vehicle located ? (vehicle) A. How many vehicles are performing a turn ? (vehicle behavior) A. Lower left\nA. 2 vehicles Figure 9: Representative question-answer samples from Traffic-VQA. The dataset covers diverse query types beyond object recognition,\nrequiring reasoning over spatial relations, vehicle behaviors, traffic-rule compliance, and scene context (e.g., queuing, environment\ntype). while optical images offer rich textural detail but fail un- graded. Traditional VQA models typically suffer signifder low-light conditions. 
Models must learn to dynami- icant performance drops when confronted with such excally weight the relevance of each modality depending on treme domain shifts.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 32, + "total_chunks": 50, + "char_count": 1103, + "word_count": 169, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2449352f-d246-4325-b369-75576b7be90b", + "text": "Traffic-VQA therefore serves as a\nthe context (e.g., prioritizing TIR data at night) and align benchmark for developing robust representation learning\ninherently inconsistent semantic features across these dis- methods that maintain high cognitive performance regardparate sensors. less of adverse weather or lighting conditions. 2) Fine-Grained Object Cognition. UAV imagery typically In summary, Traffic-VQA provides a comprehensive platcaptures a large field of view, making the constituent ob- form for advancing multimodal cognitive intelligence in the\njects very small relative to the overall image dimensions. domain of intelligent transportation systems. Performing deep cognitive analysis on the behavior of a\nsingle small vehicle (e.g., determining \"Is the third car in 5. Experiments\nthe far-left lane actively executing a turn?\") requires precise\nIn this section, we present an empirical evaluation of the attention mechanisms and fine-grained spatial localization\nproposed CTCNet framework on the Traffic-VQA benchmark. capabilities, which are difficult to achieve with standard viWe first describe the experimental configuration and imple- sion encoders pre-trained on ground-level imagery.\nmentation details (Section 5.1). We then provide a quantita-\n3) Robustness to Environmental Degradation. The dataset tive comparison against state-of-the-art (SOTA) MLLMs (Secincludes a substantial proportion of foggy and low-light tion 5.2), focusing on performance gains attributable to our\nscenarios where standard visual features are severely de- cognitive and perceptual modules. Finally, ablation studies Yu Zhang et al.: Page 11 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 33, + "total_chunks": 50, + "char_count": 1736, + "word_count": 237, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "10df2ec8-cc70-463f-87bf-d8f428579e1a", + "text": "(Section 5.3) verify the individual contributions of the PGKE pretation process in explicit, formalized domain knowledge\nand QASC modules, complemented by a fine-grained analysis rather than fragile statistical correlations.\nacross question types. Furthermore, CTCNet demonstrates strong performance\nin fine-grained violation detection, achieving notable accura-\n5.1. Implementation Details cies in Vehicle Violation (80.26%) and Pedestrian Violation\nModel Configuration. We employ Qwen3-VL-8B-Instruct as (77.78%). 
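The first challenge above calls for dynamically weighting the relevance of each modality depending on context (e.g., prioritizing TIR data at night). A generic way to realize such input-dependent weighting is a small learned gate over pooled features, sketched below in PyTorch. This is only an illustration of the general idea; it is not the paper's QASC module, whose actual design performs bidirectional context exchange between the optical and thermal streams.

```python
# Illustrative quality-aware gating between optical (OPT) and thermal (TIR)
# feature maps. A generic sketch of input-dependent modality weighting,
# not the QASC module proposed in the paper.
import torch
import torch.nn as nn


class ModalityGate(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        # Predict a per-sample score for each modality from pooled features.
        self.score = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(channels, 1),
        )

    def forward(self, opt_feat: torch.Tensor, tir_feat: torch.Tensor) -> torch.Tensor:
        # Softmax over the two modality scores gives weights summing to 1, so a
        # degraded optical stream (e.g. at night) can be down-weighted.
        scores = torch.cat([self.score(opt_feat), self.score(tir_feat)], dim=1)
        weights = torch.softmax(scores, dim=1)          # shape (B, 2)
        w_opt = weights[:, 0].view(-1, 1, 1, 1)
        w_tir = weights[:, 1].view(-1, 1, 1, 1)
        return w_opt * opt_feat + w_tir * tir_feat


if __name__ == "__main__":
    gate = ModalityGate(channels=256)
    opt = torch.randn(2, 256, 20, 16)   # toy feature maps for an OPT-TIR pair
    tir = torch.randn(2, 256, 20, 16)
    print(gate(opt, tir).shape)          # torch.Size([2, 256, 20, 16])
```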
Notably, our method significantly outperforms leadthe frozen backbone, selected for its strong visual-linguistic ing commercial models such as GPT-4o (Cognitive: 75.28%)\nalignment capabilities. The input resolution for both optical and Gemini-2.5-flash (Cognitive: 74.92%), demonstrating that\nand TIR images is set to 640 × 512 pixels.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 34, + "total_chunks": 50, + "char_count": 851, + "word_count": 107, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9ee642e8-e8e0-47b5-9f19-3d469e5db631", + "text": "To preserve the a domain-specialized, knowledge-anchored module can subgeneralization capabilities of the pre-trained MLLM and en- stantially surpass large-scale general-purpose models in highly\nsure parameter efficiency, the core vision encoder and LLM regulated vertical domains.\nbackbone remain entirely frozen. Only the parameters of the Robustness in Perceptual Tasks. For Traffic Perceptual\nGated Parallel Residual Architecture—comprising the PGKE Questions (e.g., object localization, presence detection), the\nand QASC modules—are updated during training. fully fine-tuned CTCNet attains 80.26% accuracy, surpassTraining Protocols. All models are implemented in PyTorch ing the corresponding fine-tuned baseline (75.30%) by nearly\nand trained on NVIDIA RTX 4090 GPUs. This improvement verifies that the QASC module effecoptimizer with a cosine learning rate scheduler (initial learn- tively enhances feature distinctiveness, particularly for small\ning rate 1 × 10−4) and a batch size of 16. To evaluate the effi- and densely packed UAV targets. Even when constrained to\ncacy and adaptability of our architecture, we adopt two train- the few-shot regime, our method (MUL OA: 61.94%) substaning protocols: (i) Few-Shot Learning. To demonstrate the tially outperforms all evaluated zero-shot open-source baseeffectiveness of prototype-guided knowledge injection under lines (peak OA around 47.62%), validating that our lightweight\ndata-scarce conditions, the model is fine-tuned on a randomly residual injection strategy preserves general perception abilisampled subset of 10,000 examples. (ii) Full Fine-Tuning. To ties while improving sensitivity to domain-specific semantic\nestablish the upper performance bound of the architecture, the objects.\nmodel is trained on the complete Traffic-VQA training set.\n5.2.2. Evaluation of Multi-Spectral RobustnessEvaluation Metrics. Following standard VQA evaluation\nTo assess the effectiveness of the QASC module underprotocols, we report both Accuracy (Acc) and CIDEr (C) metvarying environmental conditions, we analyze performance\nrics. The evaluation structure distinguishes between Cognitive\nacross different input modalities (OPT, TIR, and MUL) as\nand Perceptual tasks, and multi-modal versus single-modal inshown in Table 4.\nputs, to highlight the cognitive depth and environmental roA recurring limitation in zero-shot baselines is that multi-bustness introduced by the proposed method.\nmodal (MUL) performance often stagnates or degrades relative to single optical (OPT) performance. 
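The training protocol described above (frozen vision encoder and LLM backbone, only the gated parallel residual modules updated, cosine learning-rate schedule starting at 1e-4, batch size 16) corresponds to the standard parameter-efficient recipe sketched below. The toy module layout and the choice of AdamW are assumptions for illustration; the excerpt does not name the optimizer used.

```python
# Sketch of the parameter-efficient training recipe described above: the
# pretrained backbone stays frozen and only the added residual modules train.
# Module names and the AdamW optimizer are illustrative assumptions.
import torch
import torch.nn as nn


class ToyDualBranchModel(nn.Module):
    """Stand-in for a frozen backbone plus trainable PGKE/QASC-style adapters."""
    def __init__(self, dim: int = 512):
        super().__init__()
        self.backbone = nn.Linear(dim, dim)         # frozen pretrained part
        self.adapter_modules = nn.Linear(dim, dim)  # trainable residual part

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.backbone(x)
        return h + self.adapter_modules(h)          # simplified parallel residual


model = ToyDualBranchModel()
for p in model.backbone.parameters():               # freeze the backbone entirely
    p.requires_grad = False

trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1_000)

for step in range(3):                                # dummy loop, batch size 16
    x = torch.randn(16, 512)
    loss = model(x).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()
```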
Quantitative Comparison\nQwen3-VL-8B (Base) shows only a marginal shift from an We benchmark the proposed CTCNet against two catOPT Overall Accuracy of 47.56% to a MUL OA of 47.62%,egories of baselines: (1) Zero-shot Open-source MLLMs\nand GeoChat experiences a similarly negligible gain (OPT:(e.g., MiniCPM-V, GeoChat, Qwen3-VL Base); (2) Closed-\n44.00% vs. This pattern suggests that naivesource Commercial MLLMs (e.g., GPT-4o, Gemini-2.5-flash).\nfeature concatenation introduces detrimental noise, renderingThe quantitative results, evaluated across single and multiple\ncomplementary thermal data largely ineffective without ex-modalities, are summarized in Table 4.\nplicit semantic alignment.\n5.2.1. Performance on Cognitive and Perceptual Tasks In contrast, CTCNet consistently achieves its best perforAs shown in Table 4, CTCNet achieves state-of-the-art per- mance in the MUL setting.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 35, + "total_chunks": 50, + "char_count": 3427, + "word_count": 455, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a1b1e13e-78e4-4c59-8b6f-495327341003", + "text": "Under the few-shot protocol, CTCformance across the board, with a consistent and substantial Net improves the OPT baseline (47.56%) to a MUL OA of\nadvantage. It is particularly strong in complex cognitive sce- 61.94% (+14.38%). This result confirms that the QASC modnarios, effectively bridging the domain knowledge gap inher- ule successfully facilitates constructive interaction between\nent in general-purpose models. modalities. By routing bidirectional attention toward the most\ninformative regions of the TIR channel, the model effectivelySuperiority in Cognitive Understanding. The most criticompensates for optical degradations (e.g., deep shadows,cal evaluation domain is Traffic Cognitive Questions, which\nglare, or extreme low light) without incurring performancerequire an implicit understanding of traffic regulations compenalties from cross-modal feature conflict, ensuring reliablebined with advanced contextual interpretation. While the\nperformance across all-weather UAV surveillance conditions.Qwen3-VL-8B (Finetuned) baseline achieves a commendable\ncognitive accuracy of 80.55%—validating the utility of su-\n5.3. Ablation Studiespervised domain adaptation—our fully integrated Qwen3-VLTo better understand the contributions of each architec-\n8B + CTCNet architecture improves this to 84.81%, a martural component, we conduct ablation studies targeting the core\ngin of +4.26%. This improvement indicates that standard finemodules and fusion strategies, all under the few-shot setting to\ntuning is insufficient to capture the multi-layered logic govhighlight baseline structural differences.\nerning complex traffic violations. The advantage of CTCNet\nstems from the PGKE module, which retrieves expert prototypes from the external TRM, grounding the cognitive interYu Zhang et al.: Page 12 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark Table 4\nQuantitative comparison on the Traffic-VQA test set. All numbers are Acc (%). 
Bold\nindicates the best result and underline indicates the second-best result within the same\nmodality for each metric.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 36, + "total_chunks": 50, + "char_count": 2107, + "word_count": 280, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "45a258d2-8c08-4932-a969-e3b619efad0e", + "text": "Cognition Perception Violation Behavior Violation Behavior OPT modality\nMiniCPM-V [45] arXiV 2024 45.27 39.7 45.43 37.12 39.57 47.57 40.13 44.82 41.22\nMiniGPT-v2 [46] arXiv 2023 36.04 35.3 35.95 33.74 37.38 37.96 29.54 39.05 40.14\nDeepSeek-VL [13] arXiV 2024 42.64 38.62 41.51 37.01 37.79 44.38 41.23 42.92 34.86\nGeoPix [47] GRSM 2025 43.59 38.13 40.48 37.16 39.49 45.1 38.07 43.94 41.86\nGeoChat [5] CVPR 2024 44 41.24 44.39 41.98 41.38 45.32 42.97 42.56 46.05\nFalcon [48] arXiV 2025 34.22 38.27 40.41 37.57 32.45 37.83 43.29 46.42 37.84\nQwen2.5-VL-7B [12] Ali 2025 43.69 40.04 60.61 39.5 39.02 47.19 48.97 44.53 43.4\nQwen3-VL-8B (Base) [49] Ali 2025 47.56 35.11 46.79 34.76 32.67 56.01 31.45 40.02 42.95\nGPT-4o [14] OpenAI 2024 64.81 71.92 72.33 74.15 71.22 73.27 64.3 73.8 62.46\nGemini-2.5-Flash [11] Google 2025 59.51 66.7 69.77 65 66.8 73.34 63.35 70.22 62.17\nGemini-2.5-Pro [11] Google 2025 56.17 65.98 63.18 64.95 64.16 67.44 56.24 62.85 58.27\nCTCNet (Few-Shot) - 47.56 46.79 35.11 34.76 32.67 56.01 42.95 40.02 31.45 TIR modality\nMiniCPM-V [45] arXiV 2024 43.62 40.83 43.92 39.1 39.27 46.24 39.65 47.01 41.78\nMiniGPT-v2 [46] arXiv 2023 34.47 31.5 33.45 28.65 33.05 33.34 28.44 35.91 34.45\nDeepSeek-VL [13] arXiV 2024 39.2 34.63 38.39 33.27 34.46 39.09 36.34 39.71 33.93\nGeoPix [47] GRSM 2025 42.73 39.7 42.03 37.45 41.36 46.29 43.13 45.99 44.13\nGeoChat [5] CVPR 2024 43.08 40.42 45.72 39.85 41 47.77 45.81 42.85 42.35\nFalcon [48] arXiV 2025 33.59 40.03 39.73 40.51 34.71 38.34 50.24 49.42 36.97\nQwen2.5-VL-7B [12] Ali 2025 40.57 38.98 40.49 38.41 37.89 44.12 46.76 43.43 41.9\nQwen3-VL-8B (Base) [49] Ali 2025 44.09 28.68 41.23 30.13 26.87 49.15 35.47 33.48 29.23\nGPT-4o [14] OpenAI 2024 63.66 69.97 70.67 71.13 66.74 71.98 65.72 76.06 59.23\nGemini-2.5-Flash [11] Google 2025 57.01 61.53 61.58 57.55 59.04 69.24 64.3 71.39 51.13\nGemini-2.5-Pro [11] Google 2025 51.5 57.51 55.14 53.18 55.17 64.76 47.87 61.39 46.98\nCTCNet (Few-Shot) - 44.09 41.23 28.68 30.13 26.87 49.15 35.47 33.48 29.23 MUL modality\nMiniCPM-V [45] arXiV 2024 45.37 39.28 60.06 36.46 38.61 47.93 42.02 46.13 41.74\nMiniGPT-v2 [46] arXiv 2023 36.68 35.91 37.69 34.2 38.11 39.23 31.75 39.2 42.54\nDeepSeek-VL [13] arXiV 2024 42.5 38.14 57.19 36.41 36.81 44.74 40.13 43.28 35.78\nGeoPix [47] GRSM 2025 43.62 38.11 51.42 37.21 39.9 45.12 37.91 43.21 41.17\nGeoChat [5] CVPR 2024 44.22 41.66 51.19 41.8 41.65 45.27 43.92 43.87 47.19\nFalcon [48] arXiV 2025 34.8 38.39 36.83 37.05 33.02 38.62 42.97 46.06 37.89\nQwen2.5-VL-7B [12] Ali 2025 43.73 39.75 43.2 39.81 39.21 47.59 49.61 43.87 42.73\nQwen3-VL-8B (Base) [49] Ali 2025 47.62 35.64 47.47 36.13 32.54 54.84 45.3 43.56 36.45\nGPT-4o [14] OpenAI 2024 67.72 75.28 73.96 77.67 73.13 74.06 69.51 78.03 69.13\nGemini-2.5-Flash [11] Google 2025 67.47 74.92 73.65 75.91 71.37 75.39 75.99 79.2 69.76\nGemini-2.5-Pro [11] Google 2025 61.41 70.05 64.93 69.38 
66.03 68.64 68.72 72.99 62.15\nCTCNet (Few-Shot) - 61.94 55.74 61.28 57.66 52.41 62.48 55.13 58.48 56.99\nQwen3-VL-8B (Finetuned) Ali 2025 79.598 80.546 75.296 76.747 75.65 79.766 76.496 71.245 69.661\nQwen3-VL-8B + CTCNet - 83.156 84.812 80.259 80.259 68.189 83.762 77.778 80.579 74.882 Effectiveness of Proposed Modules gap by grounding visual representations in expert regulatory\nWe investigate the individual and combined contributions prototypes.\nof the PGKE and QASC modules by incrementally integrating Effectiveness of QASC (Exp3). Integrating only the QASC\nthem into the baseline architecture (Qwen3-VL with standard module also produces a significant performance improvement,\nconcatenation).", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 37, + "total_chunks": 50, + "char_count": 3559, + "word_count": 559, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "db44b63d-2417-4ce1-b048-a6263a07b31b", + "text": "The results are reported in Table 5. achieving an OA of 61.27%. This gain highlights the necesEffectiveness of PGKE (Exp2). Adding the PGKE module sity of dynamic, bidirectional context exchange between the\nalone yields a substantial gain in Overall Accuracy (OA), ris- optical and thermal modalities. By selectively compensating\ning from the baseline's 47.62% to 60.67%. This improvement for degraded features, the QASC module establishes a robust\nhighlights the vital role of explicitly injecting domain-specific perceptual foundation, which is essential for reliably identifyknowledge. Without the external TRM, the baseline model ing small and densely distributed traffic objects under adverse\nstruggles to map elementary visual features to abstract traf- conditions.\nfic rules. The PGKE module effectively bridges this semantic Synergy of PGKE and QASC (Exp4). Yu Zhang et al.: Page 13 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark Table 5 adaptively weight the importance of optical versus TIR feaAblation study isolating the PGKE and QASC modules under tures, a shortcoming that is exacerbated when one modality\nthe few-shot setting. Exp1 represents the unmodified baseline is heavily degraded by environmental noise. In contrast, our\nusing simple feature concatenation. OA: Overall Accuracy, AA: QASC mechanism achieves an OA of 61.94%, demonstrating\nAverage Accuracy. that dynamic, attention-based bidirectional context exchange\nID PGKE (Cognition) QASC (Perception) OA (%) AA (%) is essential for robust all-weather traffic analysis.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 38, + "total_chunks": 50, + "char_count": 1593, + "word_count": 229, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "3489d2a9-18c2-405c-b8d9-d3b1106a4490", + "text": "Exp1 ✗ ✗ 47.62 46.93\nExp2 ✓ ✗ 60.67 59.41 5.3.3. 
Fine-Grained Analysis by Question Type\nExp3 ✗ ✓ 61.27 59.86\nExp4 ✓ ✓ 61.94 59.97\nTo evaluate the model's performance under specific environmental and task-oriented conditions, we conduct a fine-grained analysis structured across different question types, as visualized in the radar chart (Fig. 10). The breakdown reveals two key observations regarding environmental robustness.\nFor queries tagged with \"Night\", the optical-only (OPT) baseline suffers a significant performance drop (CIDEr: 12.93), while the standalone TIR modality remains robust (CIDEr: 80.55). Our fused model (MUL) effectively leverages the TIR stream to maintain strong performance (CIDEr: 78.84), demonstrating that the QASC module correctly identifies and prioritizes the reliable TIR channel when the optical stream is rendered ineffective by darkness.\nTable 6: Comparison of different modality integration methods. The proposed QASC module consistently outperforms all static fusion operations.\nIntegration Method | Overall Accuracy (OA) | Δ vs. Baseline\nOptical Only (Baseline) | 47.56 | -\nThermal Only | 44.09 | -\nElement-wise Add | 47.62 | +0.06\nConcatenation | 49.12 | +1.56\nCTCNet (QASC) | 61.94 | +14.38\nFog or Mist Scenarios. Under \"Mist\" conditions, the fused model (MUL) achieves a CIDEr score of 79.69, outperforming both single-modality inputs (OPT: 60.13, TIR: 71.75). This result indicates that even when both sensors are partially degraded by fog, the cross-spectral fusion strategy allows the model to aggregate complementary cues, yielding a result superior to either individual modality.\nCTCNet shows consistently strong performance across a diverse set of tasks, spanning both high-level cognitive analysis (e.g., \"Abnormal Traffic Situation\") and low-level perception queries (e.g., \"Location\"). These results suggest that the model generalizes well across task types and is a practical candidate for real-world UAV traffic surveillance scenarios.\nFigure 10: Performance comparison (quantified via CIDEr score) across the 31 question types in Traffic-VQA. The radar chart highlights the operational stability of the proposed multi-modal (MUL) approach, particularly in challenging environmental categories such as \"Night\" and \"Mist,\" where single-modality approaches consistently underperform. Series: MUL, OPT, TIR. Question types: 1-abnormal traffic situation, 2-agricultural, 3-compare, 4-deduce, 5-distribution, 6-industrial, 7-location, 8-match, 9-mist, 10-modality, 11-most, 12-night, 13-non-motorised vehicle behavior, 14-non-motorised vehicle violation, 15-number, 16-pedestrian, 17-pedestrian behavior, 18-pedestrian violation, 19-presence, 20-residential, 21-road condition, 22-road facility, 23-shape, 24-theme, 25-traffic participant interaction, 26-traffic road, 27-uav, 28-urban, 29-vehicle, 30-vehicle behavior, 31-vehicle violation.\n6. Conclusion\nIn this paper, we construct a large-scale, unified benchmark dataset, Traffic-VQA, designed to advance all-weather UAV traffic cognitive understanding. Comprising 8,180 well-
aligned OPT-TIR image pairs and over 1.3 million questionanswer pairs, the dataset covers diverse environmental conditions and 31 distinct cognitive tasks.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 39, + "total_chunks": 50, + "char_count": 3276, + "word_count": 459, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "4b7026c5-250d-40e0-8a44-dcc1f2ae1cf2", + "text": "To address the limitaframework achieves the best overall performance (OA 61.94%, tions of existing methods regarding domain knowledge defiAA 59.97%). The gain over single-module variants suggests ciency and cross-modality interference, we propose CTCNet.\nthat perceptual robustness and knowledge-guided reasoning The CTCNet incorporates a PGKE module working in conare complementary. Specifically, QASC improves the stabil- junction with an external TRM to inject domain-specific situaity of input features under changing environmental conditions, tional knowledge into visual features. Furthermore, the QASC\nwhich in turn allows PGKE to retrieve more accurate and con- module adaptively integrates complementary spectral informatextually relevant prototypes to support complex reasoning. tion through dynamic, attention-driven context exchange. Extensive experiments on Traffic-VQA demonstrate the effective-\n5.3.2. Analysis of Fusion Mechanisms ness of CTCNet, which significantly outperforms contempoWe further compare the proposed QASC module against rary state-of-the-art MLLMs, particularly in high-level cognistandard fusion operations, as shown in Table 6. tive scenarios. Simple element-wise addition and concatenation yield OA We further highlight several directions for future research.\nvalues of 47.62% and 49.12%, respectively, offering only The first concerns the transition from static spatial obsermarginal improvements over the single-modality baseline. vation to continuous dynamic analysis. While Traffic-VQA\nThese results confirm that rigid, static fusion strategies fail to provides a solid foundation for multi-spectral image compreYu Zhang et al.: Page 14 of 16 UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark hension, practical traffic surveillance inherently requires the [10] Zhicheng Zhao, Wei Zhang, Yun Xiao, Chenglong Li, and Jin Tang.\nanalysis of continuous behavioral trajectories. Extending the Reflectance-guided progressive feature alignment network for all-day\nbenchmark toward video-based spatio-temporal VQA would uav object detection. IEEE Transactions on Geoscience and Remote\nSensing, 2025.\nenhance comprehensive event interpretation capabilities. The [11] Google DeepMind. 
Gemini 2.5, 2025.\nsecond direction focuses on the development of more powerful [12] Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo\ncognitive mechanisms.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 40, + "total_chunks": 50, + "char_count": 2427, + "word_count": 315, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "1dfba588-e93d-4129-99e5-b7c35fc4ab44", + "text": "Compared to generalized visual per- Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, et al. Qwen2.\nception tasks, specialized traffic behavior understanding relies 5-vl technical report. arXiv preprint arXiv:2502.13923, 2025.\nheavily on regulatory constraints, highlighting the importance [13] Haoyu Lu, Wen Liu, Bo Zhang, Bingxuan Wang, Kai Dong, Bo Liu,\nJingxiang Sun, Tongzheng Ren, Zhuoshu Li, Hao Yang, et al. Deepseekof integrating explicit prior knowledge into large-scale foun- vl: towards real-world vision-language understanding. arXiv preprint\ndation models rather than relying solely on data-driven statis- arXiv:2403.05525, 2024.\ntical correlations. In future work, we plan to extend this frame- [14] OpenAI. Hello GPT-4o, 2024.\nwork to video-based UAV-VQA and investigate the integration [15] Sylvain Lobry, Diego Marcos, Jesse Murray, and Devis Tuia. Rsvqa:\nof additional complementary sensor modalities, with the goal Visual question answering for remote sensing data. IEEE Transactions\non Geoscience and Remote Sensing, 58:8555–8566, 2020.\nof advancing robust, all-weather intelligent transportation sys- [16] Xiangtao Zheng, Binqiang Wang, Xingqian Du, and Xiaoqiang Lu. Mutems. tual attention inception network for remote sensing visual question answering. IEEE Transactions on Geoscience and Remote Sensing, 60:1–\n14, 2022. Acknowledgement [17] Zixiao Zhang, Licheng Jiao, Lingling Li, Xu Liu, Puhua Chen, Fang Liu,\nThis work was supported in part by the National Natural Yuxuan Li, and Zhicheng Guo.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 41, + "total_chunks": 50, + "char_count": 1521, + "word_count": 213, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6f5e9b92-0d6d-4277-a2a9-5b161a328f93", + "text": "A spatial hierarchical reasoning network\nfor remote sensing visual question answering. IEEE Transactions on\nScience Foundation of China (No. 62306005, 62006002, and\nGeoscience and Remote Sensing, 61:1–15, 2023.\n62076003), in part by the Joint Funds of the National Nat- [18] Sylvain Lobry, Begüm Demir, and Devis Tuia. Rsvqa meets bigearthnet:\nural Science Foundation of China (No. U20B2068), in part A new, large-scale, visual question answering dataset for remote sensing.\nby the Natural Science Foundation of Anhui Province (No. In 2021 IEEE International Geoscience and Remote Sensing Symposium\n2208085J18 and 2208085QF192), and in part by the Natu- IGARSS, pages 1218–1221, 2021.\n[19] Meimei Zhang, Fang Chen, and Bin Li. 
Multistep question-driven viral Science Foundation of Anhui Higher Education Institution\nsual question answering for remote sensing. IEEE Transactions on Geo-\n(No. 2022AH040014). science and Remote Sensing, 61:1–12, 2023.\n[20] Maryam Rahnemoonfar, Tashnim Chowdhury, Argho Sarkar, Debvrat\nVarshney, Masoud Yari, and Robin Roberson Murphy. Floodnet: A\nReferences high resolution aerial imagery dataset for post flood scene understand-\n[1] Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, ing. IEEE Access, 9:89644–89654, 2021. Lawrence Zitnick, and Devi Parikh.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 42, + "total_chunks": 50, + "char_count": 1304, + "word_count": 183, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "925000d6-491a-4d73-ad0e-18e870b929ad", + "text": "Vqa: Visual ques- [21] Zhenghang Yuan, Lichao Mou, Zhitong Xiong, and Xiao Xiang Zhu.\ntion answering. In Proceedings of the IEEE International Conference Change detection meets visual question answering. IEEE Transactions\non Computer Vision (ICCV), pages 2425–2433, 2015. on Geoscience and Remote Sensing, 60:1–13, 2022.\n[2] Zhicheng Zhao, Juanjuan Gu, Chenglong Li, Chun Wang, Zhongling [22] Sara O. Alsaleh, Yakoub Bazi, Mohamad M. Al Rahhal, and Mansour\nHuang, and Jin Tang. Guidance disentanglement network for Al Zuair. Open-ended visual question answering model for remote sensoptics-guided thermal uav image super-resolution. arXiv preprint ing images.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 43, + "total_chunks": 50, + "char_count": 659, + "word_count": 93, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7fb5c332-2809-43be-ae05-f80483bcc642", + "text": "In IGARSS 2022 - 2022 IEEE International Geoscience and\narXiv:2410.20466, 2024. Remote Sensing Symposium, pages 2848–2851, 2022.\n[3] Zhicheng Zhao, Changfu Zhou, Yu Zhang, Chenglong Li, Xiaoliang Ma, [23] Sagar Soni, Akshay Dudhane, Hiyam Debary, Mustansar Fiaz, Muhamand Jin Tang. Text-guided coarse-to-fine fusion network for robust re- mad Akhtar Munir, Muhammad Sohail Danish, Paolo Fraccaro, Campmote sensing visual question answering. ISPRS Journal of Photogram- bell D Watson, Levente J Klein, Fahad Shahbaz Khan, et al. Earthdial:\nmetry and Remote Sensing, 230:1–17, 2025. Turning multi-sensory earth observations to interactive dialogues. In\n[4] Zhicheng Zhao, Juanjuan Gu, Chenglong Li, Chun Wang, Zhongling Proceedings of the Computer Vision and Pattern Recognition ConferHuang, and Jin Tang. Guidance disentanglement network for optics- ence, pages 14303–14313, 2025.\nguided thermal uav image super-resolution. ISPRS Journal of Pho- [24] Wujie Zhou, Xinyang Lin, Jingsheng Lei, Lu Yu, and Jenq-Neng Hwang.\ntogrammetry and Remote Sensing, 228:64–82, 2025. 
Mffenet: Multiscale feature fusion and enhancement network for rgb–\n[5] Kartik Kuckreja, Muhammad Sohail Danish, Muzammal Naseer, Abhi- thermal urban road scene parsing. IEEE Transactions on Multimedia,\njit Das, Salman Khan, and Fahad Shahbaz Khan. Geochat: Grounded 24:2526–2538, 2021.\nlarge vision-language model for remote sensing. arXiv preprint [25] Kailai Zhou, Fuqiang Yang, Shixian Wang, Bihan Wen, Chongde Zi,\narXiv:2311.15826, 2023. Linsen Chen, Qiu Shen, and Xun Cao.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 44, + "total_chunks": 50, + "char_count": 1545, + "word_count": 215, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "00fdebb6-44d7-46df-a317-bd112feddfb2", + "text": "M-specgene: Generalized\n[6] Wei Zhang, Miaoxin Cai, Tong Zhang, Yin Zhuang, and Xuerui Mao. foundation model for rgbt multispectral vision. In Proceedings of the\nEarthgpt: A universal multi-modal large language model for multi- IEEE/CVF International Conference on Computer Vision, pages 7861–\nsensor image comprehension in remote sensing domain. IEEE Trans- 7872, 2025.\nactions on Geoscience and Remote Sensing, 2024. [26] Zixiang Zhao, Haowen Bai, Jiangshe Zhang, Yulun Zhang, Shuang Xu,\n[7] Wei Zhang, Miaoxin Cai, Yaqian Ning, Tong Zhang, Yin Zhuang, Shi- Zudi Lin, Radu Timofte, and Luc Van Gool. Cddfuse: Correlationjian Lu, He Chen, Jun Li, and Xuerui Mao. Earthgpt-x: A spatial mllm driven dual-branch feature decomposition for multi-modality image fufor multilevel multisource remote sensing imagery understanding with sion. In Proceedings of the IEEE/CVF conference on computer vision\nvisual prompting. IEEE Transactions on Geoscience and Remote Sens- and pattern recognition, pages 5906–5916, 2023.\ning, 63:1–21, 2025. [27] Vishal Chudasama, Purbayan Kar, Ashish Gudmalwar, Nirmesh Shah,\n[8] Michael Schmitt and Xiao Xiang Zhu. Data fusion and remote sensing: Pankaj Wasnik, and Naoyuki Onoe. M2fnet: Multi-modal fusion\nAn ever-growing relationship. IEEE Geoscience and Remote Sensing network for emotion recognition in conversation. In Proceedings of\nMagazine, 4:6–23, 2016. the IEEE/CVF conference on computer vision and pattern recognition,\n[9] Lei Liu, Mengya Zhang, Cheng Li, Chenglong Li, and Jin Tang. Cross- pages 4652–4661, 2022.\nmodal object tracking via modality-aware fusion network and a large- [28] Han Xu, Jiayi Ma, Junjun Jiang, Xiaojie Guo, and Haibin Ling.\nscale dataset. IEEE Transactions on Neural Networks and Learning Sys- U2fusion: A unified unsupervised image fusion network. IEEE transtems, 36(4):6981–6994, 2024. actions on pattern analysis and machine intelligence, 44(1):502–518,\n2020. 
Yu Zhang et al.: Page 15 of 16", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 45, + "total_chunks": 50, + "char_count": 1955, + "word_count": 281, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7eb12b60-0449-45de-8680-691b215f4218", + "text": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark [29] Yujing Rao, Dan Wu, Mina Han, Ting Wang, Yang Yang, Tao Lei, [45] Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji\nChengjiang Zhou, Haicheng Bai, and Lin Xing.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 46, + "total_chunks": 50, + "char_count": 268, + "word_count": 44, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7e98df7b-0ed5-49e1-995a-326c21416aa7", + "text": "At-gan: A generative Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al. Minicpm-v:\nadversarial network with attention and transition for infrared and visible A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800,\nimage fusion. Information Fusion, 92:336–349, 2023. 2024.\n[30] Zixiang Zhao, Haowen Bai, Yuanzhi Zhu, Jiangshe Zhang, Shuang Xu, [46] Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan\nYulun Zhang, Kai Zhang, Deyu Meng, Radu Timofte, and Luc Van Gool. Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong,\nDdfm: denoising diffusion model for multi-modality image fusion. In and Mohamed Elhoseiny. Minigpt-v2: large language model as a uniProceedings of the IEEE/CVF international conference on computer vi- fied interface for vision-language multi-task learning. arXiv preprint\nsion, pages 8082–8093, 2023. arXiv:2310.09478, 2023.\n[31] Zhicheng Zhao, Juanjuan Gu, Chenglong Li, Chun Wang, Zhongling [47] Ruizhe Ou, Yuan Hu, Fan Zhang, Jiaxin Chen, and Yu Liu.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 47, + "total_chunks": 50, + "char_count": 1020, + "word_count": 147, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "1ca4d7a0-9c2b-4322-9887-9b5470e1fa9c", + "text": "Geopix:\nHuang, and Jin Tang. Guidance disentanglement network for Multi-modal large language model for pixel-level image understanding\noptics-guided thermal uav image super-resolution. arXiv preprint in remote sensing. arXiv preprint arXiv:2501.06828, 2025.\narXiv:2410.20466, 2024. [48] Kelu Yao, Nuo Xu, Rong Yang, Yingying Xu, Zhuoyan Gao, Titinunt\n[32] Jinyuan Liu, Xingyuan Li, Zirui Wang, Zhiying Jiang, Wei Zhong, Wei Kitrungrotsakul, Yi Ren, Pu Zhang, Jin Wang, Ning Wei, et al. Falcon:\nFan, and Bin Xu. Promptfusion: Harmonized semantic prompt learning A remote sensing vision-language foundation model. 
arXiv preprint\nfor infrared and visible image fusion. IEEE/CAA Journal of Automatica arXiv:2503.11070, 2025. Sinica, 2024. [49] Shuai Bai, Yuxuan Cai, Ruizhe Chen, Keqin Chen, Xionghui Chen, Ze-\n[33] Xin Guo, Jiangwei Lao, Bo Dang, Yingying Zhang, Lei Yu, Lixiang Ru, sen Cheng, Lianghao Deng, Wei Ding, Chang Gao, Chunjiang Ge, WenLiheng Zhong, Ziyuan Huang, Kang Wu, Dingxiang Hu, et al. Skysense: bin Ge, Zhifang Guo, Qidong Huang, Jie Huang, Fei Huang, Binyuan\nA multi-modal remote sensing foundation model towards universal inter- Hui, Shutong Jiang, Zhaohai Li, Mingsheng Li, Mei Li, Kaixin Li,\npretation for earth observation imagery. In Proceedings of the IEEE/CVF Zicheng Lin, Junyang Lin, Xuejing Liu, Jiawei Liu, Chenglong Liu,\nConference on Computer Vision and Pattern Recognition, pages 27672– Yang Liu, Dayiheng Liu, Shixuan Liu, Dunjie Lu, Ruilin Luo, Chenxu\n27683, 2024. Lv, Rui Men, Lingchen Meng, Xuancheng Ren, Xingzhang Ren, Sibo\n[34] Peijin Wang, Huiyang Hu, Boyuan Tong, Ziqi Zhang, Fanglong Yao, Song, Yuchong Sun, Jun Tang, Jianhong Tu, Jianqiang Wan, Peng Wang,\nYingchao Feng, Zining Zhu, Hao Chang, Wenhui Diao, Qixiang Ye, and Pengfei Wang, Qiuyue Wang, Yuxuan Wang, Tianbao Xie, Yiheng\nXian Sun.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 48, + "total_chunks": 50, + "char_count": 1835, + "word_count": 273, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c3f16d21-95f0-4288-8ebf-160a9bedbdcf", + "text": "Ringmogpt: A unified remote sensing foundation model for Xu, Haiyang Xu, Jin Xu, Zhibo Yang, Mingkun Yang, Jianxin Yang,\nvision, language, and grounded tasks. IEEE Transactions on Geoscience An Yang, Bowen Yu, Fei Zhang, Hang Zhang, Xi Zhang, Bo Zheng,\nand Remote Sensing, 63:1–20, 2025. Humen Zhong, Jingren Zhou, Fan Zhou, Jing Zhou, Yuanzhi Zhu, and\n[35] Yue Zhou, Ran Ding, Xue Yang, Xue Jiang, and Xingzhao Liu.", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 49, + "total_chunks": 50, + "char_count": 416, + "word_count": 68, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e539894c-e5b9-4e7e-a91f-a20bbe9c9467", + "text": "Qwen3-vl technical report. arXiv preprint arXiv:2511.21631,\ntialbot: A spatially-aware aerial agent for fine-grained vehicle attribute 2025.\nrecognization and retrieval. IEEE Transactions on Geoscience and Remote Sensing, pages 1–1, 2025.\n[36] Fengxiang Wang, Mingshuo Chen, Yueying Li, Di Wang, Haotian\nWang, Zonghao Guo, Zefan Wang, Boqi Shan, Long Lan, Yulin Wang,\nHongzhen Wang, Wenjing Yang, Bo Du, and Jing Zhang. Geollava-8k:\nScaling remote-sensing multimodal large language models to 8k resolution. arXiv preprint arXiv:2505.21375, 2025.\n[37] Junwei Luo, Yingying Zhang, Xue Yang, Kang Wu, Qi Zhu, Lei Liang,\nJingdong Chen, and Yansheng Li. When large vision-language model\nmeets large remote sensing imagery: Coarse-to-fine text-guided token\npruning. 
arXiv preprint arXiv:2503.07588, 2025.\n[38] Zilun Zhang, Zian Guan, Tiancheng Zhao, Haozhan Shen, Tianyu Li,\nYuxiang Cai, Zhonggen Su, Zhaojun Liu, Jianwei Yin, and Xiang Li. Geo-r1: Improving few-shot geospatial referring expression understanding with reinforcement fine-tuning. arXiv preprint arXiv:2509.21976,\n2025.\n[39] Wenxuan Huang, Bohan Jia, Zijie Zhai, Shaosheng Cao, Zheyu Ye, Fei\nZhao, Zhe Xu, Yao Hu, and Shaohui Lin. Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint\n[40] Di Wang, Shunyu Liu, Wentao Jiang, Fengxiang Wang, Yi Liu, Xiaolei\nQin, Zhiming Luo, Chaoyang Zhou, Haonan Guo, Jing Zhang, et al. Geozero: Incentivizing reasoning from scratch on geospatial scenes.\n[41] Run Shao, Ziyu Li, Zhaoyang Zhang, Linrui Xu, Xinran He, Hongyuan\nYuan, Bolei He, Yongxing Dai, Yiming Yan, Yijun Chen, et al. Asking\nlike socrates: Socrates helps vlms understand remote sensing images.\n[42] Argho Sarkar, Tashnim Chowdhury, Robin Roberson Murphy, Aryya\nGangopadhyay, and Maryam Rahnemoonfar. Sam-vqa: Supervised\nattention-based visual question answering model for post-disaster damage assessment on remote sensing imagery. IEEE Transactions on Geoscience and Remote Sensing, 61:1–16, 2023.\n[43] Yuan Hu, Jianlong Yuan, Congcong Wen, Xiaonan Lu, and Xiang Li. Rsgpt: A remote sensing vision language model and benchmark. arXiv\n[44] Junjue Wang, Zhuo Zheng, Zihang Chen, Ailong Ma, and Yanfei Zhong. Earthvqa: Towards queryable earth via relational reasoning-based remote sensing visual question answering. In Proceedings of the AAAI\nConference on Artificial Intelligence, volume 38, pages 5481–5489,\n2024. Yu Zhang et al.: Page 16 of 16", + "paper_id": "2603.10722", + "title": "UAV traffic scene understanding: A cross-spectral guided approach and a unified benchmark", + "authors": [ + "Yu Zhang", + "Zhicheng Zhao", + "Ze Luo", + "Chenglong Li", + "Jin Tang" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10722v1", + "chunk_index": 50, + "total_chunks": 50, + "char_count": 2449, + "word_count": 344, + "chunking_strategy": "semantic" + } +] \ No newline at end of file