diff --git "a/data/chunks/2603.10623_semantic.json" "b/data/chunks/2603.10623_semantic.json" new file mode 100644--- /dev/null +++ "b/data/chunks/2603.10623_semantic.json" @@ -0,0 +1,1872 @@ +[ + { + "chunk_id": "dd48b6d7-f4cf-45e5-bc4f-b72398e5c960", + "text": "Geo-ATBench: A Benchmark for Geospatial Audio\nTagging with Geospatial Semantic Context Yuanbo Houa,1,∗, Yanru Wub,1, Qiaoqiao Renc, Shengchen Lib, Stephen\nRobertsa, Dick Botteldoorend aMachine Learning Research Group, Engineering Science, University of Oxford, UK\nbDepartment of Intelligent Science, Xi'an Jiaotong-Liverpool University, China\ncEECS, KTH Royal Institute of Technology, Sweden\ndWAVES Research Group, Information Technology, Ghent University, Belgium2026\nMar\nAbstract\nEnvironmental sound understanding in computational auditory scene analysis (CASA) is often formulated as an audio-only recognition problem. formulation leaves a persistent drawback in multi-label audio tagging (AT): acoustic similarity can make certain events difficult to separate from wave-[eess.AS] forms alone. In such cases, disambiguating cues often lie outside the waveform. 
Geospatial semantic context (GSC), derived from geographic information system data, e.g., points of interest (POI), provides location-tied environmental priors that can help reduce this ambiguity.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 0, + "total_chunks": 85, + "char_count": 1060, + "word_count": 133, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "363ca725-8b3b-4dd1-ad01-f9daedb93727", + "text": "A systematic study of this direction is enabled through the proposed geospatial audio tagging (Geo-AT) task, which conditions multi-label sound event tagging on GSC alongside audio. benchmark Geo-AT, the Geo-ATBench dataset is introduced as a polyphonic audio benchmark with geographical annotations, containing 10.71 hours of realworld audio across 28 event categories; each clip is paired with a POI-derivedarXiv:2603.10623v1 GSC representation constructed from 11 semantic context categories. Furthermore, GeoFusion-AT is proposed as a unified geo-audio fusion framework that ∗Corresponding author: Yuanbo Hou, Machine Learning Research Group, University of\nOxford, UK. 
Email: Yuanbo.Hou@eng.ox.ac.uk\n1Equal contribution.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 1, + "total_chunks": 85, + "char_count": 724, + "word_count": 93, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ac51f657-e9e9-44dc-8849-815f02a5a28e", + "text": "evaluates feature-level, representation-level, and decision-level fusion on three representative audio backbones, with audio-only and GSC-only baselines. Experiments show that incorporating GSC generally improves AT performance, especially on acoustically confounded labels, indicating that geospatial semantics can provide an effective prior beyond audio alone. A crowdsourced listening study with 10 participants on 579 samples shows that there is no significant difference in performance between the models on the Geo-ATBench labels and on aggregated human labels, supporting Geo-ATBench as a human-aligned benchmark. Overall, the proposed Geo-AT task, the open benchmark Geo-ATBench, and the reproducible geo-audio fusion framework GeoFusion-AT provide a solid foundation for studying audio tagging with geospatial semantic context within For the dataset, source code, and models, please see the project homepage (https://github.com/WuYanru2002/Geo-ATBench). 
Computational auditory scene analysis, Multi-label audio tagging, Geospatial semantic context, Points of interest, Multimodal fusion Environmental sound understanding is one of the core goals of computational auditory scene analysis (CASA) [1].", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 2, + "total_chunks": 85, + "char_count": 1207, + "word_count": 152, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d4352110-acab-4cc9-a69e-5bdcd79c18d7", + "text": "In many practical applications, the target output is multi-label audio tagging (AT) [2], where each recording may contain multiple sound events and the system predicts the set of event AT supports applications such as acoustic surveillance [3], smart-city sensing [4], multimedia retrieval [5], and intelligent domestic assistants [6]. Despite strong progress in deep learning models for environmental audio, AT is commonly treated as an audio-only recognition problem [7, 8]. 
AT backbones, including convolutional neural networks (CNNs) and Transformers, learn powerful acoustic representations from time-frequency features such as Mel spectrograms [9].", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 3, + "total_chunks": 85, + "char_count": 654, + "word_count": 90, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d1a443eb-7de8-4bdb-99c2-2666e396f1df", + "text": "However, a persistent drawback remains. Acoustic similarity can make certain events difficult to distinguish from waveforms alone, especially when different sources produce highly similar time-frequency In such cases, disambiguating cues often lie beyond the A key source of such cues is the physical environment in Sound events are produced by sources embedded in specific places, and their occurrence is shaped by location-tied environmental factors [14]. Location-tied conditions can induce systematic associations between event labels and geospatial semantic context (GSC) [15]. provide complementary cues when waveforms alone are ambiguous. This work focuses on sound source-associated GSC, which refers to locationtied environmental priors derived from geographic information systems data, such as points of interest (POI) [16]. Compared with raw GPS coordinates, POI-derived GSC provides structured semantic descriptions of the physical environment surrounding sound sources that can be aligned with audio representations [17]. 
Progress in this direction remains limited by the lack of", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 4, + "total_chunks": 85, + "char_count": 1092, + "word_count": 150, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9be526c6-2315-4467-a187-57a1b2bed215", + "text": "standardized tasks and benchmark datasets that pair audio with reliable, structured GSC under reproducible evaluation [18]. Recent mobile recording devices and location-aware media platforms increasingly associate recordings with geographic coordinates [18], making relevant audio-GSC pairs increasingly accessible. This trend creates a timely opportunity to investigate how to leverage GSC to support multi-label AT in the real world. To address the gap that AT is often formulated without sound sourceassociated location-tied GSC, this paper proposes the geospatial audio tagging (Geo-AT) task, which conditions multi-label AT on GSC alongside audio Geo-AT aims to assess whether location-tied environmental priors help disambiguate events that are difficult to distinguish from audio alone. To benchmark Geo-AT, we release the Geo-ATBench dataset, a geographi- cally annotated polyphonic audio benchmark containing 3,854 clips with 28 event labels; each clip is paired with a GSC representation constructed from POI semantics over 11 context categories, enabling reproducible studies of how geospatial semantics interact with acoustic representations in multi-label AT. 
The proposed benchmark design of Geo-ATBench does not specify how GSC should be integrated into AT models [2, 3, 19], and different integration choices may lead to different outcomes.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 5, + "total_chunks": 85, + "char_count": 1356, + "word_count": 188, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "12c2f043-92b3-4423-ba40-bc042517e5a8", + "text": "Therefore, GeoFusion-AT is introduced as a unified geo-audio fusion framework for the proposed Geo-AT task to benchmark representative fusion strategies and to report reference results on GeoATBench. Specifically, GeoFusion-AT evaluates three typical fusion strategies,", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 6, + "total_chunks": 85, + "char_count": 269, + "word_count": 34, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7567179f-f45e-4c52-bc24-edffa6582648", + "text": "feature-level, representation-level, and decision-level fusion, across three representative audio backbones, the CNN-based pretrained audio neural networks (PANNs) [20], the Transformer-based audio spectrogram Transformer (AST) [9], and contrastive language-audio pretraining (CLAP) [21]. 
GSC-only baselines are included to isolate the contribution of each modality and to identify when fusion improves performance beyond either input alone. The main contributions are: 1) Geo-AT is introduced as a standardized", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 7, + "total_chunks": 85, + "char_count": 511, + "word_count": 64, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "12b14b74-8935-4106-8c2f-449152bac194", + "text": "task formulation for multi-label audio tagging in CASA that integrates audio with geospatial semantic context (GSC); 2) Geo-ATBench is released as an open benchmark for reproducible Geo-AT evaluation, containing 3,854 realworld polyphonic audio clips annotated with 28 event labels, where each clip is paired with a GSC representation constructed from POI semantics over 11 semantic context categories; 3) GeoFusion-AT is introduced as a unified geo-audio fusion framework that benchmarks representative fusion strategies across representative audio backbones on Geo-ATBench to report reference results; 4) A crowdsourced listening study with 10 participants on 579 samples is conducted, showing that model performance is comparable when evaluated against GeoATBench labels and aggregated human labels, supporting Geo-ATBench as a human-aligned benchmark. We have released the dataset, code, and models. The rest of this paper is organized as follows. Section 2 reviews related work Section 3 formalizes the Geo-AT task. the Geo-ATBench dataset. 
Section 5 presents the GeoFusion-AT framework with fusion strategies based on representative audio backbones.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 8, + "total_chunks": 85, + "char_count": 1155, + "word_count": 161, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6c5d7dce-815b-47dd-9863-88c92f4989fd", + "text": "Section 6 reports experimental results and analysis. Section 7 details the human evaluation Section 8 concludes the paper.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 9, + "total_chunks": 85, + "char_count": 122, + "word_count": 18, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "37921fd9-dbb4-467a-9001-bfcdad153911", + "text": "This section positions the proposed geospatial audio tagging (Geo-AT) task within prior work on multi-label audio tagging (AT), context-aware sound understanding, and POI-derived geospatial semantic context from geographic information systems. 
The discussion motivates the need for a standardized GeoAT task under reproducible evaluation.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 10, + "total_chunks": 85, + "char_count": 338, + "word_count": 43, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2f70767c-9465-4b5a-9660-453b1de9820f", + "text": "Multi-Label Audio Tagging and Acoustic Ambiguity Multi-label AT is a central task in CASA [1], where an audio clip may contain polyphonic sound events, and the goal is to predict the set of event", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 11, + "total_chunks": 85, + "char_count": 195, + "word_count": 34, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "91d5963f-6add-429f-bbf8-8dba755c04b8", + "text": "Large-scale benchmarks and challenges [22, 23] have driven steady progress in model architectures and backbones, such as CNN-based PANNs [20] and MobileNet [24], Transformer-based Hierarchical Token-Semantic Audio Transformer [25] and AST [9], with contrastive learning-based CLAP that aligns audio and language representations [21]. 
These backbones have become", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 12, + "total_chunks": 85, + "char_count": 361, + "word_count": 47, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "f1cb5a77-30ef-4099-a5ab-776a48aa1592", + "text": "common reference points for representation learning in AT tasks. Despite architectural advances, AT in real-world conditions continues to face persistent ambiguity [26]. Polyphonic recordings often contain overlapping sources, and different events can produce similar time-frequency patterns [10, 11], leading audio-only AT to struggle with confusable events and mis- External priors like sound source-associated GSC provide complementary cues by encoding location-tied environmental priors into a structured POI-derived semantic representation [16], such as nearby place categories and their composition around the sound source.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 13, + "total_chunks": 85, + "char_count": 629, + "word_count": 81, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "db1b706c-a6f4-41f4-946c-5c4a35102183", + "text": "Location-tied GSC constrains the set of plausible events for a scene and can support disambiguation when acoustic evidence alone is insufficient. 
Context and Auxiliary Information for Sound Understanding Context-aware sound understanding extends AT by incorporating information beyond acoustic representations.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 14, + "total_chunks": 85, + "char_count": 310, + "word_count": 39, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "31443641-466c-45d9-9996-e20452b155b0", + "text": "Prior work [27, 28] can be divided into two groups, distinguished by whether the additional signal is time-aligned with One group [29] uses paired sensory streams, where video frames or other time-aligned inputs are available together with the audio. group [30] uses auxiliary metadata that is linked to the recording environment but is not time-aligned with the audio signal. Geo-AT concerns the second group. Location-tied descriptors operate as scene-level priors and remain available in many deployments. Existing studies [31] that incorporate auxiliary metadata vary in metadata representation, audio-metadata pairing rules, data splits, and reporting practice. and metadata-only baselines are not always reported. 
These inconsistencies limit reproducible comparison across studies and motivate a standardized task for evaluating auxiliary metadata in multi-label AT.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 15, + "total_chunks": 85, + "char_count": 872, + "word_count": 121, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6403981f-276b-4018-b2bc-0e5023f53601", + "text": "POI-Derived Geospatial Semantic Context (GSC) Geospatial information has become increasingly available in audio collections due to mobile recording devices and location-aware media platforms that associate recordings with geographic coordinates [18]. Several datasets include geographic or location-related annotations, enabling spatial analyses of urban sound environments and regional differences [15]. However, geospatial information is usually used for organization, mapping, or descriptive analysis rather", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 16, + "total_chunks": 85, + "char_count": 510, + "word_count": 61, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "90bdce0f-0ee7-419d-aaa6-2f4a12c7631c", + "text": "than as an explicit model input for sound event recognition [32]. Points of interest (POI) in geographic information systems translate location into interpretable semantic descriptors. 
POI encodes nearby places and compositions, representing location-tied environmental priors [16]. GSC contains scene-level descriptors that can be paired with audio recordings. However, prior work rarely formalizes POI-derived GSC as part of the AT The lack of consistent task definitions and benchmarks makes it difficult to assess whether and how geospatial semantics should be integrated.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 17, + "total_chunks": 85, + "char_count": 576, + "word_count": 80, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "35749f64-28c1-4932-9267-38752119656c", + "text": "Taken together, prior work leaves AT largely audio-only and rarely evaluates POI-derived GSC as a task input under reproducible protocols. missing piece is a standardized Geo-AT task definition and a benchmark that enables controlled comparisons. The Geo-AT task addresses this gap by defining AT conditioned on sound source-associated, POI-derived GSC alongside audio, enabling controlled evaluation of geospatial priors in AT tasks. The proposed Geospatial Audio Tagging (Geo-AT) task Geospatial audio tagging (Geo-AT) formalizes AT conditioned on sound source-associated geospatial semantic context (GSC) derived from geographic information systems resources, such as Points of Interest (POI). multimodal learning task that enables controlled study of how POI-derived GSC interact with acoustic representations in AT tasks [2, 19]. Given each recording is represented by an acoustic representation A and a GSC vector g ∈RDGSC constructed from geographic information systems, Geo- AT uses a paired input (A, g). 
Geo-AT assumes that g is available as recording", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 18, + "total_chunks": 85, + "char_count": 1061, + "word_count": 151, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "50bdbced-b81c-4fc5-ab87-561869ea6204", + "text": "metadata at inference time, alongside A. The learning objective is to predict the set of event labels present in the clip. Let Y denote the event label set. target for each clip is a multi-label vector y ∈{0, 1}|Y|, where yk = 1 indicates the presence of event k in the clip. Geo-AT aims to learn a function f : (A, g) → y, where g encodes information about the surrounding environment through POI-derived semantic descriptors (e.g., proximity to beaches, highways, train stations, residential areas, or industrial facilities). Geo-AT does not prescribe a", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 19, + "total_chunks": 85, + "char_count": 555, + "word_count": 93, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "f404e8de-785e-4b9e-b578-2ec3932b7912", + "text": "specific integration mechanism between A and g, leaving model design choices open for evaluation under a shared task definition. 
Geo-AT focuses on multi-label tagging rather than single-label classification, emphasizing label prediction under polyphonic conditions, where multiple events may co-occur in a clip. The purpose of Geo-AT is not to replace the AT task, but to study when and how spatial evidence complements audio representations, particularly for acoustically confusable events and polyphonic Geo-AT is motivated by the use of contextual knowledge in auditory perception and location-tied metadata in real deployments [33]. Geo-AT provides a framework for building and evaluating more robust machine listening systems in geographically diverse environments, including urban noise monitoring, context-aware assistive hearing, and scalable acoustic surveillance [18] [34]. The benchmark dataset for the Geo-AT task: Geo-ATBench The audio recordings for Geo-ATBench are sourced from Freesound.org [35], a public repository of user-contributed sounds, as well as from the dataset presented in [33], which includes audio files with GPS information and a diverse Figure 1: The number of recordings with GPS information uploaded to Freesound each year. 
range of sound events.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 20, + "total_chunks": 85, + "char_count": 1281, + "word_count": 181, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c8928dbb-e759-4738-831f-7ea995934545", + "text": "Audio clips were selected based on the inclusion of geotagging information, specifically latitude and longitude coordinates provided by the uploaders, and underwent careful manual review of coordinate validity and obvious mismatches between tags and location for quality control.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 21, + "total_chunks": 85, + "char_count": 279, + "word_count": 38, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "5c096c7b-a72c-4364-9227-3c48ea8516da", + "text": "Sound event and GSC annotation GSC construction: For recordings sourced from Freesound, we specifically select data spanning from 2012 to 2025. This temporal filtering is applied because the scale of geo-tagged audio prior to 2012 is relatively limited, as shown in Fig. 
1, and the geographical information of regions may differ across long time spans.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 22, + "total_chunks": 85, + "char_count": 352, + "word_count": 55, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7a905e92-64ce-4838-abca-1077081f84f6", + "text": "The GPS coordinates of each recording were obtained from Freesound or the original dataset for others. These coordinates are used to query the OpenStreetMap (OSM) geospatial database via the Overpass For each recording with GPS coordinates, a square with a fixed side length is drawn around the location, and OSM entities within this square are identified based on 11 OSM feature keys, covering categories such as land use, amenities, and natural. 
While a circular region may be conceptually aligned with the isotropic nature of sound propagation, a square region is adopted to", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 23, + "total_chunks": 85, + "char_count": 577, + "word_count": 92, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "99d1049d-92b9-4d2d-97af-04d4c304de52", + "text": "enable efficient bounding-box queries within standard OSM-based geographic This choice provides a computationally practical approximation of the local acoustic environment while maintaining spatial consistency The resulting GSC representation is a POI-derived semantic", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 24, + "total_chunks": 85, + "char_count": 268, + "word_count": 32, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "50619350-2c99-46fb-a7bb-7cf9ccf2c4c8", + "text": "Natural Sounds Human Sounds Sounds of Things Bird sounds 1024 8191 Speech 794 5133 Car 463 3068\nCrickets 343 3091 Footsteps 288 2225 Plane 340 3092\nFalling water 325 2922 Music Instru. 
188 1593 Train 165 1291\nFlowing water 319 2774 Music 144 1330 Bell 121 835\nWaves 307 2754 Singing 81 624 Boat 115 927\nInsects(Flying) 137 824 Shout/Scream 79 249 Tram 111 731\nWind 83 737 Laughter 53 125 Vehicle horn 107 293\nExplosion 93 431\nBus 74 461\nSiren 69 509\nMetro 63 454\nHelicopter 58 496\nDog 209 969\nTruck 42 237 Table 1: Sound classes in Geo-ATBench, grouped by Natural, Human, and Thing. Dur.\ndenotes the total duration (in seconds) of each class, and Cnt denotes sample count. Musical\ninstru. abbreviates Musical instrument, and Falling water denotes Falling water/Rain.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 25, + "total_chunks": 85, + "char_count": 766, + "word_count": 134, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d35047cf-41cc-4b2b-8718-f05d3c741d1d", + "text": "descriptor extracted from these OSM annotations and used as the location-tied input described in Section 3. The square side length and the 11 feature keys", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 26, + "total_chunks": 85, + "char_count": 154, + "word_count": 25, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "66e72f4e-8852-495c-be19-7af0870734fc", + "text": "are the same for all clips to keep GSC extraction consistent across the dataset. 
Sound event annotation: Many Freesound clips include user-provided tags, and the perception of audio events is usually based on human hearing. Therefore, each recording is manually reviewed by listening to the audio track and assigning the heard event labels. When a label is uncertain, the recording is replayed and re-checked until a decision can be made. After manual annotation, the labels are cross-validated with the user-provided tags on Freesound.org Recordings with disagreements are re-examined and corrected, and when", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 27, + "total_chunks": 85, + "char_count": 609, + "word_count": 92, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7e420c4e-ff2b-4286-9d2e-55d276630e13", + "text": "needed, the corresponding GPS metadata is used to extract POI-derived OSM annotations as an auxiliary cue to support label verification. each audio clip is paired with its POI-derived OSM annotations to form an Audio–GSC pair in Geo-ATBench. The initial annotation took about 600 person-hours, and cross-validation and re-checking took about 200 additional person-hours, for a total of about 800 person-hours over four months. End-toend dataset collection, preparation, and annotation took about six months. A curation process is performed to map unstructured annotation labels into Figure 2: Summary of sound classes and acoustic similarity. (Left) Distribution of three\ncoarse-grained sound classes. (Right) Intra-class similarity across 28 sound event classes\ncomputed from log-Mel spectrogram features. a controlled vocabulary, resulting in 28 sound event classes. 
are grouped into three main categories aligned with the AudioSet taxonomic structure [26]: 1) Natural Sounds, which include sounds originating from nature; 2) Human Sounds, which encompass sounds produced by humans; and 3) Sounds of Things, which represent mechanical and man-made noises.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 28, + "total_chunks": 85, + "char_count": 1157, + "word_count": 164, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "845bde56-2d14-4ec3-8244-76db8660a88b", + "text": "The sample counts and total durations for these categories are illustrated in Table 1, while the coarse-grained distributions and corresponding intra-class similarities are visualized in Fig. 2 (right), where the violin-plot similarities are calculated based on log-Mel spectrogram features, and similarity is measured using cosine similarity between feature vectors, a widely adopted metric in audio and sound analysis [37]. Additionally, Fig. 3 provides an overview of the dataset's composition, encompassing 28 event types and 11 OSM categories. The dataset is inherently multi-label, accounting for the co-occurrence of multiple sound events within a single 10-second recording. Dataset Organization and Statistics Following cleaning and selection, the final Geo-ATBench dataset comprises 3,854 audio clips, totaling 10.71 hours of audio. 
Each data point consists of a", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 29, + "total_chunks": 85, + "char_count": 872, + "word_count": 122, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d918ae5d-7751-4eb4-a73a-821bf90f2384", + "text": "triplet: (i) a 10-second audio clip, (ii) a multi-label clip-level label vector over 28 event classes, and (iii) a POI-derived GSC representation constructed from OSM annotations over 11 semantic context categories. To ensure consistency", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 30, + "total_chunks": 85, + "char_count": 237, + "word_count": 34, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "cb5ce65d-63a0-4bad-85b3-35e0c1c45acb", + "text": "Figure 3: Sankey diagram summarizing co-occurrence links from left to right: 3 coarsegrained sound classes, 28 fine-grained sound event classes, GSC types, and the Geo-ATBench\ndataset. Flow width indicates co-occurrence strength. This diagram represents the distribution of audio events and GSC types within the dataset, and is not intended to imply precise\nreal-world relationships, as sound occurrences can vary significantly depending on the specific\ngeographical context (e.g., residential roads vs highways). 
for modeling tasks, all collected recordings are processed into a standardized Each audio clip has a fixed duration of 10 seconds, encoded as a singlechannel (mono) WAV file with a sampling rate of 16 kHz and a bit depth of 16.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 31, + "total_chunks": 85, + "char_count": 741, + "word_count": 112, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "0cf4c744-8f0d-41bf-8bce-78bd06332d87", + "text": "For more details and access to the dataset, please visit the project homepage. The GeoFusion-AT framework and instantiations The GeoFusion-AT framework As shown in Fig. 4, GeoFusion-AT provides reference implementations of three typical fusion points for the Geo-AT task on the Geo-ATBench dataset. All variants take paired inputs (A, g) and output multi-label logits z ∈RC for C event classes, followed by a sigmoid for tag probabilities. Figure 4: Overall architecture of the GeoFusion-AT framework for Geo-AT task. GeoFusion-Early: feature-level fusion Early fusion [38], also known as feature-level fusion, integrates geospatial context and acoustic information at the input of the network. begins by transforming the raw audio waveform into a log-Mel spectrogram A ∈R1×T ×F , where T and F denote the number of time frames and frequency bins, respectively. 
Concurrently, GSC vector g ∈RDGSC is projected into a length-F vector g′ ∈RF via a linear transformation: g′ = Wprojg,", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 32, + "total_chunks": 85, + "char_count": 980, + "word_count": 150, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "883fad76-c055-47a7-9b90-8927d706cbbb", + "text": "where Wproj ∈RF ×DGSC is a learnable projection matrix. frequency resolution F for the projection so that g′ can be interpreted as a location-conditioned spectral prior (i.e., a per-frequency weighting/gating signal): different geographic contexts tend to correlate with different dominant sound sources and background noise, which manifest as characteristic energy distributions over frequency bands. The projected vector g′ is then broadcast across the temporal dimension to form a broadcast GSC tensor G ∈R1×T ×F . The audio spectrogram and the broadcast GSC tensor are concatenated along the channel dimension to produce the fused representation Xfused = Concat(A, G) ∈R2×T ×F , which serves as the input to the backbone network. 
When a backbone does not accept a two-channel spectrogram input, an input adapter is applied to map Xfused into the backbone's expected input shape and channel format; all subsequent backbone components remain unchanged.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 33, + "total_chunks": 85, + "char_count": 954, + "word_count": 143, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "8a256db4-4d58-4b45-9cca-880c002a2d73", + "text": "GeoFusion-Inter: representation-level fusion Intermediate fusion [39], or representation-level fusion, combines information in the latent space after each modality has been processed by separate Let Φaudio be an audio encoder that maps an input spectrogram A to an audio embedding Eaudio ∈RDemb, where Demb is the embedding dimension. Similarly, the GSC vector g is processed through a multi-layer perceptron (MLP) projection to produce a GSC embedding EGSC ∈RDemb of the same Here, both embeddings are clip-level representations, implying that temporal information in A has been aggregated by Φaudio prior to fusion. Intermediate fusion implements a symmetric cross-modal attention [40] module that supports bidirectional refinement between the audio and GSC embeddings. 
Given Q, K, V are the query, key and value, attention is computed as Attention(Q, K, V) = softmax(QK^T / √Demb) V, where K = V, and the factor √Demb stabilizes optimization [40].
GeoFusion-Late: decision-level fusion", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 35, + "total_chunks": 85, + "char_count": 894, + "word_count": 124, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d71c7a57-6f39-4eac-b090-fdb913b77001", + "text": "Late fusion [38], or decision-level fusion, combines the outputs of two independent streams, one for each modality. In this paradigm, an audio branch, Φaudio, processes the audio representation A to produce class-wise logits, zaudio ∈ RC, where C is the number of event classes.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 36, + "total_chunks": 85, + "char_count": 278, + "word_count": 44, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b02a0634-0620-4f9e-8e61-9fd60e01bc55", + "text": "In parallel, a GSC branch, ΦGSC, takes the POI-derived GSC vector g as input and produces its own logits, zGSC. The fusion is performed by a weighted combination of these two logits. 
than using a single scalar weight, a learnable, class-specific weighting vector This design assigns a separate GSC weight to each class while keeping the audio branch unchanged.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 37, + "total_chunks": 85, + "char_count": 360, + "word_count": 59, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "3b2f7b66-ae13-426c-91b3-5f25336f0b15", + "text": "The fused logits zfused are computed as: zfused = zaudio + λ ⊙zGSC (1) where ⊙denotes element-wise multiplication, λ is constrained to be nonnegative via a softplus activation function [41], λ = softplus(λraw), and λraw The fusion is performed in the logit (pre-sigmoid) domain, where z denotes class-wise log-odds scores.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 38, + "total_chunks": 85, + "char_count": 322, + "word_count": 50, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "dd4f821c-7958-413f-8668-0a4b26fadcc4", + "text": "Thus, Eq. (1) combines modalityspecific evidence before the final sigmoid mapping to probabilities. class probabilities are obtained by applying a sigmoid function to zfused. 
The GeoFusion-AT framework uses the standard multi-label AT objective Auxiliary losses and regularizers are optional and not", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 39, + "total_chunks": 85, + "char_count": 299, + "word_count": 41, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "03dbb76c-a582-439e-93b8-dca1a961e352", + "text": "required by the framework definition. Instantiations of the GeoFusion-AT framework GeoFusion-AT is instantiated on three representative audio backbones to provide benchmark results for the Geo-AT task. PANNs [20] is a CNN-based pretrained audio backbone, AST [9] is a patch-based Transformer backbone that applies attention over spectrogram patch embeddings, and CLAP [21] is a contrastively pretrained audio–text backbone.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 40, + "total_chunks": 85, + "char_count": 423, + "word_count": 57, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7b651527-28a6-4016-964d-10e130776da2", + "text": "All instantiations follow the definitions in Section 5.1: feature-level fusion (GeoFusion-Early), representation- Figure 5: Instantiations of GeoFusion-Early (feature-level fusion). level fusion (GeoFusion-Inter), and decision-level fusion (GeoFusion-Late). 
code and model checkpoints are available on the project homepage. Instantiations of GeoFusion-Early GeoFusion-Early implements feature-level fusion by constructing an acoustic representation tensor and a broadcast GSC tensor, as shown in Fig. 5. GeoFusion-Early-PANNs. The instantiation on PANNs [20] follows Section 5.1.1. The GSC vector g ∈RDGSC (with DGSC = 768) is linearly projected to a length-F vector and broadcast along time to form a broadcast GSC tensor Audio preprocessing operations are applied to A before fusion. The fused input is Xfused = Concat(A, G) ∈R2×T ×F . The first convolutional layer is adapted to accept two input channels. Weights for the audio channel", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 41, + "total_chunks": 85, + "char_count": 938, + "word_count": 130, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e5b0e3e1-81c4-477e-ba7b-73926b812f39", + "text": "are initialized from the PANNs checkpoint, and weights for the GSC channel are zero-initialized to preserve the pretrained audio pathway at initialization and let the model learn to use g during fine-tuning. For AST [9], GeoFusion-Early is implemented as feature-level fusion in the token sequence. Instead of channel-wise concatenation, the GSC vector g is mapped to the AST embedding dimension and injected as a dedicated [GSC] token. The Transformer input sequence contains the standard [CLS] token, the [GSC] token, and the audio patch tokens. 
The positional embedding table is expanded to (1, Npatches + 2, Demb) (with Demb = 768), and the new [GSC] position is zero-initialized while the original positions retain their pretrained values from the AST checkpoint. uses the output embedding of the [CLS] token. GeoFusion-Early-CLAP. The CLAP audio encoder [21] accepts a spectrogram input and is instantiated with the same two-channel construction as", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 42, + "total_chunks": 85, + "char_count": 954, + "word_count": 146, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2d8ba32d-b6a3-4571-8560-afa26c2c8c4d", + "text": "GeoFusion-Early-PANNs. A broadcast GSC tensor G is constructed from g and concatenated with A to form Xfused. 
Weights for the audio channel are initialized from the checkpoint, while the GSC channel is zero-initialized to avoid perturbing pretrained audio representations early in training.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 43, + "total_chunks": 85, + "char_count": 290, + "word_count": 42, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "3e404f71-a5eb-4d6c-acb0-02dfcf74fe50", + "text": "Instantiations of GeoFusion-Inter GeoFusion-Inter is a representation-level fusion variant that combines the audio embedding Eaudio and the GSC embedding EGSC using the symmetric cross-modal attention module in Section 5.1.2, as shown in Fig. 6. GeoFusion-Inter-PANNs. For PANNs, its pretrained backbone serves as", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 44, + "total_chunks": 85, + "char_count": 313, + "word_count": 42, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "18094d79-97e6-46f0-96d6-fd1dcfd473cd", + "text": "a feature extractor to produce audio embedding Eaudio ∈RDemb (Demb = 2048 In parallel, the GSC vector g ∈RDGSC is projected by a two-layer MLP into GSC embedding EGSC ∈RDemb. The embeddings are combined using the symmetric cross-modal attention module in Section 5.1.2 to produce Efused, which is fed to the classification head to output multi-label logits. 
For AST, the [CLS] output embedding is used as Eaudio ∈RDemb with Demb = 768. The GSC vector g ∈RDGSC has DGSC = 768 and is used to form EGSC at the same dimension.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 45, + "total_chunks": 85, + "char_count": 522, + "word_count": 91, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "edcd0bee-20c7-4d90-8b12-faacd302b047", + "text": "The attention module in Section 5.1.2 produces Efused for tagging. GeoFusion-Inter-CLAP. For CLAP, its audio encoder produces Eaudio ∈ RDemb with Demb = 1024. Concurrently, a two-layer MLP projects the GSC vector g into a matching GSC embedding EGSC. The attention module in Figure 6: Instantiations of GeoFusion-Inter (representation-level fusion). Section 5.1.2 combines the embeddings to produce Efused for tagging. Instantiations of GeoFusion-Late GeoFusion-Late implements decision-level fusion by combining audio logits zaudio and GSC logits zGSC using Eq. 1, as shown in Fig. 7.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 46, + "total_chunks": 85, + "char_count": 585, + "word_count": 84, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ef190254-ae7f-4630-8663-43dc2f274d5b", + "text": "GeoFusion-Late-PANNs. The audio branch is the PANNs model and The GSC branch is an MLP that maps g to zGSC. 
The fused logits zfused are computed via Eq. 1
In each run, the dataset is split into 70% training, 15% validation, and 15% test. A multi-label stratification procedure is used to keep per-label prevalence and co-occurrence patterns comparable across splits so that all event classes are represented in the test The split is performed at the clip level. The GSC representation is not", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 49, + "total_chunks": 85, + "char_count": 426, + "word_count": 66, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e80940c4-e795-47ac-b475-608c93e3a24b", + "text": "constructed from precise geographic identifiers such as GPS coordinates, street Instead, it encodes high-level semantic context derived Specifically, raw OSM tags, such as amenity: school and", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 50, + "total_chunks": 85, + "char_count": 191, + "word_count": 26, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "59c331bb-11a5-4632-a006-66e605c4339a", + "text": "highway: bus stop, are extracted and converted into descriptive strings. resulting strings are encoded using a pretrained BERT model [42], and elementwise mean pooling is applied to the embeddings to capture local land-use characteristics and area semantics. 
Similar GSC patterns may occur across different recording locations, while recordings in the same area may still differ in their Thus, the reported benchmark results should be interpreted", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 51, + "total_chunks": 85, + "char_count": 446, + "word_count": 65, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9c23ed4d-ebf5-4ade-91e5-ea695c009793", + "text": "as evaluating generalization under clip-level partitioning with location-derived semantic context, rather than under strict geographic hold-out. The three backbones (PANNs [20], AST [9], and CLAP [21]) used in this paper are pretrained on large-scale AudioSet [26] and have reported strong performance on AudioSet with 527 audio event classes at the time of their In the benchmark construction for Geo-AT on Geo-ATBench, finetuning is applied to adapt these backbones to the 28-class multi-label task while limiting changes to their pretrained audio representations. through a small learning rate and early stopping. Models are trained on an NVIDIA GeForce RTX 4090 GPU and fine-tuned for a maximum of 100 epochs using the AdamW optimizer with a learning rate of 1e-5. applied to prevent overfitting; training stops if the validation F1 score does not improve for 15 consecutive epochs. The training objective is binary crossentropy (BCE) loss [20]. 
Audio inputs are 10-second clips and are resampled to", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 52, + "total_chunks": 85, + "char_count": 1003, + "word_count": 153, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a77220ba-d313-491b-bacc-94c5784f999f", + "text": "match each backbone's requirements. All models are initialized from pretrained weights, and audio-only baselines are included for comparison. Model performance is evaluated by mean Average Precision (mAP) [20], area under the ROC curve (ROC AUC), and F1 score, with mean ± standard deviation across the 5 independent runs. All metrics are micro-averaged unless", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 53, + "total_chunks": 85, + "char_count": 360, + "word_count": 53, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e5e49ab2-ba06-4ab0-ac56-b1216de0ae24", + "text": "Besides the multi-label AT on 28 event classes, a 3-class coarse-grained AT is reported as a supplementary analysis. code, models, and the dataset, please see the project homepage. 
This section evaluates Geo-ATBench from three complementary perspectives.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 54, + "total_chunks": 85, + "char_count": 254, + "word_count": 36, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "cec69a68-8df3-4317-ab3e-9bc711fa4a24", + "text": "First, the feasibility of performing audio tagging with GSC alone is evaluated as a GSC-only baseline under different POI extraction ranges. Second, audio-only zero-shot baselines are reported for three strong AudioSetpretrained audio backbones to characterize backbone behaviour before finetuning on Geo-ATBench. Third, fine-tuned Geo-AT results on Geo-ATBench are reported for audio-only and GeoFusion-AT variants under identical data splits, enabling a controlled comparison of feature-level, representation-level, and decision-level fusion. Per-label performance changes and error patterns are used to identify which labels and confusions benefit most from GSC, with emphasis on acoustically confusable labels. 
GSC-only baseline and GSC range sensitivity", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 55, + "total_chunks": 85, + "char_count": 758, + "word_count": 98, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c917582c-d602-494f-8b18-0343ea6b965f", + "text": "In practice, sound events differ in how broadly they can be perceived and in how strongly they correlate with nearby place semantics. extraction range affects the amount and composition of POI-derived context available for constructing the GSC vector g. To benchmark the Geo-only baseline on Geo-ATBench, a GSC-only baseline is evaluated under multiple POI extraction ranges, as shown in Fig. 8. For each POI extraction range defined by a distance threshold, implemented as the square side length, a square is centered at the clip's GPS coordinate. Figure 8: Average Precision (AP) for GSC-only multi-label tagging under different POI\nextraction ranges. POI-derived GSC is constructed from OSM entities retrieved with the\nsame 11 OSM feature keys (e.g., land use, amenities) and encoded into the fixed-length GSC\nvector g. 
mAP is computed on the test set over 5 independent runs.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 56, + "total_chunks": 85, + "char_count": 879, + "word_count": 138, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "effc8c64-35fa-4179-adb9-0a0b30cd010f", + "text": "though a circular neighborhood may better approximate isotropic sound propagation, a square region is employed to enable efficient bounding-box queries in OSM-based geographic information systems. OSM entities within the square are retrieved using the same 11 OSM feature keys [36] as in Section 4. resulting POI composition is encoded by a pretrained BERT model [42] into", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 57, + "total_chunks": 85, + "char_count": 372, + "word_count": 56, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c601d6b1-199c-452c-8631-3af9ec3faf00", + "text": "the fixed-length 768-dimensional GSC vector g, and the same GSC-only classifier is evaluated across all ranges. The Geo-only baseline uses BERT-base to produce g, followed by a 3-layer MLP with 1024, 512, and 28 units to perform 28-label multi-label tagging on Geo-ATBench. 
During training, the BERT tokenizer and BERT encoder [42] are frozen, and only the 3-layer MLP classifier is trained. Source code, extracted GSC vectors, and implementation details are available on the project homepage. The GSC-only results increase with larger distance thresholds on Geo-ATBench, and the 1000-metre range yields the highest performance.
In contrast, events associated with fixed sources, such as breaking waves at a shoreline, are typically constrained to more local place semantics; thus, shorter ranges can be sufficient in such cases. In summary, this section presents Geo-only performance on Geo-ATBench with different POI extraction ranges, providing a detailed reference for Geo-only comparison on the proposed Geo-ATBench dataset.
A comparable 28-label zeroshot benchmark is defined by first producing class predictions over the 527 AudioSet labels for each Geo-ATBench clip, and then mapping these 527 outputs to the 28 Geo-ATBench labels using the pretrained Word2Vec model (\"word2vec-google-news-300\") [45], which provides 300-dimensional word em- Figure 9: ROC curves for zero-shot audio-only tagging inference on Geo-ATBench labels. Micro and macro ROC AUC are reported for AudioSet-pretrained PANNs, AST, and CLAP\nafter AudioSet-to-Geo-ATBench label mapping. Overall performance: PANNs (Micro AUC\n0.8576, Macro AUC 0.8409), AST (0.6672, 0.6443), and CLAP (0.8325, 0.8022).", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 60, + "total_chunks": 85, + "char_count": 1299, + "word_count": 175, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "bec72835-9f87-4cef-a404-648b1c49e4af", + "text": "beddings trained on the Google News corpus. The AudioSet-to-Geo-ATBench label mapping and code are released on the project homepage for reproduction. Fig. 9 shows zero-shot audio-only tagging performance on Geo-ATBench for three AudioSet-pretrained backbones. 
PANNs achieves the strongest performance, followed by CLAP, while AST performs the worst under the same AudioSet-to-Geo-ATBench label mapping.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 61, + "total_chunks": 85, + "char_count": 402, + "word_count": 53, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "dd554d95-807a-4a70-9026-e3e7b75e1003", + "text": "Several factors may contribute to First, PANNs is trained to produce strong clip-level tag predictions from log-Mel inputs [20], which can transfer more directly to GeoATBench under label-space mapping. Second, CLAP learns aligned audio-text representations [21], which can preserve semantic separation that remains useful after mapping AudioSet labels to Geo-ATBench labels. Third, AST relies on spectrogram patch tokenization and positional embeddings [9], and its AudioSet pre-training configuration may transfer less effectively to the GeoATBench distribution under direct inference without task-specific adaptation. Similarly, the visualisation of the audio embeddings under zero-shot inference", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 62, + "total_chunks": 85, + "char_count": 699, + "word_count": 91, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2a9d49de-0c6d-4b24-867b-94656afc12a1", + "text": "shows the same trend in Fig. 
10. Embeddings from PANNs and CLAP form more separable clusters across the 28 Geo-ATBench classes, whereas AST embeddings show stronger overlap and concentrate in a smaller region of the embedding space. Higher-resolution versions of Fig. 9 and Fig. 10 are available on the Figure 10: t-SNE visualization of audio embeddings from zero-shot inference for PANNs,\nAST, and CLAP on the 28 Geo-ATBench event classes; clusters show effective separation,\nwhile overlaps highlight acoustically similar classes. project homepage due to space constraints. Fine-tuned Geo-AT results on Geo-ATBench
Audio-only and GSC-only baselines are reported, and GeoFusion-AT variants are compared under identical data splits on the 28-class Geo-AT task. All fine-tuned models are trained on the 28 Geo-ATBench labels, without the AudioSet label mapping used in the zero-shot baselines.
0.867±0.009 GeoFusion-Early 0.812±0.010 0.846±0.010 0.826±0.010 0.954±0.004 0.914±0.009 0.950±0.008\nGain (∆) +0.042 +0.026 +0.002 -0.007 +0.010 -0.016 GeoFusion-Inter 0.824±0.010 0.829±0.003 0.842±0.006 0.964±0.008 0.912±0.011 0.968±0.005\nGain (∆) +0.054 +0.009 +0.018 +0.003 +0.008 +0.002 GeoFusion-Late 0.833±0.007 0.843±0.010 0.831±0.007 0.949±0.004 0.939±0.008 0.966±0.004\nGain (∆) +0.063 +0.023 +0.007 -0.012 +0.035 0.000 Table 2: The mean average precision (mAP) of different models on the Geo-ATBench dataset. The rows labeled Gain (∆) represent the performance difference relative to the audio-only\nbaseline. All metrics are averaged across 5 independent experimental runs. Figure 11: Per-class average precision across 28 classes for the audio-only AST, the GSConly baseline, and GeoFusion-Early-AST, the integration of geospatial information improves\nperformance for multiple classes. 28-class multi-label tagging task, while no significant difference is observed between GeoFusion-Early-AST and GeoFusion-Inter-CLAP (p > 0.5), indicating comparable performance. After fine-tuning, AST yields the strongest overall performance on the 28-class Geo-AT task, followed by CLAP and then PANNs. This ordering differs from the zero-shot baseline ranking in Fig. 9. 
A key difference is that the zero-shot baseline predicts in the 527-class AudioSet label space and then maps the outputs to the 28 Geo-ATBench labels, whereas fine-tuned models are trained directly on Geo-ATBench labels.
In addition to the 28-class fine-grained tagging task, Table 2 shows a supplementary 3-class coarse-grained tagging task that groups the 28 events into Natural Sounds, Human Sounds, and Sounds of Things, as described in Section 4.2.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 68, + "total_chunks": 85, + "char_count": 307, + "word_count": 44, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "79b8330a-a170-46aa-9b11-76ef68a00c76", + "text": "On this coarse-grained task, GeoFusion-Inter-CLAP achieves the best Representation-level fusion improves coarse-grained performance for all three backbones, suggesting that combining audio and GSC high-level representations with symmetric cross-modal attention in the GeoFusion-Inter frame- Figure 12: Per-class AP change (fusion minus audio-only) for the GeoFusion-Early-AST\nexemplar on the 28-class Geo-AT task. work is effective at this level of semantic granularity. To further explore which event labels benefit most from incorporating geospatial semantic context (GSC) under the Geo-AT task, Fig. 12 uses GeoFusionEarly-AST as an exemplar and visualizes per-label average precision (AP) for the audio-only and audio-GSC fusion variants. Fig. 12 also shows the per-label change ∆AP = APaudio+GSC−APaudio, shown by the purple curve. 
Compared with the audio-only reference, incorporating GSC yields more than a 5% AP increase
It is worth noting that Explosion", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 70, + "total_chunks": 85, + "char_count": 757, + "word_count": 124, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a6e214a2-e6d1-4d09-95e0-56042d96f52e", + "text": "shows a near-zero change in this dataset; this pattern is consistent with the Explosion samples retrieved from Freesound.org [35] being dominated by daily activities such as fireworks. Finally, two classes, Speech and Laughter, show", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 71, + "total_chunks": 85, + "char_count": 232, + "word_count": 34, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e96561fb-2c56-4fde-a070-dc29ab3cb238", + "text": "decreases below −5% and are grouped as GSC-nonbenefiting events. be because speech and laughter are broadly distributed across locations, so associating them with POI-derived GSC does not help with recognition. Fig. 12 indicates that GSC helps for the recognition of a majority of sound event classes, has a limited impact on a subset of common sound event classes, and may not help for some widespread human vocalization events that are not related to specific locations and places. 
Human evaluation of the Geo-ATBench dataset To assess how well models trained on the Geo-ATBench dataset align with human auditory judgements, a crowdsourced human listening study is conducted. This study examines the correspondence between model predictions and human multi-label event judgements. Using the collected annotations, (1) annotation agreement is summarized with descriptive consistency measures and chance-corrected reliability statistics, and (2) model–human alignment is assessed by comparing model predictions with aggregated human consensus", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 72, + "total_chunks": 85, + "char_count": 1042, + "word_count": 149, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "0cfa8349-43a3-4cea-bbf9-616a4bbc4af2", + "text": "labels at the clip level. Participants and Experimental Design Ten Chinese participants (3 females, 7 males; M = 22.4, SD = 0.70 years) took part in the assessment experiment. Participants shared a similar language background to support consistent understanding of the annotation interface and The study adhered to the ethical guidelines of Xi'an JiaotongLiverpool University, and informed consent was obtained from all participants. 
To assess the perceptual validity of the Geo-ATBench labels and downstream model predictions, a within-subject human annotation experiment is conducted. Participants listen to 579 Geo-ATBench audio clips and indicate
Across all participants, the mean agreement is 0.97, indicating that participants made similar decisions across audio clips and sound event categories.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 74, + "total_chunks": 85, + "char_count": 1073, + "word_count": 152, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "72da7c9a-4665-4321-ab16-0b12685f069c", + "text": "The annotation matrix is sparse, with only about 4.5% of clip–event positions marked as 1 (present). Such class imbalance inflates raw percent agreement, because the majority of annotations belong to the same (absent) category.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 75, + "total_chunks": 85, + "char_count": 227, + "word_count": 34, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9f200d29-198e-41a4-87fe-673cfa8dc515", + "text": "To obtain a chance-corrected estimate of reliability, Krippendorff's alpha for nominal data is computed. Each clip-by-event pair is treated as one item, yielding 16,212 items across ten participants. 
The resulting reliability coefficient is αnominal(N = 16,212, R = 10) = 0.486, indicating moderate agreement among 10 participants, suggesting variability in auditory perception for multi-label polyphonic sound events in individual auditory ex- perience and interpretations.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 76, + "total_chunks": 85, + "char_count": 474, + "word_count": 64, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "159f1067-130b-4211-9fb0-c8c73c371ab3", + "text": "Given this moderate agreement, majority voting is used to derive clip-level consensus labels for each event as the reference for After data collection, binary event matrices are generated for each participant and aggregated per clip–event pair: a value of 1 is assigned when at least 5 of 10 participants marked \"exist\", and 0 otherwise. Overall, the annotations show high raw agreement but only moderate chancecorrected reliability, which is expected under sparse binary multi-label tagging. Majority-vote consensus labels are used as the clip-level reference for model–human alignment, with cautious interpretation for low-prevalence events. The next subsection compares model predictions against the aggregated human consensus labels to quantify model–human alignment on Geo-ATBench. 
Model–human alignment under two label references To assess how sensitive model evaluation is to the choice of label reference, model predictions are evaluated against two label sets on the same test set of 579 clips: (i) the Geo-ATBench labels and (ii) the aggregated human consensus labels from 10 participants.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 77, + "total_chunks": 85, + "char_count": 1099, + "word_count": 159, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a00eedc6-beb8-4976-a81d-6cd6f70ee4e9", + "text": "A consensus threshold of 0.5 is used, meaning that an event is considered present when at least 5 of 10 annotators labeled it as This comparison aims to evaluate whether the Geo-ATBench label reference is consistent with independent human judgements. Results are reported for the audio-only-CLAP baseline and the GeoFusionAT variant GeoFusion-Inter-CLAP, given its strong performance on the 28- class fine-grained and 3-class coarse-grained Geo-AT tasks. signed-rank tests are performed on the 28 per-class F1 scores under the two label references. 
The result shows that for the audio-only-CLAP, there is no", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 78, + "total_chunks": 85, + "char_count": 607, + "word_count": 91, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "4d12679c-1a19-4d1f-bda9-92c3b0c2f016", + "text": "statistically significant difference in its performance between Geo-ATBench labels and aggregated human consensus labels, and the same conclusion applies to the GeoFusion-Inter-CLAP. Specifically, paired Wilcoxon signed-rank Event F1 (Label) F1 (Human) Dur. Event F1 (Label) F1 (Human) Dur.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 79, + "total_chunks": 85, + "char_count": 290, + "word_count": 38, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "58942a32-313e-409e-9443-250d12290ae3", + "text": "Bird sounds 0.861 0.856 8191 Falling water 0.872 0.786 2922\nSpeech 0.827 0.869 5133 Flowing water 0.774 0.667 2774\nPlane 0.779 0.514 3092 Waves 0.756 0.677 2754\nCrickets 0.883 0.836 3091 Footsteps 0.629 0.607 2225\nCar 0.491 0.404 3068 Musical instru. 0.651 0.667 1593 Table 3: Top-10 event classes (total = 34,843 s ≈9.68 h) by descending total duration in GeoATBench. 
F1 (Label) and F1 (Human) are F1 scores of GeoFusion-Inter-CLAP predictions\nevaluated against Geo-ATBench labels and aggregated human consensus labels, respectively. Dur. denotes total event duration (seconds). Musical instru. denotes Musical instrument,\nand Falling water corresponds to Falling water/Rain.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 80, + "total_chunks": 85, + "char_count": 676, + "word_count": 100, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "f66b4d97-9007-4511-bcf9-81bf0663e075", + "text": "tests indicated that the audio-only-CLAP's F1 score does not differ significantly (W = 181, p > 0.05) between Geo-ATBench labels (F1 = 0.628) and 10 participant human consensus labels (F1 = 0.570). For brevity, Table 3 reports per-class F1 scores for the ten event classes with the largest total duration, while the statistical test uses all 28 classes.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 81, + "total_chunks": 85, + "char_count": 353, + "word_count": 58, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b15b24b1-4bc2-413a-969d-89cea43b3852", + "text": "A similar pattern is observed for the GeoFusion-Inter-CLAP, with stable F1 scores across Geo-ATBench labels (F1 = 0.649) and 10 participant human consensus labels (F1 = 0.592; W = 187, p > 0.05). 
Overall, model evaluation remains consistent under Geo-ATBench labels and aggregated human consensus labels on the annotated test set of 579 clips, and the paired Wilcoxon signed-rank tests do not indicate a statistically significant difference between the two label", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 82, + "total_chunks": 85, + "char_count": 462, + "word_count": 71, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ec544a66-e698-4a58-aea2-e2b68883da0c", + "text": "This complements the inter-rater reliability analysis and supports Geo-ATBench as a human-aligned benchmark for clip-level evaluation. Environmental sound events do not exist in isolation: they are physical phenomena generated and perceived within specific geographic environments. Nevertheless, computational auditory scene analysis (CASA) often treats multilabel audio tagging as an audio-only inference problem. overlap, audio information can be insufficient to separate labels with similar acoustic patterns, and disambiguating cues may lie outside the waveform. In response to this gap, we introduce the Geospatial Audio Tagging (GeoAT) task, which frames multi-label audio tagging conditioned on paired audio and geospatial semantic context (GSC). Geo-AT focuses on POI-derived, location-tied semantics as complementary priors that are not encoded in the This task-level formulation provides a principled foundation for integrating spatial semantics into machine listening, extending the scope of CASA beyond independent signal analysis. Geo-ATBench is released to support reproducible Geo-AT evaluation. 
GeoATBench contains 3,854 clips (10.71 hours) with 28 event classes of real-world", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 83, + "total_chunks": 85, + "char_count": 1192, + "word_count": 157, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "643c6cbe-ff46-4fde-b5cf-df74902df4d8", + "text": "polyphonic audio, and each clip is paired with a POI-derived GSC representation constructed from OSM annotations over 11 semantic context categories. By explicitly encoding the semantic characteristics of recording environments, Geo-ATBench addresses an important resource gap in the field. controlled studies on how spatial context interacts with acoustic representations and offers a shared benchmark for evaluating geospatially grounded sound classification. GeoFusion-AT is introduced to report reference results on Geo-ATBench using feature-level, representation-level, and decision-level fusion with three AudioSet-pretrained backbones, PANNs, AST, and CLAP. backbones and fusion points, incorporating GSC is associated with improved mAP on the 28-class Geo-AT task in most configurations. A crowdsourced listening study with 10 participants further supports GeoATBench as a human-aligned reference on the annotated test set of 579 clips. 
Together, the Geo-AT task, the Geo-ATBench dataset, and the GeoFusion-AT framework provide a concrete basis for studying how location-tied semantics can complement acoustic representations in CASA.", + "paper_id": "2603.10623", + "title": "Geo-ATBench: A Benchmark for Geospatial Audio Tagging with Geospatial Semantic Context", + "authors": [ + "Yuanbo Hou", + "Yanru Wu", + "Qiaoqiao Ren", + "Shengchen Li", + "Stephen Roberts", + "Dick Botteldooren" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10623v1", + "chunk_index": 84, + "total_chunks": 85, + "char_count": 1142, + "word_count": 150, + "chunking_strategy": "semantic" + } +] \ No newline at end of file