diff --git "a/data/chunks/2603.10677_semantic.json" "b/data/chunks/2603.10677_semantic.json" new file mode 100644--- /dev/null +++ "b/data/chunks/2603.10677_semantic.json" @@ -0,0 +1,2567 @@ +[ + { + "chunk_id": "a42677c9-89b7-4ce2-ac78-8abcf43d4dbb", + "text": "Emulating Clinician Cognition via Self-Evolving\nDeep Clinical Research Ruiyang Ren1†, Yuhao Wang1†, Yunsen Liang1, Lan Luo2,\nJing Liu3*, Haifeng Wang3*, Cong Feng4*, Yinan Zhang5,\nChunyan Miao5, Ji-Rong Wen1, Wayne Xin Zhao1* 1Gaoling School of Artificial Intelligence, Renmin University of China,\nBeijing, China.\n2Peking University Third Hospital, Beijing, China.2026\n3Baidu Inc., Beijing, China.\n4Chinese PLA General Hospital, Beijing, China.Mar 5Joint NTU-UBC Research Centre of Excellence in Active Living for\nthe Elderly, Nanyang Technological University, Singapore. *Corresponding author(s). E-mail(s): batmanfly@ruc.edu.cn;\n†These authors contributed equally to this work.\n[cs.AI]\nAbstract", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 0, + "total_chunks": 95, + "char_count": 696, + "word_count": 87, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "42a05b45-6477-48b9-80e8-66b6628bbbd6", + "text": "Clinical diagnosis is a complex cognitive process, grounded in dynamic cue\nacquisition and continuous expertise accumulation. Yet most current artificial\nintelligence (AI) systems are misaligned with this reality—treating diagnosis as\nsingle-pass retrospective prediction while lacking auditable mechanisms for governed improvement. 
We developed DxEvolve, a self-evolving diagnostic agent\nthat bridges these gaps through an interactive deep clinical research workflow. The framework autonomously requisitions examinations and continually\nexternalizes clinical experience from increasing encounter exposure as diagnostic\ncognition primitives.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 1, + "total_chunks": 95, + "char_count": 641, + "word_count": 76, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6cb8a621-a55a-43aa-937c-1e9947123ba6", + "text": "On the MIMIC-CDM benchmark, DxEvolve improved diagnostic accuracy by 11.2% on average over backbone models and reached 90.4%\non a reader-study subset, comparable to the clinician reference (88.8%). DxEvolve improved accuracy on an independent external cohort by 10.2% (categories\ncovered by the source cohort) and 17.1% (uncovered categories) compared to\nthe competitive method. By transforming experience into a governable learningarXiv:2603.10677v1 asset, DxEvolve supports an accountable pathway for the continual evolution of\nclinical AI. The mastery of diagnostic reasoning represents a defining hallmark of clinical expertise, a sophisticated cognitive process where rigorous investigation and experiential\ngrowth are inextricably linked [1–5]. In routine care, a seasoned clinician does not\nmerely identify a disease from a static set of symptoms; they act as a dynamic investigator, navigating uncertainty through active, evidence-driven inquiry [6, 7]. 
Moreover,\neach patient encounter serves as a feedback loop through which clinicians refine their\ninternal mental scripts. Over time, these refinements accumulate into transferable\nexperiential policies that make future decisions more robust and less prone to error [8–\n10]. This dual capacity for systematic investigation and continuous self-improvement\nunderpins the maturation of clinical mastery. Despite remarkable proficiency in medical knowledge synthesis [11–15], current AI\nsystems remain fundamentally misaligned with the cognitive architecture of human\nexpertise. First, a profound process gap exists [16–18]: most clinical AI systems treat\ndiagnosis as a static, full-information task, collapsing the step-wise investigative rigor\nof the bedside into a single retrospective prediction [19–26]. Second and more critically, a developmental misalignment persists: whereas clinical mastery thrives on the\nrefletive consolidation of experience, these systems function as ossified snapshots of\ntheir training data. Devoid of mechanisms to distill longitudinal practice into transferable experiences [27, 28], parameter-based updating leaves much of the learned\nbehavior implicit. This creates a dual challenge of clinical governance: it lacks clinical\nauditability, as the latent logic accrued over time remains impervious to human inspection [29–32], and it precludes procedural governance, leaving the system immune to\nexpert intervention or alignment with evolving standards [33–35]. Consequently, many\nsystems lack an auditable, governed pathway for learning from practice—an ability\nthat in medicine is not merely advantageous but integral to safety. 
Addressing these cognitive misalignments necessitates a conceptual pivot: reconceptualizing the diagnostic process not as a mere route to a prediction, but as the\nessential substrate for longitudinal evolution.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 2, + "total_chunks": 95, + "char_count": 2833, + "word_count": 382, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "fb54e6c6-e164-44e3-8365-a58867f1eb4d", + "text": "To faithfully emulate human diagnostic\nreasoning, an agent must navigate a structured investigative framework that produces\ntraceable trajectories of evidence acquisition and hypothesis refinement that mirror\nthe uncertainty-laden nature of clinical practice [36–38]. Such trajectories provide the\nnecessary learning substrate: they expose what was asked, observed and inferred at\neach step, enabling post hoc attribution, review and distillation of reusable experience\nartifacts rather than embedding all adaptation implicitly in model parameters [39]. By forging a symbiotic link between procedural rigor and governable evolution, it\nbecomes possible to develop agents that not only achieve expert-level performance but\nalso continuously cultivate their mastery that is aligned with the rigorous standards\nof the medical community. In this study, we introduce DxEvolve, a self-evolving diagnostic agent that reconciles the identified gaps in existing medical AI systems by integrating a dynamic\ninvestigative workflow with an explicit experiential learning mechanism (Fig. 1). 
At its\nfoundation, DxEvolve operationalizes diagnosis through deep clinical research (DCR),\nan evidence-centered paradigm that reconfigures static prediction into active inquiry,\nsynthesizing clinical findings with external medical knowledge. Within this substrate,", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 3, + "total_chunks": 95, + "char_count": 1344, + "word_count": 177, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "f7280ef7-aa79-49dc-b160-3f2413dcaad1", + "text": "the agent actively requisitions evidence, refines diagnostic hypotheses as cues emerge,\nand grounds every decision in observations with traceable provenance. Crucially, DxEvolve leverages these high-fidelity trajectories to support longitudinal self-evolution\nby distilling clinical encounters into diagnostic cognition primitives (DCPs)—explicit\ncarriers of clinical experiments that link salient presentation patterns to actionable\nworkup strategies and diagnostic insights. Unlike the opaque black-box updates, DCPs\nprovide a portable repository of clinical expertise that can be selectively recalled\nto navigate future uncertainty. This architecture establishes a transparent pathway\nfor clinician-led oversight and continuous improvement, while offering the practical\nadvantage of bypassing the computationally-intensive and inflexible cycles of offline\nretraining. Systematic evaluation on the MIMIC-CDM benchmark [40] demonstrates that\nDxEvolve consistently enhances diverse backbone models, yielding an 11.2% mean\naccuracy gain over the competitive baseline system. 
Rather than relying on specific\nmodels, the framework's efficacy is architectural: when integrated with state-of-the-art\nbackbones, it attained expert-level proficiency under stringent dynamic constraints,\nachieving 90.4% accuracy and surpassing the 88.8% human expert (Fig. 2c). Beyond\nstatic benchmarks, independent validation at the Chinese PLA General Hospital\nconfirmed the framework's robust portability across institutional and linguistic boundaries. The DCR architecture and distilled DCP repository yielded a 10.2% accuracy\ngain on translated records and a 11.9% improvement on raw Chinese documentation,\nwith advantages extending to diagnostic categories entirely absent from the initial\nrepository (17.1% gain). This sustained performance is underpinned by an evolution process that resolves\nthe developmental misalignment characteristic of static systems. We observed a longitudinal maturation effect, where experience harvested from later-stage encounters\npossessed higher diagnostic utility than earlier encounters. This evolution is further\ncharacterized by an error-driven dividend, where heuristics distilled from diagnostic\nfailures catalyzed greater performance gains than those from successes. Process-level\nanalyses confirm that DxEvolve's investigative behavior aligns with real-world clinical\npractices and established clinical guidelines, ensuring that its progression is grounded\nin sound medical heuristics rather than statistical artifacts. Together, these findings advance a view of clinical AI systems in which competence is defined not only by snapshot performance, but by how reliably an agent\nimproves with exposure when diagnosis is executed as procedural evidence acquisition\nunder workflow constraints. 
Our findings demonstrate that diagnostic excellence is\nnot merely a function of static medical knowledge utilization, but a dynamic capability realized through the synergy of structured investigative workflows and progressive\nexperiential maturation. By operationalizing these core pillars of human expertise,\nDxEvolve establishes that expert-level proficiency emerges when AI moves beyond\nstatistical prediction toward the active, longitudinal cultivation of clinical wisdom.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 4, + "total_chunks": 95, + "char_count": 3288, + "word_count": 413, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "556f1aaf-6b1b-4883-9897-974f507001ce", + "text": "This framework provides a deployable path for clinical systems that couples workflow\nfaithfulness with governance, supporting inspection, curation and controlled updating\nas standards of care and medical evidence evolve. 
To facilitate future research in this\ndirection, we provide open access to our DxEvolve agentic system.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 5, + "total_chunks": 95, + "char_count": 324, + "word_count": 45, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "4c713404-2411-496e-b473-2b0e23ca3beb", + "text": "Full clinical narrative, History Patient Request PE Order labs RequestCT Final all results at once\nDiagnosis\nInteractive Reasoning with Evidence Acquisition Static Reasoning Encounter-time workflow Retrospective chart review b Deep Clinical Research (DCR) Workflow High-salience Encounter Status Plan Next Action\n(Medical evaluation / Searching external sources)\nPositives / Negatives / Open questions:\n• RLQ tenderness, rebound tenderness,\nWBC 15k, elevated CRP…\n• No fever, LFTs normal, urinalysis\nnegative… Execute Evaluations Search Sources Observe Evidence\n• Need imaging for appendix\nvisualization…\nIntegrate & Update Encounter State Patient …\nHistory Action1 Observation1 Action2 Observation2 Action3 Observation3 Dx\n(Request PE) (PE report) (Order Labs) (Lab Results) (Request US) (CT Report) (Final Diagnosis)", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + 
"chunk_index": 6, + "total_chunks": 95, + "char_count": 818, + "word_count": 109, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ac1877f3-568f-4616-aca7-614e379d243a", + "text": "c Experience-driven Self-Evolution Mechanism Episode Trajectory Reuse Next Diagnosis Cognitive Primitive encounter\nPatient History (DCP) A1 & O1 Experience Pattern: Source Case:\nRequest & Observe PE Acute RLQ pain … ID-1234 DCP Repository\nIndexed experience\nA2 & O2 Outcome:\nOrder & Observe Labs Investigation Guidance: Acute Consolidate appendicitis Prioritize CT abdomen… Reflect & A3 & O3\nExtract\nRequest & Observe CT Correctness:\n… Decision Guidance: Incorrect\nHigh suspicion for diagnosis\nDx appendicitis…\nFinal Diagnosis d In-institution MIMIC-CDM Evaluation Cohort Repository MIMIC-CDM DCP DCP Held-out evaluation Indexed experience\nAccrual Pool consolidation\nEncounters for experience under DCR\naccumulation External Hospital Cohort\nCross-institution Out-of-distribution evaluation Fig. 1 DxEvolve: workflow-aligned diagnosis with experience-driven self-evolution. a,\nDxEvolve frames diagnosis as evidence-centered sequential reasoning, contrasting the static, singlepass inference typical of retrospective evaluations using complete records. b, Deep clinical research\n(DCR) workflow. From the patient history context, the agent iteratively plans the next step, requests\nevaluations (physical examination, laboratory tests and imaging) and, when necessary, consults external sources (guidelines and PubMed); only requested observations are revealed and are integrated into\na compact high-salience encounter state to guide subsequent actions until final diagnosis. c, Diagnostic cognition primitives (DCPs). 
After each diagnosis reasoning, DxEvolve consolidates a DCP from\nthe trajectory, consisting of a retrievable presentation pattern and evidence-linked guidance for investigation planning and diagnostic decision-making; DCPs are indexed in a repository and selectively\nreused in later encounters as an action like medical evaluation and searching external sources under\nthe same DCR workflow. d, Cohorts and protocol. DCPs are built from a MIMIC-CDM accrual pool\nthat is strictly non-overlapping with evaluation encounters, then assessed on a held-out in-distribution\nMIMIC-CDM cohort and an external hospital cohort for out-of-distribution evaluation.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 7, + "total_chunks": 95, + "char_count": 2165, + "word_count": 279, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b5484129-4b6f-44f1-9c0d-50d787ddb6f0", + "text": "2.1 Experimental design and the DxEvolve framework To bridge the gap between static biomedical knowledge and dynamic clinical reasoning (Fig. 1a), we developed DxEvolve to operationalize this dynamic reasoning process\nby coupling a high-fidelity investigative workflow with a mechanism for explicit experiential growth. The framework is sustained by two synergistic pillars. First, the deep\nclinical research (DCR) workflow ensures that every diagnostic step remains grounded\nin a traceable evidence base (Fig. 1b). 
Second, a self-evolution mechanism distills these\ninvestigative trajectories into diagnostic cognition primitives (DCPs), effectively transforming individual patient encounters into a library of reusable, governable clinical\nwisdom (Fig. 1c). We designed an evaluation roadmap to rigorously test this framework (Fig. 1d). First, we utilized the MIMIC-CDM benchmark [40], a curated dataset of 2,400 acute\nabdominal presentations designed specifically for stepwise diagnosis. For primary comparisons, we predefined a held-out evaluation cohort (n=400) randomly sampled from\nMIMIC-CDM and reserved all remaining non-overlapping encounters exclusively for\nDCP accrual; unless noted otherwise, all analyses involving DCP retrieval use this\nfixed accrual pool under the same split. To provide a direct anchor to human expertise, we further validated DxEvolve against another encounter split from a published\nclinician-benchmarked reader-study subset [40] (n=80) and reserved all remaining\nnon-overlapping encounters exclusively for DCP accrual in this setting. Finally, to ensure the robustness extends beyond curated environments, we conducted external validation using an independent cohort from the Chinese PLA General\nHospital (N=293). This real-world dataset, which includes diagnostic categories both\noverlapping with and absent from the primary benchmark, provides a stringent test\nof DxEvolve's generalizability across differing healthcare systems, institutional workflows, and documentation practices. 
All evaluations were conducted in accordance with\nstrict data-governance protocols, utilizing locally deployed models to ensure patient\nprivacy and institutional compliance (\"Ethics approval and governance\", Methods).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 8, + "total_chunks": 95, + "char_count": 2238, + "word_count": 292, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "fcaa2915-9cb1-4815-9388-4f3207283c1c", + "text": "2.2 DxEvolve achieves clinician-level diagnostic performance We first evaluated DxEvolve on the MIMIC-CDM evaluation cohort (n=400), where\nFig. 2a exhibited consistent diagnosis accuracy gains (P <0.001) across all base\nLLM backbones comparing with the established CDM baseline [40] (11.2% mean\naccuracy gain) and DxEvolve w/o DCP (9.1% gain). Ablating clinical guideline and\nPubMed retrieval resulted in only a modest mean accuracy decrease (0.9%), suggesting that the core gains primarily arise from workflow scaffolding and experience\nretrieval, with external retrieval providing complementary support in selected cases. Critically, as these gains were achieved using off-the-shelf backbones without weight\nupdates, the improvements reflect the efficacy of the proposed investigative workflow\nand experiential mechanisms rather than task-specific fine-tuning. To characterize the utility of DxEvolve across different clinical scenarios, we stratified encounters by investigative complexity, utilizing the evidence-acquisition volume Fig. 2 Main diagnostic performance results on MIMIC-CDM. 
a, Diagnosis accuracy on the\nMIMIC-CDM evaluation cohort (n=400), reported per pathology and as the average. For each base\nLLM (color), we compare the CDM baseline, DxEvolve without DCP retrieval (DxEvolve w/o DCP),\nand DxEvolve over multiple seeds. b, Accuracy improvement of DxEvolve over the CDM baseline\nstratified by encounter-level diagnostic burden (easy versus hard). Points show the stratum-specific\nimprovement for each base LLM; annotations indicate the improvement in each stratum and the\nbetween-stratum difference. c, Diagnosis accuracy on a reader-study subset of MIMIC-CDM (n=80). Bars report average diagnostic accuracy for CDM and DxEvolve distinguished by light and dark\nshades of the same color, together with single-pass full-information (FI) inference (hatched). Specialist\nmedical LLMs with limited action compliance are reported under FI only. The clinician reference\n(Doctors) corresponds to the published reader-study subset with full information available [40]. of the baseline model as a proxy for diagnostic burden. DxEvolve improved accuracy\nacross all strata, with the most pronounced gains concentrated in the high-burden\ngroup, representing a 40%–169% relative increase in gain magnitude over low-burden\ncounterparts (Fig. 2b). 
We next evaluate DxEvolve against human expertise using a reader-study subset\nof the MIMIC-CDM dataset [40] (n=80).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 9, + "total_chunks": 95, + "char_count": 2469, + "word_count": 336, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c9477c3d-6803-458a-8206-a08962e7ee01", + "text": "In the original reader study, clinicians issued\nretrospective diagnoses under a full-information (FI) regime, where all evidence was\nprovided upfront. In contrast, DxEvolve operated under a significantly more stringent, workflow-aligned regime, requiring it to autonomously decide which evidence to\nacquire and when. Despite this informational disadvantage, DxEvolve attained expertlevel proficiency: paired with state-of-the-art backbones, the agent achieved 90.4%\naccuracy, surpassing the 88.8% human expert (Fig. 2c). Notably, the clinician reference comes from the published reader-study subset under FI conditions; we use it as\nan anchor for human-level performance rather than a head-to-head comparison under\nmatched information access. Intriguingly, DxEvolve surpassed the corresponding single-pass FI baselines across\nbase large language models (LLMs), including medical-domain LLMs (ClinicalCamel\nand MedGemma) evaluated under the FI regime due to their inability to comply with\ninteractive action constraints (Fig. 2c). 
This advantage is consistent with two complementary mechanisms: first, the DCR workflow provides a reasoning scaffold that\nmaintains clinical saliency and prevents the \"cue dilution\" common in long, unstructured records; and second, DCP-guided evolution sharpens uncertainty calibration,\nallowing the agent to prioritize decisive findings. In summary, these results demonstrate that DxEvolve couples workflow-aligned\nexecution with longitudinal self-evolution to reach expert-level diagnostic proficiency. By externalizing improvement through explicit clinical experiences rather than opaque\nparametric changes, the system provides an auditable pathway for achieving highfidelity diagnostic performance that is robust to the complexities of the real-world\nclinical environment. 2.3 External validation supports cross-institution portability\nof experiential gains To evaluate the external validity of DxEvolve, we conducted independent validation\non a cohort from the Chinese PLA General Hospital, representing a substantial shift\n(\"Evaluation cohorts\" in Methods). To decouple institutional variance from linguistic\nfactors, we applied the DCP repository distilled from 2,000 MIMIC-CDM encounters\nto standardized English translations of these clinical records. DxEvolve consistently\nelevated performance across all base LLMs, yielding a 10.2% mean accuracy gain\nover the CDM baseline and a 5% improvement over the DCP-free ablation (Fig. 3a). This sustained efficacy across distinct national and institutional contexts suggests that\ndistilled DCPs capture trans-institutional diagnostic heuristics rather than narrow,\ndataset-specific shortcuts tied to the originating environment. While overall accuracy on the external cohort was comparable to that on\nMIMIC-CDM, we observed notable heterogeneity across disease states. 
a DeepSeek-V3.2 Qwen3-30B Qwen3-235B GLM-4.7 CDM DxEvolve w/o DCP DxEvolve\n(%) 80", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 10, + "total_chunks": 95, + "char_count": 2933, + "word_count": 379, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e1d5758c-9ac2-47f2-a263-8fe1e33b2301", + "text": "Appendicitis Cholecystitis Pancreatitis Mean 20 Diagnostic\nLiver Abscess Urinary Tract Infection Mean Appendicitis Cholecystitis Pancreatitis Mean Fig. 3 External validation on an independent hospital cohort. a, Diagnostic accuracy on\ndiagnoses overlapping with MIMIC-CDM (appendicitis, cholecystitis and pancreatitis) and their\nmean, evaluated using standardized English translations of the structured records. b, Category-level\ntransfer on diagnoses that were never used for DCP accrual (liver abscess, urinary tract infection)\nand their mean, evaluated under the same protocol. c, Robustness to documentation with native\ninstitutional language, evaluated on the same external encounters using the original Chinese records. 
appendicitis and cholecystitis decreased, whereas performance on pancreatitis encounters improved.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 11, + "total_chunks": 95, + "char_count": 824, + "word_count": 103, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "96cb422d-ddc1-49d0-97f8-980e3f156c53", + "text": "While the source of this variance likely reflects institution-specific\nworkup pathways and documentation nuances, highlighting the necessity of evaluating clinical agents across diverse practice environments where diagnostic thresholds\nand recording standards may differ. We further probed the framework's adaptability on diagnostic categories absent\nfrom the initial repository, including liver abscess and urinary tract infection (UTI). In these out-of-distribution settings, DxEvolve yielded a 17.1% mean accuracy gain\naveraged across liver abscess and UTI cohorts over the CDM baseline and a 4.5%\nimprovement over the DCP-free ablation (Fig. 3b). Notably, while liver abscess\nshares the abdominal domain of the original benchmark, UTI represents a distinct a b\nImproved cases 90 30\nTotal cases P = 1.56 × 10 −4\n*** (%)\nP = 1.10 × 10 −5 85 (%) 25 22.6%\nrate *** P = 4.76 × 10 −5\n20 18.8% *** accuracy\n15.8%\n14.9%\n15 experience\n11.2% diagnostic 75\n10 9.1%\nIncorrect Overall 70 Qwen3-30B 5\nDeepSeek-V3\nQwen3-235B\n65 0\nQwen3 Qwen3\n30B 235B 0100200 500 1000 2000 DeepSeekV3.2\nNumber of accrued encounters Fig. 4 Exposure-dependent self-evolution and provenance of retrieved experience. 
a,\nOverall diagnosis accuracy on the fixed MIMIC-CDM evaluation cohort (n=400) as the DCP accrual\npool increases, shown for three representative base LLM backbones. Accuracy improves with additional accrual encounters and then tapers, yielding a saturating learning curve. b, Provenance of\nretrieved experience during evaluation. Bars show the fraction of retrieved DCPs whose source\naccrual episode ended in an incorrect diagnosis (\"incorrect experience rate\"), computed separately\nfor improvement cases and for all evaluation encounters pooled. P values indicate enrichment of\nincorrect-source DCPs among retrievals in improvement cases. These gains indicate that distilled DCPs encode portable, domainagnostic heuristics that transcend specific disease labels. While the full scope of\ntransferability across heterogeneous syndromes warrants further investigation, these\nresults demonstrate the robust scalability of experience-guided evolution in previously\nunencountered clinical domains. Finally, we assessed the cross-lingual robustness of DxEvolve by evaluating its performance on original Chinese clinical records. In this practical deployment scenario,\npatient encounters were processed in their native language, while the underlying reasoning framework and the accumulated DCP repository remained in English. Despite\nthis linguistic mismatch, DxEvolve yielded an 11.9% mean accuracy gain over the\nCDM baseline and a 6.3% improvement over the DCP-free ablation (Fig. 3c). 
Notably,\nabsolute diagnostic accuracy remained comparable to that achieved using standardized English translations.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 12, + "total_chunks": 95, + "char_count": 2781, + "word_count": 390, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "aad6a29e-a34f-40d3-89b1-5b8e9d551ac6", + "text": "These observations demonstrate that the DCR framework\nand experiential heuristics within DxEvolve are language-agnostic, confirming the\nframework's viability in diverse, multilingual clinical environments. Together, these external evaluations demonstrate that DxEvolve's self-evolution\nmechanism confers substantial portability across institutional boundaries, documentation languages, and diagnostic categories. 
By externalizing clinical wisdom as\nsymbolic, governable assets, the framework provides a rigorous trajectory for maintaining high-fidelity performance amidst the inherent heterogeneity of real-world clinical\npractice.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 13, + "total_chunks": 95, + "char_count": 631, + "word_count": 69, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "1c52e694-2418-4ce2-9d2e-67b77d17c162", + "text": "2.4 Self-evolution shows exposure-dependent scaling behavior\nand error-driven correction We next studied whether DxEvolve exhibits exposure-dependent improvement consistent with clinician-like development, and whether the gains can be traced to reusable\nexperience rather than incidental trajectory variation. We therefore quantified selfevolution by scaling the pool of encounters available for DCP accrual while holding\nthe evaluation cohort fixed (\"Evaluation and analysis\" in Methods). Accuracy matured longitudinally, yielding reproducible learning curves across all\nevaluation schedules (Fig. 4a), with a mean accuracy gain of 8.97% after accrual\nover the first 0–1,000 encounters and a further 0.9% gain over 1,000–2,000 encounters. While initial gains were remarkable, trajectories eventually diverged by model\ncapacity: whereas weaker backbones reached an asymptotic plateau, more capable\nmodels sustained incremental growth throughout the accrual period. 
This divergence\nsuggests that the saturation point of experience-guided evolution is governed by the\nbase LLM's reasoning capability; stronger architectures demonstrate a superior ability\nto mine from complex, long-tail scenarios, effectively raising the ceiling of attainable\ndiagnostic expertise. To identify which experiences drive error correction, we analyzed improvement\ncases—encounters where DxEvolve succeeded but its baseline failed. In these cases,\nretrieved DCPs were significantly enriched with experiences distilled from prior diagnostic failures compared to the general retrieval distribution (Fig. 4b). This highlights\nan error-driven dividend, where heuristics rooted in past mistakes contribute more to\nsubsequent performance gains. These results suggest that failures represent high-value\nlearning events, providing the critical corrective logic necessary to navigate complex\ndiagnostic pitfalls that successful encounters may overlook. Together, these analyses connect exposure-dependent performance gains to an\ninspectable mechanism: improvement scales with accumulated experience, and the\nexperience invoked when errors are corrected exhibits a systematic provenance structure. This motivates examining not only how the repository grows, but how the content\nof accrued DCPs matures with continued exposure. 2.5 Self-evolution is accompanied by progressive maturation\nof experience To quantify the functional maturation of the experience repository, we examined\nwhether DCPs accrued in later developmental stages exhibit superior clinical utility\nand broader applicability than early-stage heuristics. 
This progression was validated through blinded expert assessment and comprehensive retrieval-log analyses\n(\"Evaluation and analysis\" in Methods).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 14, + "total_chunks": 95, + "char_count": 2733, + "word_count": 348, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "1ece5d30-6c71-4c8f-9b5c-1d2a7d354656", + "text": "In a clinician reader study blinded to study condition, we randomly sampled 20\nDCPs from an early exposure window (encounters 1–300) and 20 from a late window (encounters 1700–2000). Two clinicians rated each DCP on clinical correctness\n(including safety concerns), actionability (guiding evidence acquisition and hypothesis\nrefinement) and generality (reusability beyond the source encounter and pathology). The robustness of the expert evaluation framework was confirmed by high inter-rater a Early (n = 20) Late (n = 20) P = 0.005 P = 0.16 P = 0.021 P = 0.007\n** n.s. 
* **", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 15, + "total_chunks": 95, + "char_count": 575, + "word_count": 93, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d9b2ea8f-6412-44ea-820c-3f86bbacbd30", + "text": "Clinical Actionability Generalizability Mean\nCorrectness Score c Total cases Improved cases 15.9\nClinical Correctness 14.8\n5 Actionability 15 13.9 13.5\nGeneralizability 12.9 12.4\nscore (%) 10 rate\n2 4 experience\nExpert Late retrieval 5\nBubble size ∝ n 3\nICC (total) = 0.81 3 4 5 Qwen3-30B Qwen3-235B DeepSeek-V3.2\nExpert 1 score Fig. 5 Maturation of accrued experience artifacts with encounter exposure. a, Blinded\nclinician ratings of diagnostic cognition primitives (DCPs) sampled from an early exposure window\n(encounters 1–300; n=20) and a late window (encounters 1700–2000; n=20). DCPs were scored for\nclinical correctness, actionability and generalizability, with the mean shown as an aggregate. Boxes\ndenote interquartile range, centre line the median, and points individual DCPs; two-sided P values\nare shown (n.s., not significant). b, Inter-rater reliability of clinician ratings for the aggregate DCP\nscore (ICC=0.81), supporting the reliability of the clinician assessment. c, Evaluation-time retrieval\nsignal for late-stage DCPs, quantified as the fraction of retrieval events that involve DCPs in the late\nencounter window. reliability for the aggregate DCP scores (intraclass correlation coefficient (ICC)=0.81;\nFig. 5b). 
Late-stage DCPs scored higher across dimensions than early-stage DCPs,\nwith mean clinician rating 4.47 vs 4.17 on a 5-point scale (Fig. 5a). Both sets often\ncontained clinically reasonable guidance, but later DCPs more consistently articulated it in reusable, action-oriented terms (for example, clearer conditional checks and\nescalation cues), whereas early DCPs more often remained context-bound, supporting\ngradual maturation with exposure. To complement clinician ratings with a usage-based signal, we analyzed evaluationtime DCP retrieval logs. Using the same early and late exposure windows, we", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 16, + "total_chunks": 95, + "char_count": 1837, + "word_count": 259, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a08730ff-74ea-4c49-b632-354c8a185170", + "text": "quantified for each DCP (i) retrieval breadth (the number of distinct evaluation encounters in which it was retrieved) and (ii) association with error-correcting episodes\n(retrieval events in encounters where DxEvolve was correct but DxEvolve w/o DCP\nwas incorrect). Retrieval log analyses confirmed that late-stage DCPs possess superior functional utility. While these artifacts maintained a baseline retrieval rate of\n12.4%–13.5% across total encounters, their prevalence increased to 13.9%–15.9%\nwithin error-correcting episodes (Fig. 5c). This enrichment was most pronounced in\nDeepSeek-V3.2. 
Taken together, clinician-blinded ratings and usage-based signals converge on a\nconsistent picture: with continued encounter exposure, DCPs become more reliably\nactionable and more broadly reusable, and their retrieval is increasingly enriched in\nerror-correcting episodes. These findings support that self-evolution involves qualitative refinement of accrued experience artifacts, rather than simply expanding the size\nof the DCP repository. 2.6 DxEvolve's evidence acquisition aligns with clinical\nworkflows and clinical guidelines In workflow-aligned diagnosis, performance depends not only on the final diagnosis but also on whether requested investigations resemble routine care. We therefore\nassessed DxEvolve's evidence-acquisition behaviour at the encounter level, measuring\nalignment with documented investigations and compatibility with common pathways\n(\"Evaluation and analysis\" in Methods). Across the MIMIC-CDM evaluation cohort, DxEvolve exhibited higher consistency\nwith recorded workups on all four trajectory-consistency measures than the standard workflow-aligned baseline (mean overall consistency across base LLMs, 0.89 and\n0.68, respectively), including physical-examination execution, laboratory-test set F1,\nimaging (modality, region) set F1 and action-order concordance. The results indicate\nmore reliable coverage of key investigation types and a workup sequence closer to the\nrecorded workflow (Fig. 6a). We further assessed workup behavior against established clinical guidelines using\na conservative, three-component compliance score that captures (i) whether physical\nexamination was performed before downstream testing, (ii) coverage of guidelinerecommended laboratory categories and (iii) whether the first imaging study matched\nguideline-supported modality–region choices for each condition. 
DxEvolve achieved\nhigher overall compliance than CDM across all evaluated backbones, with distributions shifted toward higher scores and statistically significant paired differences as\nshown in Fig. 6b. Together, these analyses indicate that DxEvolve's improvements extend beyond\nend-point accuracy to more clinically compatible evidence acquisition, rather than\narising from opportunistic or idiosyncratic request patterns. This study presents DxEvolve, a self-evolving diagnostic agent that instantiates diagnosis as an interactive deep clinical research (DCR) workflow, in which clinical evidence", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 17, + "total_chunks": 95, + "char_count": 3020, + "word_count": 380, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6a91b98b-59d2-46f9-9e68-603a29c3c747", + "text": "Physical Exam\nQwen3-235B LaboratoryImaging F1 Tests\nAction Order Physical Exam\nQwen3-30B LaboratoryImaging F1 Tests\nAction Order Physical Exam\nDeepSeek-V3.2 LaboratoryImaging F1 Tests\nAction Order Physical Exam\nGLM-4.7 LaboratoryImaging F1 Tests\nAction Order 0.2 0.4 0.6 0.8 1.0\nAgreement with clinical ground truth CDM DxEvolve\nP = 1.7×10−61 P = 3.9×10−58 P = 3.6×10−13 P = 2.2×10−17", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", 
+ "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 18, + "total_chunks": 95, + "char_count": 384, + "word_count": 56, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d5291cdb-c7c2-48cd-8dd0-ac6dcdc60432", + "text": "60 compliance\nOverall 40 Qwen3-30B Qwen3-235B GLM-4.7 DeepSeek-V3.2 Fig. 6 DxEvolve produces more workflow-consistent investigations and shows improved\nalignment with clinical guidelines. a, Workup consistency. Across the MIMIC-CDM evaluation\ncohort (n=400), DxEvolve shows higher agreement with the documented investigation trace than the\nstandard decision-making baseline CDM for each backbone, spanning whether a physical examination\nwas performed, overlap with recorded laboratory testing, overlap with recorded imaging (modality\nand region), and concordance of the investigation ordering. Points are model-level means; grey lines\nconnect paired results for DxEvolve versus CDM under the same backbone. b, Guideline adherence. Distributions of encounter-level guideline-compliance scores, derived from the mean adherence across\nthree dimensions: physical examination, laboratory investigations, and imaging. Violin plots show\nscore densities; embedded boxplots indicate the median and interquartile range; points mark the\nmean. P values are from paired two-sided comparisons. 
is acquired procedurally through explicit evaluation actions, with optional consultation of external medical sources.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 19, + "total_chunks": 95, + "char_count": 1197, + "word_count": 151, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "322b600a-6d19-4007-ad66-3d9457495871", + "text": "DxEvolve is designed as a governed learning system\nover encounter-level diagnostic trajectories, supporting longitudinal self-evolution by\naccruing and retrieving diagnostic cognition primitives (DCPs) as reusable experience\nartifacts. Across a public, de-identified benchmark of clinical encounters formatted for\nprocedural evidence acquisition, DxEvolve reaches clinician-comparable performance\nunder interactive diagnosis. Importantly, evaluation on an external cohort from a\nChinese tertiary hospital operating in a distinct healthcare system shows consistent\nDCP-enabled gains, supporting the portability of experience under cross-institutional These findings show that workflow-aligned diagnostic agents can reach clinicianbenchmarked performance while preserving auditability, reframing progress from static\nfull-record prediction to governed, evidence-tethered execution and improvement as\nclinical expertise accrues. A central contribution of DxEvolve lies in the experience-driven self-evolution\nmechanism, which renders encounter exposure an explicit learning signal within a\nworkflow-aligned diagnostic process. 
Unlike paradigms that treat each case as a static,\nfull-record input, where all documented findings are provided upfront, DxEvolve operates through procedural evidence acquisition and iterative hypothesis refinement under\nthe DCR framework. This design more closely mirrors the temporal and inferential structure of routine diagnostic workups. By generating standardized, clinically\nauditable trajectories with explicit provenance, DxEvolve learns from practice in a\nmanner analogous to human clinicians. Through this process, DCPs are accumulated\ninto a reusable experience repository and can be retrieved to steer subsequent evidence gathering and diagnostic refinement without parameter updates. When external\nmedical sources are consulted, their evidence can provide additional authoritative corroboration. Empirically, diagnostic performance improved with cumulative encounter\nexposure, yielding a reproducible, exposure-dependent scaling curve. Notably, DCPs\noriginating from prior diagnostic failures were enriched in improvement cases, suggesting an error-driven learning mechanism: unsuccessful episodes preferentially yield\ncorrective effects that reduce the likelihood of repeating similar mistakes in similar\nclinical contexts. Because DCP-based self-evolution remains non-parametric and traceable, these primitives can be inspected, curated, or even retracted as needed. 
This\noffers a practical pathway for governed, longitudinal adaptation, a capability difficult\nto achieve through conventional model training.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 20, + "total_chunks": 95, + "char_count": 2649, + "word_count": 319, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "fe5cfd59-d2f8-4e87-801b-0a7a97162e3f", + "text": "External validation at the Chinese PLA General Hospital confirms that DxEvolve's\nadvantages transcend institutional boundaries, linguistic variations, and diagnostic categories. The DxEvolve's sustained performance across translated and native\nChinese documentation suggests that its distilled experiences capture portable,\nworkflow-level logic rather than language-specific artifacts. Notably, the observed\ngains in diagnostic categories absent from the initial repository underscore a crossdisease generalizability essential for real-world deployment. Collectively, the DCR\nworkflow provides a portable execution substrate for stepwise evidence acquisition\nunder heterogeneous documentation, and DCP-based self-evolution supplies a reviewable mechanism for adaptation as institutions, languages and workup patterns drift. They offer a practical route to maintaining dependable diagnostic performance beyond\nthe originating benchmark. Beyond exposure-dependent performance gains, our results suggest that selfevolution is accompanied by a progressive improvement in the quality of accrued\nDCPs, echoing how clinicians' experiential knowledge can mature with seniority rather\nthan remaining isolated reflections. 
In clinician-blinded assessments, experiences accumulated later scored higher on clinical correctness, actionability and generality than\nearlier experiences, although both stages were broadly clinically reasonable. Consistent\nwith this, usage-based analyses showed that later experiences were retrieved across a\nwider range of evaluation encounters and were more often observed in error-correcting episodes under identical workflow constraints.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 21, + "total_chunks": 95, + "char_count": 1657, + "word_count": 198, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "53c059d6-8d48-418b-adc5-668263dc3638", + "text": "Together, these signals support a maturation process in which accrued experience becomes more reliably actionable and more\nbroadly reusable, rather than simply expanding in volume. In practice, the gains from\nself-evolution reflect experience refinement as well as accumulation. For workflow-aligned clinical agents, terminal diagnostic accuracy is an incomplete endpoint because the agent determines the sequence and intensity of evidence\nacquisition, with downstream implications for test utilization and imaging escalation. DxEvolve's requested investigations matched encounter-recorded workups more\nclosely than the baseline across behavioural concordance measures, and more often\nselected guideline-supported first-line imaging. Together with the accuracy gains, these\nprocess-level improvements suggest that the gains are not primarily explained by indiscriminate escalation of investigations. 
Such process alignment provides an auditable\nsubstrate for governance, enabling calibration of investigation intensity and targeted\nreview of recurrent failure patterns. Notwithstanding these advances, several limitations and corresponding priorities\nfor future work warrant consideration.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 22, + "total_chunks": 95, + "char_count": 1189, + "word_count": 146, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "5f13439b-0476-4594-94c6-9da3e82449e8", + "text": "First, our experiments use de-identified EHRderived records to enable reproducible, auditable measurement of evidence acquisition\nand experience reuse; extending this framework to prospective settings will benefit\nfrom incorporating additional real-world factors, such as clinician–patient interaction. Second, we observe consistent gains when applying distilled experiences to diagnostic\ncategories beyond those represented in the initial repository, supporting portability\nacross disease settings; broader evaluations across diverse case-mix and clinical contexts will further delineate generalizability in complex practice. Third, our current\naction schema emphasizes the core diagnostic-relevant actions required for diagnosis in an interactive workup setting; the framework is naturally extensible to richer\nactions as needed for specific clinical deployments. 
These considerations motivate three\nnext steps: (i) prospective clinician-in-the-loop studies that evaluate workflow fidelity,\nefficiency and patient-relevant endpoints; (ii) expanded multi-institutional and multispecialty evaluation to characterize when and where experience-guided self-evolution\ngeneralizes; and (iii) extension of the action space to incorporate richer operational\nactions while preserving auditability and benchmarking comparability. In summary, DxEvolve links workflow-aligned diagnostic investigation with longitudinal, governed improvement through experience-driven self-evolution. By operationalizing diagnosis as procedural evidence acquisition alongside auditable experience\nconsolidation, the framework reflects two core elements of clinical expertise: systematic investigation within a patient encounter and progressive learning across a career. Consistent with this, DxEvolve reaches clinician-level performance under evaluations\nthat emulate clinically realistic diagnostic constraints, demonstrating that sophisticated diagnostic reasoning emerges when structured investigative protocols are refined\nby an ever-maturing repository of DCPs. By externalizing learning into inspectable\nartifacts rather than opaque parameter updates, DxEvolve aligns AI advancement\nwith the transparency standards essential to clinical safety. 
More broadly, our findings\nsupport governed, auditable self-evolution as a promising direction for clinical AI that\nmust remain reliable as evidence and standards of care evolve.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 23, + "total_chunks": 95, + "char_count": 2400, + "word_count": 284, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a383ec95-e40c-4bd7-981d-0c3c5cf20401", + "text": "4.1 DxEvolve framework DxEvolve is a self-evolving diagnostic agent that closes two coupled gaps observed\nin clinical AI diagnosis: a process gap between static full-information prediction and\nworkflow-aligned stepwise evidence acquisition, and a learning gap in which apparent competence does not accumulate into more reliable evidence-consistent reasoning\nunder uncertainty. DxEvolve operationalizes diagnosis as an evidence-centric deep\nclinical research workflow and the proposed self-evolution mechanism externalizes longitudinal improvement as auditable diagnostic cognition primitives, distilled from and\nreinvoked within the same diagnostic trajectories, without any parameter updates to\nthe base large language models (LLMs). At the core of each clinical encounter, DxEvolve implements a deep clinical research (DCR) framework—an agentic research protocol that treats diagnosis\nas evidence-driven investigation rather than single-pass prediction, while enforcing\nworkflow-aligned constraints on evidence acquisition. Each encounter starts from the\npresenting complaint with limited initial context, mirroring early-stage clinical uncertainty. 
The agent then iteratively plans the next information need, executes a concrete\nacquisition action, and updates an explicit encounter state that integrates newly\nrevealed findings with the evolving hypothesis set and a structured plan for subsequent\nsteps. The DCR workflow thus proceeds through repeated cycles of (i) formulating\nthe next evidence-seeking objective conditioned on the current state, (ii) acquiring\nthe selected information through tool-mediated actions, and (iii) synthesizing the new\nevidence into the state to refine hypotheses and commit to the next investigative\ndecision. The action space is aligned with routine workup operations and includes requests\nfor physical examination findings, laboratory testing results and imaging reports. Because evidence availability and recommended workup choices are often guided by\nevolving clinical guidance and best practices, relying solely on parametric model\nknowledge can be insufficient, particularly early in an encounter when patient-specific\nevidence is sparse. DxEvolve can therefore optionally invoke external medical evidence interfaces (PubMed and clinical guidelines) within the same workflow to support\nevidence-grounded decision-making and to reduce reliance on unsupported rationales. Specifically, clinical guidelines are accessed via dense retrieval through semantic\nvector-space indexing to identify contextually relevant standards, while peer-reviewed\nevidence is sourced through queries to the official PubMed search utilities. The DCR workflow can rapidly obtain long and heterogeneous text (for example,\nmulti-parameter laboratory outputs, narrative imaging reports and retrieved documents), in which weakly relevant or incidental content may dilute clinically decisive\nsignals. 
To mitigate this, DxEvolve applies context engineering by prioritizing clinically\nsalient findings and suppressing incidental content in the running context, performing\nan automatic summarization step that extracts and carries forward diagnostically relevant information when needed. This mechanism preserves continuity of the diagnostic\ntrajectory while maintaining a stable, high-signal representation to inform subsequent Importantly, the DCR-generated diagnostic trajectories can drive longitudinal learning with real encounter-derived workups and outcomes rather than by\nabstract, simulator-specific feedback.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 24, + "total_chunks": 95, + "char_count": 3528, + "word_count": 448, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "01863dcf-15a5-4efc-aa13-273fff32d704", + "text": "The central innovation of DxEvolve is the longitudinal self-evolution mechanism\nthat enables progressive improvement with clinical exposure by accumulating and\nreusing experience from prior episodes, without any parameter updates to the underlying base LLM. This design is motivated by clinician cognition: expertise is not only the\nrecall of medical facts, but the ability to recognize recurring clinical patterns, anticipate high-yield investigations and apply context-appropriate decision rules shaped by\nprior successes and failures. This design externalizes learning into accountable experience artifacts that clinicians can audit, revise or remove, rather than relying on latent\nbehavioural drift. 
After each completed diagnostic episode in the accumulation pool, DxEvolve performs a structured post-hoc consolidation step over the trajectory and distills a\ndiagnostic cognition primitive (DCP) optimized for reuse under uncertainty. Each\nDCP contains three components: experience pattern, test-ordering experience, and\ndiagnostic decision experience. The experience pattern provides a high-salience signature for retrieval, summarizing the presentation and discriminative cues at a level\nintended to generalize beyond the originating patient. The test-ordering experience\nencodes actionable workup guidance for the stepwise setting, including high-yield nextstep evaluations, contingency options when findings are equivocal and safety-oriented\nguardrails that reduce common omissions or inappropriate escalation. The diagnostic decision experience captures evidence-linked implications for hypothesis refinement\nand final decision-making, including discriminative patterns that support or refute\nleading hypotheses, red-flag checks, and corrective lessons when the source trajectory\nexposed an error mode.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 25, + "total_chunks": 95, + "char_count": 1812, + "word_count": 229, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "aa889cb9-70ef-4ce8-bb62-ef23a07aa254", + "text": "DCPs are written as portable guidance rather than narrative\nrationales. 
To support mechanistic analyses and traceable governance, each DCP is stored\nwith lightweight provenance metadata for in-depth analysis, including its exposure\nindex, diagnostic category and whether the source episode produced a correct primary\ndiagnosis. This provenance enables analyses of how DCP sources relate to subsequent\nperformance gains and error correction. During diagnosis on encounters, DxEvolve treats the DCP repository as a growing long-term memory. At the step of deciding to retrieve prior experience, the agent\nderives a retrieval query from its current evidence-grounded state and retrieves a small\nset of candidate DCPs whose experience patterns best match the current presentation. Retrieved DCPs are injected as a bounded context and applied as conditional\nguidance: they may steer evidence seeking, highlight discriminative cues to verify or\nprovide evidence-linked guidance for final diagnostic commitment. To mitigate spurious memory-driven bias, DxEvolve is instructed to use a DCP only when it is\ncompatible with the patient-specific evidence acquired so far and to disregard DCP\nguidance that is irrelevant with observed findings. 
By combining workflow-aligned trajectories with structured DCP consolidation\nand evidence-compatible reuse, DxEvolve provides an accountable pathway for exposure-dependent improvement while preserving transparency and avoiding finetuning-induced shifts in base-model behaviour.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 26, + "total_chunks": 95, + "char_count": 1510, + "word_count": 205, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "22d37b13-d9f5-4d5d-99f1-d67f0ae08584", + "text": "Diagnostic reasoning trajectories and\nDCP examples are shown in Supplementary Section C and D. Benchmark experiments used MIMIC-CDM [40], a clinical decision-making benchmark curated from MIMIC-IV [41]. MIMIC-IV is a large, de-identified electronic\nhealth record resource sourced from routine clinical care at Beth Israel Deaconess\nMedical Center (Boston, MA, USA), including longitudinal structured variables, laboratory measurements and linked clinical documentation [41]. 
MIMIC-CDM inherits\nthis real-world provenance and comprises 2,400 de-identified patient presentations of\nacute abdominal pain spanning four diagnostic categories (appendicitis, cholecystitis, diverticulitis and pancreatitis), formatted for workflow-aligned diagnosis in which\nadditional evidence (such as physical examination findings, laboratory results and\nimaging reports) is revealed only when explicitly requested through the corresponding\naction [40].", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 27, + "total_chunks": 95, + "char_count": 932, + "word_count": 114, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "702bcc8c-95e4-4568-b42d-3afba1de2aa9", + "text": "To prevent label leakage, agent-facing inputs excluded any diagnosis fields or label-bearing metadata. Evidence items were provided as structured text fields in the dataset\nrelease, with field boundaries preserved to avoid inadvertent information disclosure\nthrough formatting, concatenation or re-ordering. 
When multiple items of the same\nevidence type were available, they were retained in their original record order and\nwere exposed only after the agent issued the matching request action.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 28, + "total_chunks": 95, + "char_count": 493, + "word_count": 69, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "99213c4e-beda-47e5-b1c3-7379936e5cd3", + "text": "4.3 Evaluation cohorts Across all experiments, we enforced strict non-overlap between encounters used\nfor longitudinal experience accumulation (i.e., construction of the diagnostic cognition primitive repository, DCP) and those used for evaluation, implemented at the\nencounter level using unique identifiers. For primary comparisons under the deep\nclinical research (DCR) workflow, we predefined a held-out MIMIC-CDM evaluation\ncohort of 400 encounters and kept it fixed across base models, ablations and random\nseeds; all remaining non-overlapping MIMIC-CDM encounters were used exclusively\nfor DCP accrual. To contextualize against published clinician benchmarking, we additionally evaluated on the reader-study subset from Hager et al. (80 encounters; 20 per pathology) [40], which was treated as an independent evaluation cohort and strictly excluded\nfrom DCP accrual. On this subset, we report both workflow-aligned evaluation and\nsingle-pass full-information (FI) inference using identical underlying encounter content, differing only in the information-availability interface (complete record provided\nupfront for FI, with evidence-request actions disabled). 
For external validation, we assembled an independent cohort of de-identified\nencounters (2020–2024) from the Chinese PLA General Hospital (N=293) curated\nwith a standardized record structure, including appendicitis (n=30), cholecystitis\n(n=39) and pancreatitis (n=174), which match diagnostic categories in MIMIC-CDM, as well as liver abscess (n=39) and urinary tract infection (n=11). This composition reflects the natural prevalence and clinical distribution of these conditions within the institution's stream, preserving the ecological validity of the dataset\nand ensuring that the evaluation mirrors the diagnostic challenges encountered in\nunconstrained real-world practice.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 29, + "total_chunks": 95, + "char_count": 1847, + "word_count": 239, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "db6a89c6-d9d8-49de-beab-4a8aa27f672d", + "text": "All external encounters were used exclusively for\nout-of-distribution testing and were never used for DCP accrual. For external-cohort\nexperiments, the DCP repository was built solely from the MIMIC-CDM accrual pool\nusing the same base LLM as in the corresponding evaluation. Records were harmonized to follow the MIMIC-CDM task format, preserving the\ninitial presenting complaint and a pool of candidate evidence items retrievable through\nexplicit requests. Imaging evidence followed the MIMIC-CDM convention by providing\nonly the final narrative report text. 
Owing to source-format constraints, laboratory\ntesting was returned as a consolidated results field, analogous to physical examination\nreturns. To enable controlled cross-institutional evaluation with English-prompted base\nmodels, we produced standardized English translations of the structured records using\nan offline, locally run translation tool with human verification. Translation was performed at the field level to preserve section boundaries and avoid reordering or merging\nacross fields; numerical values, units and unambiguous medical abbreviations were\nretained. For cross-language robustness, we additionally evaluated DxEvolve on the original\nChinese structured records under the same workflow and action schema. In this setting,\nonly the patient-specific encounter content was in Chinese, whereas prompts and the\nDCP repository remained in English.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 30, + "total_chunks": 95, + "char_count": 1424, + "word_count": 192, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b8fa1575-b118-4b93-8584-b68f78f4806c", + "text": "4.4 Ethics approval and governance MIMIC-IV and the derived MIMIC-CDM cohort contain de-identified patient data\nand were accessed via PhysioNet under the required credentialing and data-use agreements, in accordance with the dataset governance policies [40, 41]. 
All analyses were\nconducted on de-identified data, and no directly identifiable information was used for\nmodel evaluation, reporting or dissemination.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 31, + "total_chunks": 95, + "char_count": 413, + "word_count": 56, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "5b3d1bc0-dd4e-45a0-aa9c-396f88224408", + "text": "The external institution cohort from the Chinese PLA General Hospital comprised\nretrospectively collected encounters and was de-identified prior to analysis under institutional policies. Use of these records for this study was reviewed and approved by\nthe hospital's institutional ethics committee of the Chinese PLA General Hospital\n(Approval No. S2020-418-01), with a waiver of informed consent where applicable\nunder the approved protocol. 
Data access was authorized through institutional governance procedures, and all processing and analyses were performed by authorized\nstudy personnel within institutionally approved computing environments.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 32, + "total_chunks": 95, + "char_count": 647, + "word_count": 85, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e45636b2-78c5-4ebe-85e3-3d8620832e38", + "text": "4.5 Models and implementation DxEvolve was implemented as an LLM-orchestrated agent operating in a workflow-aligned diagnostic environment with a constrained action schema, standardized tool\ninterfaces and explicit termination criteria. Across all experiments, we used off-the-shelf, open-weight base LLMs. 
Model inference was conducted locally to satisfy", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 33, + "total_chunks": 95, + "char_count": 355, + "word_count": 45, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b8a2d475-40d7-4d1a-9e11-77d853c8ade7", + "text": "data-governance requirements for both the MIMIC-derived benchmark and the external hospital cohort, which preclude transmitting patient-level content to third-party\nhosted LLM services or external APIs. Base LLMs and inference settings. Unless otherwise stated, all experiments in this study applied Qwen3-30B (Qwen3-30B-A3B-Instruct), Qwen3-235B\n(Qwen3-235B-A22B-Instruct-2507) [42], DeepSeek-V3.2 [43] and GLM-4.7 [44]\nas backbones. To contextualize DxEvolve against domain-specific models, we\nalso evaluated MedGemma [45] (medgemma-27b-text-it) and ClinicalCamel [46]\n(ClinicalCamel-70B). During preliminary testing, these medical-domain LLMs\ndemonstrated insufficient compliance with the structured action-calling protocol\nrequired for workflow-aligned evaluation; specifically, they frequently failed to adhere\nto the pre-specified JSON output format or generated invalid investigative actions. Consequently, these models were evaluated exclusively under the single-pass fullinformation regime. All experiments were run on a local server equipped with NVIDIA\nA100 GPUs (80 GB), without using external hosted services. 
Within each base model,\ndecoding configurations were held fixed across all compared methods and ablations to\nensure that differences reflect workflow and experience mechanisms rather than sampling settings.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 34, + "total_chunks": 95, + "char_count": 1329, + "word_count": 161, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6110b00c-4204-42c8-bb5d-a2ee43b4670d", + "text": "For all evaluated LLMs, we set temperature to 0.1, top-p to 0.7 and\ntop-k to 50. Prompt specification. All workflow-aligned experiments used a single, shared\nprompt contract that defines the action space and semantics, tool-call formatting, the\nagent state representation and the termination criteria. The same prompt template\nwas applied across all evaluated base models without model-specific adapters or taskconditional modifications, ensuring that comparisons differ only in the underlying\nmodel and the enabled system components. Full prompt templates are provided in the\nSupplementary Section A and B. DxEvolve uses a unified dense retrieval stack for both (i)\nexperience retrieval from the DCP repository and (ii) retrieval of external clinical guidelines when enabled. For both retrieval pathways, queries and candidate\ndocuments were embedded using bge-large-en-v1.5 [47] as dense encoder with\nvector-based similarity search (FAISS [48]). 
Similarity was computed by cosine\nsimilarity between ℓ2-normalized embeddings.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 35, + "total_chunks": 95, + "char_count": 1026, + "word_count": 143, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "d4072bab-da48-44a4-9de8-22c43c4a7955", + "text": "Retrieval was performed locally for\nreproducibility and, for sensitive cohorts, to avoid external transfer of patient information. We collected abdominal-condition guideline documents from authoritative\nclinical sources (for example, the American College of Gastroenterology, the World\nSociety of Emergency Surgery and Mayo Clinic) and manually verified relevance,\nauthority and recency, excluding outdated materials and ultimately retained 35 guidelines. The guidelines were converted to structured text, lightly cleaned (for example,\nremoving acknowledgements) before being locally indexed for retrieval. 
PubMed\nretrieval was implemented via the official NCBI Entrez (E-utilities) API, with queries\nrestricted to de-identified, non-patient-specific medical terms (for example, disease and\nsymptom keywords) and containing no patient-level records or identifiable information.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 36, + "total_chunks": 95, + "char_count": 877, + "word_count": 109, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "158dd197-9f9f-43a6-b083-978b13d3535d", + "text": "Baseline details and implementation parity. We use two complementary reference points: a published workflow-aligned baseline (CDM [40]) and an in-framework\nablation (DxEvolve w/o DCP) that isolates the marginal contribution of DCR and self-evolution mechanism.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 37, + "total_chunks": 95, + "char_count": 260, + "word_count": 34, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "325f2549-69c0-41dd-9db2-64240aefb4e4", + "text": "CDM is an established clinical decision-making diagnostic baseline capable of stepwise inquiry but lacking both a specialized investigative\narchitecture for evidence acquisition and a framework for experiential 
evolution. Our\nevaluation strategy prioritizes head-to-head, backbone-matched ablations within a\nunified architectural framework, an approach designed to isolate the specific contributions of workflow grounding and experiential reuse. Direct comparisons with\ngeneral-purpose agent frameworks are confounded by fundamental disparities in their\nunderlying diagnostic paradigms. For instance, most existing models focus on examcentric reasoning like USMLE-style scenarios, or are optimized for patient-physician\ndialogues. These settings diverge significantly from the sequential, uncertainty-laden\ninvestigation inherent to real-world clinical workups, where evidence is latent and must\nbe actively requisitioned. To preserve domain fidelity, DxEvolve is intentionally architected to mirror the structured rigor of actual bedside practice, where evidence is latent\nand must be actively requisitioned. Such divergent information-access constraints and\ninteraction modes make evaluation parity non-trivial; benchmarking against a standardized, workflow-aligned baseline and its corresponding ablations therefore ensures\nthat observed gains are strictly attributable to our architectural innovations rather\nthan artifacts of mismatched task definitions. 4.6 Evaluation and analysis This section defines the evaluation protocol and analysis definitions used throughout\nthe study. We report encounter-level diagnosis accuracy under the DCR workflow,\ncomplemented by regime comparisons against single-pass full-information (FI) inference, exposure-indexed self-evolution analyses based on DCP accrual, and process-level\nmetrics that characterize evidence-acquisition behaviour. All analyses were conducted\non held-out evaluation cohorts with prespecified encounter-level definitions. Episodes, regimes and primary endpoint. Each diagnostic episode starts\nfrom the presenting complaint and limited initial context. 
The agent iteratively issues\nactions to request additional evidence and receives results only for requested items. Episodes terminate when the agent outputs a final primary diagnosis or reaches\na prespecified maximum number of 20 interaction steps.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 38, + "total_chunks": 95, + "char_count": 2365, + "word_count": 291, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9d96e0d1-091f-419c-96a3-20e202e9cff7", + "text": "The primary endpoint is\nencounter-level correctness of the final primary diagnosis; episodes that terminate\nwithout a valid diagnosis output are scored as incorrect.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 39, + "total_chunks": 95, + "char_count": 165, + "word_count": 23, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "85c595df-d3df-452d-a255-3c8cb12886c4", + "text": "We report two regimes that\ndiffer only in information availability and interaction constraints. In the interactive\nregime, the agent must explicitly request evidence and may condition decisions only\non evidence acquired within the episode. 
In single-pass full-information (FI) inference,\nthe model receives the complete record upfront and produces a single-step diagnosis. Single-pass FI inference was evaluated only on the reader-study subset (n=80) as a\nmatched control. Investigative burden and stratification. To analyze the efficacy of DxEvolve\nacross varying levels of diagnostic difficulty, we defined an investigative complexity\nproxy derived from the baseline diagnostic burden. For each encounter, complexity was quantified as the evidence-acquisition footprint—defined as the total number\nof investigative steps required by the baseline CDM model to reach termination. Encounters were stratified into \"high-burden\" and \"low-burden\" groups based on a median split of this footprint across the 400-case evaluation cohort. This stratification\nallowed us to assess whether experience-guided evolution provides differential benefits\nin cases requiring extensive iterative reasoning versus more straightforward clinical\npresentations. Longitudinal self-evolution and improvement cases provenance. To quantify exposure-dependent self-evolution, we varied the number of encounters available\nfor DCP accrual while holding the evaluation cohort fixed (n=400). Accrual encounters were ordered deterministically, and DCP repositories were constructed in a nested\nmanner: at exposure level k, the repository contains DCPs consolidated from the first\nk accrual encounters. This design yields an exposure-indexed learning curve without\nrepeated re-sampling. 
The DCP-free ablation (DxEvolve w/o DCP) is exposure-independent by construction and was evaluated under the same interactive constraints\nas a reference.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 40, + "total_chunks": 95, + "char_count": 1907, + "word_count": 251, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c0216905-6437-493a-84c8-f37f8dd6d7d3", + "text": "To isolate evaluation encounters in which DCP reuse plausibly contributes to error\ncorrection, we defined improvement cases as evaluation encounters satisfying all of the\nfollowing criteria: (i) DxEvolve produced a correct primary diagnosis, (ii) DxEvolve\nw/o DCP produced an incorrect diagnosis under the same workflow constraints, and\n(iii) DxEvolve retrieved at least one DCP during the episode. For provenance analyses,\neach retrieved DCP was labeled by the outcome of its source accrual episode at the time\nof consolidation (correct versus incorrect primary diagnosis). We quantified provenance\nenrichment by comparing the distribution of source-episode outcomes among DCPs\nretrieved in improvement cases against the corresponding distribution among DCPs\nretrieved across the full evaluation cohort (that is, pooling retrieval events over all\nevaluation encounters). 
Unless otherwise stated, provenance analyses were performed\nusing the fixed accrual pool defined by the non-overlapping MIMIC-CDM split.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 41, + "total_chunks": 95, + "char_count": 1008, + "word_count": 139, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "acd89bc0-ac6d-4089-9ec4-e4d53fcbb2af", + "text": "Clinician assessment of DCP clinical maturation. To assess whether DCPs\nconsolidated later in exposure are more clinically useful and reusable, we conducted a\nclinician reader study contrasting an early exposure window (encounters 1–300) and a\nlate exposure window (encounters 1700–2000). For this assessment, we recruited two\nboard-certified internal medicine physicians, one from the Chinese PLA General Hospital, China (with 15 years of clinical experience), one from the Peking University\nThird Hospital, China (with 8 years of clinical experience). Clinicians were masked\nto the exposure window of each DCP and the study hypothesis. 
From each window,\nwe randomly sampled 20 DCPs (40 total).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 42, + "total_chunks": 95, + "char_count": 695, + "word_count": 102, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ad5d7ac2-2339-45d4-9a0b-f820a144c469", + "text": "Each DCP was presented in its native\nthree-part format (experience pattern, test-ordering experience and diagnostic decision experience) with all provenance metadata removed (including exposure index,\nsource outcome and pathology labels) and translated to Chinese via a standardized\ntranslation procedure followed by terminology checks. Two board-certified clinicians\nindependently rated each DCP on a 1–5 ordinal scale across three prespecified dimensions: clinical correctness (including potential safety concerns), actionability (capacity\nto guide evidence acquisition and hypothesis refinement in an interactive workflow)\nand generality (reusability beyond the originating encounter and pathology). Rating\norder was randomized and raters were blinded to sampling window and DCP source. Inter-rater agreement for the clinician ratings was assessed using ordinal-appropriate reliability metrics (quadratic-weighted Cohen's κ and intraclass correlation). Agreement for the aggregate DCP score (mean across the three dimensions) was high\n(weighted κ=0.83, ICC= 0.81), supporting the reliability of the clinician assessment\nfor downstream analyses. 
For analysis and visualization, ratings were aggregated by\naveraging the two clinicians' scores for each dimension and for the aggregate score.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 43, + "total_chunks": 95, + "char_count": 1291, + "word_count": 167, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "550d723e-ebd0-4b0c-a07f-1309d72a2581", + "text": "Process-level behaviour. We assessed evidence-acquisition behaviour by comparing the investigations requested by each method (DxEvolve and the CDM baseline)\nwith those documented in the MIMIC-CDM structured record for the same encounters\n(n=400). All metrics were averaged across encounters.\n• Trajectory consistency. We quantified workup consistency using four complementary measures. (i) Physical examination (PE) agreement was a binary indicator\nof whether the agent requested a physical examination at any point in the episode (1\nif requested, 0 otherwise). (ii) Laboratory-set F1 compared the set of laboratory tests\nordered by the agent with the set recorded in MIMIC-CDM using a set-level F1 score. Before scoring, laboratory item identifiers were canonicalized using a precomputed\nmapping that collapses equivalent codes to a canonical identifier, reducing artefactual disagreement due to coding variations. Precision reflects avoidance of unnecessary\ntests, whereas recall reflects coverage of recorded tests. (iii) Imaging-set F1 was computed analogously, but over sets of (modality, region) tuples extracted from imaging\nrequests, and a match required agreement on both modality and region. 
(iv) Action-order concordance evaluated whether the relative ordering of broad investigation types\nfollowed the reference clinical ordering. We restricted comparison to the intersection\nof investigation types executed by both the agent and the record; if fewer than two\ntypes were present, concordance was defined as 1. Otherwise, we computed pairwise\nconcordance as the fraction of ordered pairs (a, b) consistent with the reference order\nthat were also ordered as a before b in the agent's episode.\n• Clinical guideline adherence proxies. We additionally scored adherence to\nguideline-informed workup expectations using rules-based proxies with three components, reported on a 0–100 scale and averaged to form an overall score. (i) PE timing\nscore captured whether PE was performed as the first workup step (100), performed\nlater (50) or not performed (0). (ii) Laboratory adherence score measured coverage\nof pathology-specific recommended laboratory categories with a two-tier weighting\nscheme: primary tests contributed weight 1.0 each, secondary tests contributed weight\n0.5 each with the total secondary contribution capped by the primary maximum\nto prevent inflation by extensive secondary testing; scores were normalized by the\nmaximum attainable weight for the pathology. (iii) Imaging adherence score evaluated only the first imaging study, scoring whether its modality and region matched\na pathology-specific preferred option (100), an acceptable alternative (50) or otherwise (0), including missing imaging. 
Guideline categories and imaging preferences were\nderived from established society guidelines (WSES [49–51] for appendicitis, diverticulitis and pancreatitis; Tokyo Guidelines [52] for cholecystitis), and this analysis was\nintended as a conservative, descriptive check for gross deviations rather than a claim\nof a single optimal workup for all clinical contexts.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 44, + "total_chunks": 95, + "char_count": 3086, + "word_count": 433, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "0ec3bd54-d062-478a-8d1e-79a61d205f36", + "text": "The MIMIC-IV dataset is available via PhysioNet subject to completion of the required\ndata-access training and a data use agreement. The MIMIC-CDM benchmark used\nin this study is derived from MIMIC-IV and is available from the original release at\nhttps://physionet.org/content/mimic-iv-ext-cdm under the same terms. After obtaining access to MIMIC-CDM, the data preprocessing and cohort-splitting scripts used\nin this study (to reproduce the non-overlapping accrual and evaluation partitions)\nare available at https://github.com/RUCAIBox/DxEvolve. The external cohort from\nthe Chinese PLA General Hospital is not publicly available due to institutional datagovernance requirements. Access to the minimum dataset necessary to reproduce\nthe external-cohort analyses may be considered for qualified researchers, subject to\napproval by the hospital's data governance procedures and execution of an appropriate\ndata-use agreement; requests should be directed to the corresponding authors. 
The code for DxEvolve is available at https://github.com/RUCAIBox/DxEvolve. All\nprompts used in DxEvolve are included in the Supplementary Information. L., Franklin, N. & Gordon, R.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 45, + "total_chunks": 95, + "char_count": 1165, + "word_count": 155, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "580478e4-0164-4f64-9235-8700a3fcc901", + "text": "Diagnostic error in internal medicine. Archives of internal medicine 165, 1493–1499 (2005). [2] Singh, H. & Sittig, D.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 46, + "total_chunks": 95, + "char_count": 118, + "word_count": 18, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "82b198ab-7aae-4c7a-84eb-bce236790c97", + "text": "Advancing the science of measurement of diagnostic\nerrors in healthcare: the safer dx framework. BMJ quality & safety 24, 103–110\n(2015). [3] Singh, H., Meyer, A. The frequency of diagnostic errors in\noutpatient care: estimations from three large observational studies involving us\nadult populations. 
BMJ quality & safety 23, 727–731 (2014).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 47, + "total_chunks": 95, + "char_count": 341, + "word_count": 51, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2636151e-d366-4be7-adf3-bda6888f872c", + "text": "The causes of errors in clinical reasoning: cognitive biases,\nknowledge deficits, and dual process thinking. Academic Medicine 92, 23–30\n(2017). Adverse diagnostic events in hospitalised patients: a singlecentre, retrospective cohort study. BMJ Quality & Safety 34, 377–388 (2025). Improving Diagnosis in Health Care\n(National Academies Press, 2016). [7] Schwartzstein, R. Critical thinking for 21st-century\nmedicine—moving beyond illness scripts. JAMA 334, 1509–1510 (2025). [8] Mahajan, A., Obermeyer, Z., Daneshjou, R., Lester, J. & Powell, D. Cognitive\nbias in clinical large language models. npj Digital Medicine 8, 428 (2025). [9] Ferber, D. 
et al.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 48, + "total_chunks": 95, + "char_count": 654, + "word_count": 92, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ea0c4d08-c860-43f5-9177-6c4a60bd405d", + "text": "Development and validation of an autonomous artificial intelligence agent for clinical decision-making in oncology. Nature cancer 1–13\n(2025). [10] Nenadic, I. et al. Physicians as context engineers in the era of generative AI. Nature Medicine (2026). URL https://doi.org/10.1038/s41591-026-04215-x. [11] Singhal, K. et al. Large language models encode clinical knowledge.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 49, + "total_chunks": 95, + "char_count": 372, + "word_count": 49, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "5819c483-058a-496d-ae40-24f237adbd85", + "text": "Nature 620,\n172–180 (2023). [12] Achiam, J. et al. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023). V., M¨oller, S. & Ryg, J. Use of gpt-4 to diagnose complex clinical\ncases (2024). [14] Savage, T., Nayak, A., Gallo, R., Rangan, E. 
& Chen, J.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 50, + "total_chunks": 95, + "char_count": 257, + "word_count": 43, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e4a44d4f-23c8-4791-9106-79edcb3c6218", + "text": "Diagnostic reasoning\nprompts reveal the potential for large language model interpretability in medicine. NPJ Digital Medicine 7, 20 (2024). Quantifying the reasoning abilities of llms on clinical cases. Nature\nCommunications 16, 9799 (2025).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 51, + "total_chunks": 95, + "char_count": 241, + "word_count": 33, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "bd9b2cc3-1b52-43d1-9eaf-5975894f5ccd", + "text": "Knowledge-practice performance gap in clinical large language models: Systematic review of 39 benchmarks. Journal of Medical Internet Research 27, e84120 (2025). Assessment of large language models in clinical reasoning: a\nnovel benchmarking study. NEJM AI 2, AIdbp2500120 (2025). Reliability of LLMs as medical assistants for the general\npublic: a randomized preregistered study. Nature Medicine (2026). URL https:\n//doi.org/10.1038/s41591-025-04074-y. 
Comparative analysis of multimodal large language model\nperformance on clinical vignette questions. JAMA 331, 1320–1321 (2024). [20] Kaczmarczyk, R., Wilhelm, T. I., Martin, R. & Roos, J.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 52, + "total_chunks": 95, + "char_count": 641, + "word_count": 85, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "60b3157b-bbda-4201-91a7-3fbaba51e916", + "text": "Evaluating multimodal\nai in medical diagnostics. npj Digital Medicine 7, 205 (2024). [21] McDuff, D. et al. Towards accurate differential diagnosis with large language\nmodels. [22] Z¨oller, N. et al. Human–ai collectives most accurately diagnose clinical vignettes.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 53, + "total_chunks": 95, + "char_count": 265, + "word_count": 37, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "bcf4da4a-e905-412f-9a7f-1615a6c56e9e", + "text": "Proceedings of the National Academy of Sciences 122, e2426153122 (2025). [23] Bhasuran, B. et al. Preliminary analysis of the impact of lab results on large\nlanguage model generated differential diagnoses. npj Digital Medicine 8, 166\n(2025). 
Macd: Multi-agent clinical diagnosis with self-learned knowledge for\nllm. arXiv preprint arXiv:2509.20067 (2025). Enhancing diagnostic capability with multi-agents conversational\nlarge language models. NPJ digital medicine 8, 159 (2025). An agentic system for rare disease diagnosis with traceable\nreasoning.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 54, + "total_chunks": 95, + "char_count": 550, + "word_count": 74, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "43353817-1e68-4c0e-88d3-bf55d2249e03", + "text": "[27] Charlin, B., Boshuizen, H. Scripts and clinical\nreasoning. Medical education 41, 1178–1184 (2007).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 55, + "total_chunks": 95, + "char_count": 103, + "word_count": 14, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e9c1889f-264d-4d88-84f7-9960e6ef970d", + "text": "Zaimis, E. (ed.) A-mem: Agentic memory for llm agents. (ed.Zaimis,\nE.) Advances in Neural Information Processing Systems (2025). Agent hospital: A simulacrum of hospital with evolvable medical\nagents. arXiv preprint arXiv:2405.02957 (2024). [30] Food, U., Administration, D. 
et al.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 56, + "total_chunks": 95, + "char_count": 281, + "word_count": 39, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "df3c4d0c-17eb-4816-ba3f-ebd9913763df", + "text": "Transparency for machine learning-enabled\nmedical devices: Guiding principles. US Food And Drug Administration. Retrieved\nJune 30, 2024 (2024).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 57, + "total_chunks": 95, + "char_count": 143, + "word_count": 18, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e385003c-f8be-4870-8961-bde3ec5514bf", + "text": "[31] Babic, B., Glenn Cohen, I., Stern, A. D., Li, Y. & Ouellet, M. A general framework\nfor governing marketed ai/ml medical devices. npj Digital Medicine 8, 328 (2025). A generalist medical language model for disease diagnosis assistance. Nature medicine 31, 932–942 (2025). 
Empirical data drift detection experiments on real-world medical\nimaging data.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 58, + "total_chunks": 95, + "char_count": 354, + "word_count": 53, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "eeb8f3cc-fc38-41ab-b3ce-7127cdfaabe8", + "text": "Nature communications 15, 1887 (2024). [34] Subasri, V. et al. Detecting and remediating harmful data shifts for the responsible deployment of clinical ai models. JAMA Network Open 8, e2513685–e2513685\n(2025).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 59, + "total_chunks": 95, + "char_count": 209, + "word_count": 30, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2e4330f1-837d-407c-aadc-d794a284e74e", + "text": "Zaimis, E. (ed.) Memory injection attacks on llm agents via queryonly interaction. (ed.Zaimis, E.) Advances in Neural Information Processing\nSystems (2025). Foundation models for generalist medical artificial intelligence. Nature 616, 259–265 (2023). Towards conversational diagnostic artificial intelligence. 
Nature 642,\n442–450 (2025).", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 60, + "total_chunks": 95, + "char_count": 337, + "word_count": 41, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "8035a68d-eaf8-47b0-80d0-842fb4af7aa6", + "text": "Sequential diagnosis with language models. arXiv preprint [39] Rajpurkar, P., Chen, E., Banerjee, O. & Topol, E. Ai in health and medicine. Nature medicine 28, 31–38 (2022). [40] Hager, P. et al. Evaluation and mitigation of the limitations of large language\nmodels in clinical decision-making. Nature Medicine (2023). URL https://doi.\norg/10.1038/s41591-024-03097-1. Mimic-iv, a freely accessible electronic health record dataset. Scientific data 10, 1 (2023). Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025). Deepseek-v3. 2: Pushing the frontier of open large language models. Glm-4.5: Agentic, reasoning, and coding (arc) foundation models\n(2025). URL https://arxiv.org/abs/2508.06471. arXiv:2508.06471. [45] Sellergren, A. 
et al.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 61, + "total_chunks": 95, + "char_count": 748, + "word_count": 98, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "10cfba3a-2122-4b7d-8ab9-d8c2bc42922d", + "text": "Medgemma technical report. arXiv preprint arXiv:2507.05201\n(2025). Clinical camel: An open expert-level medical language model with\ndialogue-based knowledge encoding. arXiv preprint arXiv:2305.12031 (2023). [47] Xiao, S., Liu, Z., Zhang, P. & Muennighoff, N. C-pack: Packaged resources to\nadvance general chinese embedding (2023). arXiv:2309.07597. [48] Johnson, J., Douze, M. & J´egou, H.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 62, + "total_chunks": 95, + "char_count": 389, + "word_count": 51, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c40cd1aa-701d-45ea-81c2-6dcdfe04c3ad", + "text": "Billion-scale similarity search with GPUs. IEEE Transactions on Big Data 7, 535–547 (2019). [49] Di Saverio, S. et al. Diagnosis and treatment of acute appendicitis: 2020 update of\nthe wses jerusalem guidelines. World journal of emergency surgery 15, 27 (2020). [50] Sartelli, M. et al. 
2020 update of the wses guidelines for the management of\nacute colonic diverticulitis in the emergency setting. World Journal of Emergency\nSurgery 15, 32 (2020). [51] Lepp¨aniemi, A. et al. 2019 wses guidelines for the management of severe acute\npancreatitis. World journal of emergency surgery 14, 27 (2019). [52] Yokoe, M. et al. Tokyo guidelines 2018: diagnostic criteria and severity grading\nof acute cholecystitis (with videos). Journal of Hepato-biliary-pancreatic Sciences\n25, 41–54 (2018). Supplementary Information A Diagnostic Prompt Template The following is the main diagnostic prompt template of DxEvolve used in all experiments across various base models reported in this paper, with medical examinations,\nexperience retrieval, clinical guidelines, and PubMed search enabled.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 63, + "total_chunks": 95, + "char_count": 1076, + "word_count": 157, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "993bad2e-6daa-4465-8c5e-14a7913045fb", + "text": "Template variables are shown in {braces}. The tags {system tag start}, {system tag end},\n{user tag start}, {user tag end}, and {ai tag start} are replaced with modelspecific chat delimiters at runtime. Supplementary Table 1: Diagnostic Prompt Template. {system tag start}\nYou are a senior physician. Your task is to perform stepwise diagnostic reasoning\nusing ONLY the allowed tools. You must strictly follow one of the two output\nformats below at every step. 
INFORMATION GATHERING\nThought: [1-2 concise sentences: what you know + what uncertainty remains +\nwhy next action is needed]\nAction: [One of: Physical Examination, Laboratory Tests, Imaging, Experience\nSearch, Guideline Search, PubMed Search]\nAction Input: [Specific and valid request, MUST be within tool scope]\nObservation:\n[The system will fill this. DO NOT include any results yourself.] FINAL DIAGNOSIS\nThought: [1-2 concise sentences summarizing key findings leading to the diagnosis]\nFinal Diagnosis: [Single, clear, concise, and standard diagnosis. (Avoid overly complex or speculative etiological chains, focus on the most likely and commonly\nrecognized diagnosis.)]", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 64, + "total_chunks": 95, + "char_count": 1135, + "word_count": 166, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "73740833-07fa-429a-bdc8-36a9f0faafd1", + "text": "You MUST always follow the exact format (A or B). For any test, ONLY request those allowed by the corresponding tool.\n- Laboratory Tests: only valid lab names.\n- Imaging: must specify ' ' format (e.g., 'Abdomen\nUltrasound', 'Abdomen CT').\n- No invented tests, no unsupported modalities.\n3. Before giving the final diagnosis, you MUST explicitly perform all three core\ntypes of medical evaluation as actions – at least one Physical Examination, one\nLaboratory Test, and one Imaging.\n- Consider all clinically relevant imaging modalities for the suspected condition. 
Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 65, + "total_chunks": 95, + "char_count": 605, + "word_count": 93, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "4346664b-c878-48d7-82d2-d68aa4e0afe7", + "text": "- Do not omit a modality that is commonly recommended or diagnostically critical\nunless it is clearly inappropriate.\n4. You MUST use Experience Search at least once before giving the final diagnosis.\n- In Action Input you SHOULD provide a short case style description of this\npatient (age, sex, chief complaint, symptom pattern, duration, key exam or lab or\nimaging findings), not just a single disease keyword.\n- If the retrieved experience is clearly irrelevant or not useful, you may reformulate\nthe Action Input once and try a second Experience Search query. Do NOT keep\nsearching repeatedly.\n- Only integrate insights that are consistent with this patient's objective data.\n5. You MUST use Guideline Search at least once before giving the final diagnosis.\n6. Stop when a confident diagnosis is possible based on available information.\n7. When using Experience Search, Guideline Search, or PubMed Search, integrate\nonly relevant insights into your Thought and proceed; do not rely on them if they\nconflict with patient-specific objective data.\n8. If uncertainty remains but no high-yield action exists, you MUST provide the\nbest-supported diagnosis (Format B) based on currently available data, without\nloop actions indefinitely. CRITICAL FORMAT RULES:\n1. 
MUST output the \"Observation:\" label immediately after Action Input as a\nsignal to pause for respond.\n2. Keep \"Action\", \"Action Input\" and \"Final Diagnosis\" fields concise and to the\npoint. AVAILABLE TOOLS:\n- Physical Examination: Request physical examination of patient and receive the\nobservations. This is a strongly recommended Examination in the clinical diagnostic\nprocess and should be performed first.\n- Laboratory Tests: Request specific laboratory test and receive text values. Specify\ntest names in 'Action Input' clearly.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 66, + "total_chunks": 95, + "char_count": 1793, + "word_count": 274, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ce69bee7-5381-48c2-8597-43d72c41e79b", + "text": "This is a common diagnostic step in the clinical\nevaluation.\n- Imaging: Request imaging scans and receive the radiologist report. Region AND\nmodality MUST be specified in the 'Action Input' field.\n- Experience Search: Dense retrieval over past diagnostic cases. Action Input\nSHOULD be a short case style description of this patient, not just a disease name.\n- Guideline Search: Retrieve relevant clinical guidelines. Provide a concise clinical query in \"Action Input\" (symptoms, suspected diagnosis, key labs/imaging, or\ndecision point).\n- PubMed Search: Conduct targeted search on PubMed and receive relevant medical\narticles. Concise and specific search query (few KEYWORDS) MUST be specified\nin \"Action Input\". 
Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 67, + "total_chunks": 95, + "char_count": 736, + "word_count": 110, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b0a3b14d-cd2f-4143-8efb-5aea8707786a", + "text": "BE EFFICIENT: Prioritize high-yield diagnostic actions before broad or low-yield\nones. Some medical examination information may not be available, do not focus\non the unavailable data, make full use of the information that can be obtained to\ndiagnose.\n{system tag end}{user tag start} Patient History:\n{input} BEGIN YOUR DIAGNOSTIC PROCESS:\n{user tag end}{ai tag start}\nThought:{agent scratchpad} The prompt instructs the LLM to act as a senior physician performing stepwise diagnostic reasoning in an action-based loop. Two output formats\nare enforced: Format A for iterative information gathering (Thought →Action →\nObservation) and Format B for the final diagnosis with thought. B Experience Construction Prompt Template After each diagnostic case is completed, the following template is used to distill the\ncase into a reusable diagnostic cognition primitive (DCP) through reflection on the\ndiagnostic trajectory. The DCP is stored in the DCP repository for retrieval in future\ncases. Supplementary Table 2: Experience Construction Prompt. {system tag start}\nYou extract reusable diagnostic reasoning experience from completed clinical cases\nfor future tool using agents. 
Your goal:\n- Do NOT retell the full case or reproduce chain of thought.\n- Do NOT include treatment.\n- Distill ONE Diagnostic Cognition Primitive (DCP): a short heuristic that improves\nfuture diagnosis. The DCP must:\n- Be consistent with the ground truth diagnosis and the correctness flag.\n- Focus on diagnostic reasoning, not management or consultation.\n- Emphasize when and how to use ONLY the following tools in future similar cases:\n- Physical Examination (no additional input)\n- Laboratory Tests (input: names of the lab tests to run)", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 68, + "total_chunks": 95, + "char_count": 1714, + "word_count": 259, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "2494151a-cc8d-4d03-b0ef-b83634b66755", + "text": "Continued on next page - Imaging (input: imaging modality and region to be scanned) Tool input templates (copyable):\n- Physical Examination\n- Laboratory Tests: , , ...\n- Imaging: modality=, region= Coverage constraints:\n- Only recommend tests or imaging settings that are explicitly supported by the\nprovided case context, meaning they appear in at least one of:\n1) Clinician test orders (from the chart). 
Use this as a high quality reference for\nrealistic first line test selection and sequencing.\n2) Diagnostic steps where the tool call succeeded (has a non-error observation)\n3) Rule based feedback 'message' or retrieved guidance that explicitly recommends a specific test or imaging setting\n- Prefer to fully cover the explicitly provided clinician orders and successful tool\ncalls before adding anything else.\n- Do not invent new tests, imaging modalities, regions, or non-provided measurement names. Field roles:\n- Experience Pattern:\n- Case-style trigger pattern for retrieval, built from symptoms, basic context, and\nkey objective findings.\n- You may append compact labels such as the final correct diagnosis and common\nmisdiagnoses to improve retrieval.\n- Test Ordering Experience:\n- Constructive test-ordering heuristic using only the allowed tools and toolcompatible inputs.\n- You may rank actions by priority and specify escalation criteria, in natural\nclinical language.\n- Avoid blanket prohibitions. If a test is lower priority, express it as conditional\nor deferred rather than discouraged.\n- When naming tests or imaging, use the copyable tool input templates above.\n- Diagnostic Decision Experience:\n- Short rule on how to weigh key findings and move from differential diagnosis to\nthe correct final diagnosis. 
Error correction rules:\n- If correctness is \"Correct\":\n- Treat the model's diagnostic process as broadly appropriate.\n- Extract the most reusable diagnostic pattern and test ordering heuristic.\n- If correctness is \"Incorrect\": Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 69, + "total_chunks": 95, + "char_count": 2022, + "word_count": 304, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9cefc019-9a2d-4bf1-94bf-e0c44bd5e869", + "text": "- Treat the model's final diagnosis and reasoning as a negative example.\n- Do NOT justify or reuse the incorrect diagnosis.\n- Use the ground truth and the rule based feedback in 'message' as the primary\nreference.\n- Base the DCP on the ideal diagnostic process implied by that feedback. Input fields:\n- Patient input: raw case description.\n- Diagnostic steps: chronological list of tool calls and observations.\n- Model final diagnosis: what the model concluded.\n- Ground truth diagnosis: correct diagnosis label for this case.\n- Correctness flag: \"Correct\" or \"Incorrect\".\n- Rule based feedback: comments about missing exams, unnecessary tests, wrong\nimaging, and efficiency.\n- Clinician test orders (from the chart): tests ordered by the treating clinician as\ndocumented in the chart, expressed with the same tool names and inputs, and\nserving as a realistic reference for first line test selection and sequencing. 
Case context:\nPatient input:\n{input} Diagnostic steps:\n{intermediate steps} Model final diagnosis:\n{output} Ground truth diagnosis:\n{ground truth} Correctness flag:\n{correctness} Rule based feedback on process:\n{message} Clinician test orders (from the chart):\n{clinician} Now output exactly in this format: Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 70, + "total_chunks": 95, + "char_count": 1246, + "word_count": 189, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "86942047-03fb-44eb-84c7-44fa58af8451", + "text": "Experience Pattern: \nTest Ordering Experience: \nDiagnostic Decision Experience: \n{system tag end} This template implements the Experience Construction module that\ngenerates DCPs from completed cases. Each DCP consists of three fields: (1) Experience Pattern, a case-style trigger description optimized for dense retrieval; (2) Test\nOrdering Experience, a prioritized test-ordering heuristic grounded in clinician orders\nand successful tool calls; and (3) Diagnostic Decision Experience, a concise rule\nfor weighing findings toward the correct diagnosis. The {message} variable contains rule-based evaluator feedback on the diagnostic process, which identifies missing\nexaminations, unnecessary tests, or procedural deviations based on pathology-specific\nevaluation criteria. For example, if the agent failed to request appropriate imaging for\nsuspected appendicitis, the feedback might state: \"Imaging: no appropriate abdominal imaging was requested. 
Set region='Abdomen' and request imaging (ultrasound\nis typically preferred in pediatric or pregnant patients, while CT is generally recommended for adult non-pregnant patients).\" This feedback guides the DCP construction\nto emphasize the correct diagnostic workflow.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 71, + "total_chunks": 95, + "char_count": 1598, + "word_count": 210, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "36cb8573-3552-40f9-a689-7000ecb5cf1f", + "text": "The {clinician} variable provides real\nclinician test orders extracted from the MIMIC-IV chart, serving as a high-quality\nreference for realistic test selection and sequencing. C Example Diagnostic Cognition Primitive The following is a representative DCP generated through reflection on the diagnostic\ntrajectory from a correctly diagnosed case of acute biliary pancreatitis. This DCP is\nstored in the DCP repository and retrieved via vector-based dense retrieval when the\nagent encounters similar presentations in future cases. Supplementary Table 3: Example DCP (Correct Case). Experience Pattern:\nPost-cholecystectomy patient with acute RUQ/back pain, elevated liver enzymes\nand lipase. (Acute pancreatitis, DDx: Biliary pancreatitis vs. 
other etiologies) Test Ordering Experience: Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 72, + "total_chunks": 95, + "char_count": 808, + "word_count": 109, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "b953f0b3-ff96-4aee-b115-bf7407a9fdc7", + "text": "First, confirm pancreatitis with Laboratory Tests: Lipase, Amylase, CBC, CMP. Concurrently, order first-line biliary imaging: Imaging: modality=Ultrasound,\nregion=Abdomen. If ultrasound is negative for stones/dilation but liver enzymes remain elevated,\nescalate to definitive biliary evaluation (ERCP) per clinician orders; do not escalate\nto CT or MRCP without specific indications (e.g., concern for complications or\nfailed ERCP). Diagnostic Decision Experience:\nIn a post-cholecystectomy setting, acute pancreatitis with concurrent transaminitis/hyperbilirubinemia is biliary in origin until proven otherwise, even with a\nnegative initial ultrasound, as microlithiasis or sphincter dysfunction may be the\ncause. Below is a second example DCP generated from an incorrectly diagnosed case,\ndemonstrating the error correction mechanism. The agent originally diagnosed \"adhesive small bowel obstruction\" but the ground truth was cholecystitis. Supplementary Table 4: Example DCP (Incorrect Case). Experience Pattern:\nYoung to middle-aged female with acute right abdominal pain, sharp on palpation,\nbilious vomiting, chills, and history of prior abdominal surgeries (e.g., laparoscopies). Past medical history of endometriosis. 
(Correct: cholecystitis; Common\nmisdiagnosis: adhesive small bowel obstruction)", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 73, + "total_chunks": 95, + "char_count": 1305, + "word_count": 165, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "83049e30-22e6-4fbd-85d9-8d42322541e7", + "text": "Test Ordering Experience:\n1. Physical Examination.\n2. Laboratory Tests: CBC differential, CMP, (Blood) Lactate, (Urine) HCG.\n3. Imaging: modality=Ultrasound, region=Abdomen. Escalate to further imaging (e.g., CT) only if ultrasound is non-diagnostic and\nclinical suspicion for obstruction or other complication remains high. Diagnostic Decision Experience:\nIn a patient with right upper quadrant or right-sided abdominal pain, vomiting,\nand chills, prioritize gallbladder pathology. 
A history of prior surgery should not\nprematurely anchor to adhesive obstruction; a finding of gallstones on ultrasound,\nespecially with local tenderness, strongly supports cholecystitis over obstruction.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 74, + "total_chunks": 95, + "char_count": 687, + "word_count": 88, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "dc7ac002-333e-48ca-9cd0-aeea7bc32dbf", + "text": "The first DCP illustrates how a correctly diagnosed case is consolidated\ninto a reusable experience artifact. The experience pattern provides a high-salience\nsignature for retrieval, summarizing the presentation and discriminative cues. The test-ordering experience encodes actionable workup guidance, including high-yield next-step evaluations and contingency options. The diagnostic decision experience captures evidence-linked implications for hypothesis refinement and final decision-making. The\nsecond DCP demonstrates how corrective lessons are incorporated when the source\ntrajectory exposed an error mode: when the agent misdiagnosed cholecystitis as small\nbowel obstruction in a case with atypical presentation, the DCP was constructed from\nthe ground truth and evaluator feedback, explicitly labeling the common misdiagnosis\nand providing the correct reasoning pathway. 
D Example Diagnostic Reasoning Trace The following is a complete diagnostic reasoning trace from a real case in the MIMIC-CDM benchmark, showing the agent's stepwise process from initial presentation to\nfinal diagnosis.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 75, + "total_chunks": 95, + "char_count": 1099, + "word_count": 144, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "a1baab34-1052-4901-b76d-a6d921a75847", + "text": "The case involves an elderly patient with diabetes presenting with\nacute right upper quadrant abdominal pain, ultimately diagnosed with acute calculous\ncholecystitis. Supplementary Table 5: Example Diagnostic Reasoning Trace. Elderly male patient with history of type 2 diabetes mellitus on insulin, hypothyroidism, hypertension, and prostate cancer status-post radiotherapy presented to\nthe emergency department with acute onset abdominal pain. The patient reported\nthat the pain began suddenly at approximately 3 AM, waking him from sleep. He\ndescribed it as sharp, constant, and localized to the right side of the abdomen. When\nthe pain persisted, he initially attempted to contact his primary care physician but\nwas unable to secure an appointment, prompting him to call emergency services. In the emergency department, initial vital signs were temperature 98°F, blood pressure 150/86 mmHg, heart rate normal, respiratory rate 16 breaths per minute,\noxygen saturation 100% on room air. 
Initial laboratory studies showed lactate 2.3\nmmol/L, glucose 279 mg/dL, normal liver function tests, and hematocrit 33.2%.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 76, + "total_chunks": 95, + "char_count": 1113, + "word_count": 161, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "8de6d4a4-acea-4582-8262-2cc6348f10d8", + "text": "Right upper quadrant ultrasound demonstrated extrahepatic biliary duct dilation\nwith common bile duct measuring 10 mm and presence of gallstones. Computed\ntomography of the abdomen and pelvis confirmed these findings. He was treated\nwith intravenous fluids, morphine for pain control, and ondansetron for nausea. On admission to the hospital floor, the patient reported improved but persistent\nright upper quadrant abdominal pain. He endorsed dry heaves and constipation,\nwith his last bowel movement occurring two days prior. He denied fever, chills, chest\npain, or shortness of breath. Continued on next page Past Medical History: Prostate cancer status-post radiotherapy, type 2 diabetes\nmellitus on insulin, hypertension, hypothyroidism, chronic kidney disease stage III,\nhistory of gallstones, nephrolithiasis, osteoarthritis, glaucoma, cataracts, sickle cell\ntrait, hearing loss. 
Past Surgical History: Pilonidal cyst excision, bilateral inguinal hernia repairs.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 77, + "total_chunks": 95, + "char_count": 968, + "word_count": 132, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e5c5522b-6436-45ea-a9a8-e6e9ba60b9ab", + "text": "Social History: Details not specified. Family History: Type 2 diabetes mellitus in family members; no family history of\nprostate cancer. Step 1: Physical Examination Action: Physical Examination Observation:\nVitals: Temperature 98.2°F, BP 148/68 mmHg, HR 90 bpm, RR 20, O2 sat 100%. General: Alert and oriented, no acute distress. HEENT: Sclerae anicteric, moist mucous membranes, oropharynx clear, pupils\nequal and reactive to light, bilateral cloudy lenses (cataracts). Cardiovascular: Regular rate and rhythm, normal S1 and S2 heart sounds, no\nmurmurs, rubs, or gallops. Pulmonary: Clear to auscultation bilaterally, no wheezes, rales, or rhonchi. Abdomen: Soft, tender to palpation in the right upper quadrant, non-distended,\nbowel sounds present, no organomegaly, no rebound tenderness or guarding. Genitourinary: No Foley catheter in place. Extremities: Warm and well-perfused, 2+ pulses bilaterally, no clubbing, cyanosis,\nor edema. 
Neurological: Cranial nerves II-XII grossly intact, strength grossly intact, noted\nhearing loss.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 78, + "total_chunks": 95, + "char_count": 1036, + "word_count": 144, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "10e9e0f5-b2f0-4dd9-bfa2-3523b334652a", + "text": "Step 2: Laboratory Tests Continued on next page Action: Laboratory Tests Action Input: Complete blood count with differential, comprehensive metabolic\npanel, lipase Observation:\nWhite Blood Cell Count: 7.6 K/uL (reference range: 4.0–11.0 K/uL) Neutrophils: 89.1%, Absolute Neutrophil Count: 6.77 K/uL [Elevated, indicating\nneutrophilia] Lymphocytes: 6.4%, Absolute Lymphocyte Count: 0.49 K/uL Monocytes: 3.2%, Absolute Monocyte Count: 0.24 K/uL Eosinophils: 0.4%, Absolute Eosinophil Count: 0.03 K/uL Basophils: 0.5%, Absolute Basophil Count: 0.04 K/uL Red Blood Cell Count: 4.73 M/uL Hemoglobin: 11.2 g/dL (reference range: 13.5–17.5 g/dL) [Low] Hematocrit: 33.2% (reference range: 41–53%) [Low] MCV: 70.0 fL (reference range: 80–100 fL) [Low, microcytic] MCH: 23.7 pg (reference range: 27–31 pg) [Low]", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 79, + "total_chunks": 95, 
+ "char_count": 803, + "word_count": 108, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "892406a4-e133-418b-9d11-959332c907a3", + "text": "MCHC: 33.7 g/dL (reference range: 32–36 g/dL) Platelet Count: 156 K/uL (reference range: 150–400 K/uL) RDW: 14.6% (reference range: 11.5–14.5%) Glucose: 279 mg/dL (reference range: 70–100 mg/dL) [Elevated] Sodium: 141 mEq/L (reference range: 136–145 mEq/L) Potassium: 3.6 mEq/L (reference range: 3.5–5.1 mEq/L) Chloride: 99 mEq/L (reference range: 98–107 mEq/L) Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 80, + "total_chunks": 95, + "char_count": 384, + "word_count": 53, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "ee9b897e-9a0e-47cb-a34d-19f91acda218", + "text": "Bicarbonate: 25 mEq/L (reference range: 22–29 mEq/L) Blood Urea Nitrogen: 19 mg/dL (reference range: 7–20 mg/dL) Creatinine: 1.1 mg/dL (reference range: 0.7–1.3 mg/dL) Calcium, Total: 9.3 mg/dL (reference range: 8.5–10.5 mg/dL) Alanine Aminotransferase (ALT): 43 IU/L (reference range: 7–56 IU/L) Aspartate Aminotransferase (AST): 30 IU/L (reference range: 10–40 IU/L) Alkaline Phosphatase: 91 IU/L (reference range: 40–150 IU/L) Total Bilirubin: 0.6 mg/dL (reference range: 0.3–1.2 mg/dL) Lipase: 12 IU/L (reference range: 7–60 IU/L) Step 3: Imaging (Abdominal Ultrasound) Action Input: Abdomen Ultrasound (focused on gallbladder and biliary tree) Observation:\nRight Upper Quadrant Ultrasound: TECHNIQUE: Grayscale and color Doppler ultrasound images of the abdomen. Liver: Homogenous attenuation throughout. 
No focal lesions identified.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 81, + "total_chunks": 95, + "char_count": 838, + "word_count": 110, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "e4be4f10-6fb8-409b-a927-df1436e2d28c", + "text": "Hepatobiliary: Mild intrahepatic and extrahepatic biliary duct dilation noted. Common bile duct measures 12 mm in diameter. Gallbladder contains gallstones without\nwall thickening or surrounding inflammation visible on this initial study. Trace\npericholecystic free fluid noted. Pancreas: Diffuse pancreatic atrophy. No pancreatic duct dilation or peripancreatic\nfat stranding.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 82, + "total_chunks": 95, + "char_count": 377, + "word_count": 48, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "680e35ac-1025-46b8-b91d-da2ad595c185", + "text": "Spleen: Normal size and attenuation, no focal lesions. Kidneys: Bilateral subcentimeter renal hypodensities too small to characterize. A 6.1 × 5.6 cm simple cyst in the interpolar region of the left kidney. No\nhydronephrosis or perinephric abnormality. 
IMPRESSION: Cholelithiasis, common bile duct dilation (12 mm), trace pericholecystic fluid. Recommend correlation with clinical findings and consider advanced\nbiliary imaging if indicated. Step 4: Experience Search Action: Experience Search", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 83, + "total_chunks": 95, + "char_count": 493, + "word_count": 67, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "075f29c3-9f08-4d40-ae02-d3ac8baf853b", + "text": "Action Input: Elderly male with diabetes, acute RUQ pain, leukocytosis, normal\nLFTs, imaging shows acute cholecystitis (gallbladder wall edema, pericholecystic\nfluid) and common bile duct dilation with papillary stenosis from passed stones Experience Pattern: Elderly female with known gallstones and prior biliary intervention (ERCP/stent) presents with acute right upper quadrant pain and tenderness;\nnormal white blood cell count and liver enzymes; ultrasound shows gallstones,\ngallbladder wall thickening, and common bile duct dilation. [Correct: Acute\nCholecystitis — Misdiagnoses: Cholangitis, Biliary Colic, Pancreatitis] Test Ordering Experience: First, perform a Physical Examination focusing on\nabdominal signs. Then, order initial Laboratory Tests: CBC, CMP (including liver\nenzymes and bilirubin), and lipase. If these are normal but suspicion remains, escalate to Laboratory Tests: C-reactive protein (CRP). For imaging, the first and\noften definitive test is Imaging: modality=Ultrasound, region=Abdomen (specifically right upper quadrant). 
Do not escalate to CT or MRCP unless the ultrasound\nis non-diagnostic or complications are suspected. Diagnostic Decision Experience: In a patient with known gallstones, acute RUQ\npain/tenderness, and ultrasound findings of gallstones with gallbladder wall thickening, diagnose acute cholecystitis even if inflammatory markers and LFTs are\nnormal; CBD dilation and pneumobilia in this context are likely chronic sequelae\nof prior ERCP and do not rule out cholecystitis.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 84, + "total_chunks": 95, + "char_count": 1524, + "word_count": 205, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "c10bdef5-16c9-4453-b457-366688533a9e", + "text": "[Additional guidelines omitted for brevity] Continued on next page Step 5: Imaging (MRCP) Action Input: Abdomen MRCP Observation:\nMagnetic Resonance Cholangiopancreatography (MRCP): TECHNIQUE: T1- and T2-weighted multiplanar images of the abdomen acquired\nin a 1.5 T magnet. Intravenous contrast: 9 mL Gadavist. Oral contrast: 1 cc of\nGadavist mixed with 50 cc of water. Lower Thorax: Mild bibasilar atelectasis. No large pleural effusion. Liver: Homogeneous signal intensity without focal lesions. Mild, nonspecific periportal edema. Biliary: The gallbladder is distended and contains numerous calculi. Since prior\nexaminations, there is progressive gallbladder wall edema and pericholecystic fluid\nwith gallbladder wall enhancement. 
No clear pericholecystic hepatic hyperenhancement.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 85, + "total_chunks": 95, + "char_count": 785, + "word_count": 104, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "9f4b5d8f-751c-4da2-bbaf-599aa641729d", + "text": "The common bile duct measures up to 1.2 cm in diameter. It tapers abruptly\nwith a fixed loss in caliber, most consistent with papillary stenosis, likely the result\nof previously passed stones.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 86, + "total_chunks": 95, + "char_count": 192, + "word_count": 32, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "f14bafdc-aaf6-4ba0-9a0e-475a5832e814", + "text": "Pancreas: Normal signal and enhancement. Several subcentimeter cystic lesions. Adrenal Glands: Unremarkable. Large simple cyst in the upper pole of the left kidney,\nunchanged. Minimal nonspecific perinephric fat stranding. Gastrointestinal Tract: No significant abnormalities. Lymph Nodes: No enlarged lymph nodes. Vasculature: Patent aorta and main branches. 
Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 87, + "total_chunks": 95, + "char_count": 382, + "word_count": 50, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "354dc49d-95c9-4520-be9b-15a3c8f2e44c", + "text": "Osseous and Soft Tissue Structures: Mild degenerative changes throughout the\nlower thoracic and lumbar spine. Mild symmetric bilateral gynecomastia noted\nincidentally. IMPRESSION: Acute calculous cholecystitis with gallbladder wall edema, pericholecystic fluid, and wall enhancement. Common bile duct dilation with abrupt\ntapering suggestive of papillary stenosis from previously passed stones.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 88, + "total_chunks": 95, + "char_count": 394, + "word_count": 49, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "42223731-b893-44e6-9c13-9b99427d2c7b", + "text": "Step 6: Guideline Search Action: Guideline Search Action Input: Management of acute cholecystitis with common bile duct dilation,\nsuspected choledocholithiasis or papillary stenosis Observation:\nGuideline 1 (score=0.5146) Title: 2020 World Society of Emergency Surgery updated guidelines for the\ndiagnosis and treatment of acute 
calculus cholecystitis Key points:\n- Common bile duct dilation on ultrasound (>6 mm with gallbladder in situ) is a\nstrong risk factor for common bile duct stones (CBDS), but is not diagnostic alone;\nfurther tests are required.\n- For moderate-risk patients (e.g., CBD dilation + bilirubin 1.8–4 mg/dL),\npreoperative MRCP or endoscopic ultrasound is recommended to diagnose CBDS.\n- For high-risk patients (e.g., visualized CBD stone on ultrasound or ascending\ncholangitis), preoperative ERCP is recommended to diagnose CBDS.\n- An increased common bile duct diameter on ultrasound is an indirect sign but is\nnot sufficient alone to diagnose choledocholithiasis, necessitating further diagnostic\ntests.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 89, + "total_chunks": 95, + "char_count": 1027, + "word_count": 146, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "80d4c345-3ea0-425f-8dfa-a4c136534d92", + "text": "[Additional guidelines omitted for brevity] Step 7: PubMed Search Action: PubMed Search Action Input: acute cholecystitis common bile duct dilation normal liver enzymes Continued on next page", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 90, + "total_chunks": 95, + 
"char_count": 191, + "word_count": 27, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7dd3a33e-e81e-4dd0-9647-bf52ec4b4223", + "text": "Observation:\nArticle 1: Role of liver function tests in symptomatic cholelithiasis. Abstract: Cholelithiasis is a common problem in Western and developing nations\nwith continuously rising incidence. This prospective study evaluated the usefulness\nof routine liver function tests (LFTs) in patients undergoing cholecystectomy for\nsymptomatic cholelithiasis. Out of 124 patients, 87.10% had LFTs within normal\nrange. Most patients with normal LFTs (95.37%) presented with chronic calculous\ncholecystitis. Of those with deranged LFTs, 43.75% presented with acute calculous\ncholecystitis. None had dilated common bile duct on preoperative ultrasound or\nintraoperative findings. Conclusion: Routine LFTs in preoperative assessment of\nuncomplicated symptomatic cholelithiasis usually come out normal and are not\ngood predictive markers for detecting silent common bile duct calculi. Article 2: Endoscopic Retrograde Cholangiopancreatography in Kidney Transplant\nPatients: Results From a Single Center. Abstract: Twenty-one kidney transplant patients underwent 23 ERCP procedures\nfor various biliary indications including choledocholithiasis (28.6%), common bile\nduct dilation with liver enzyme elevations (19%), and cholangitis (23.8%). Findings\nincluded choledocholithiasis in 47.6% and dilated common bile duct in 9.6%. ERCP\nwas safe with no complications such as acute pancreatitis, bleeding, or perforation. 
[Additional guidelines omitted for brevity]", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 91, + "total_chunks": 95, + "char_count": 1449, + "word_count": 188, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "888958fd-1c23-4642-a0bc-b1b6ab71a86a", + "text": "The patient presents with acute right upper quadrant pain, neutrophilia (89.1%\nneutrophils, absolute neutrophil count 6.77 K/uL), and cholelithiasis on imaging. Initial ultrasound demonstrated gallstones with common bile duct dilation (12 mm)\nand trace pericholecystic fluid. MRCP definitively showed progressive gallbladder\nwall edema, pericholecystic fluid, and gallbladder wall enhancement—classic imaging findings for acute cholecystitis. The common bile duct dilation with abrupt\ntapering is consistent with papillary stenosis from previously passed stones. Notably,\nliver function tests remained entirely normal (ALT 43 IU/L, AST 30 IU/L, alkaline\nphosphatase 91 IU/L, bilirubin 0.6 mg/dL), and lipase was normal (12 IU/L), effectively ruling out concurrent pancreatitis or acute cholangitis. The retrieved clinical\nexperiences and guidelines confirm that normal liver enzymes do not exclude acute\ncholecystitis, and that CBD dilation in this setting likely reflects chronic sequelae\nfrom stone passage rather than active choledocholithiasis. 
Final Diagnosis: Acute calculous cholecystitis Ground Truth: Cholecystitis ✓ This trace demonstrates the complete deep clinical research (DCR)\nworkflow with actual laboratory results and rich experience retrieval.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 92, + "total_chunks": 95, + "char_count": 1262, + "word_count": 166, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "7bffc860-80eb-466f-b50f-f06f3a124d9a", + "text": "The agent follows a clinically logical sequence: physical examination first revealing right upper\nquadrant tenderness, followed by comprehensive laboratory evaluation showing relative neutrophilia (89.1% neutrophils, absolute neutrophil count 6.77 K/uL) with liver\nenzymes within normal limits (ALT 43 IU/L, AST 30 IU/L, alkaline phosphatase 91\nIU/L, bilirubin 0.6 mg/dL) and normal lipase (12 IU/L). Initial right upper quadrant ultrasound showed cholelithiasis with common bile duct dilation (12 mm) and\ntrace pericholecystic fluid. The agent escalated to MRCP for more definitive biliary\nassessment, which revealed gallbladder wall thickening and edema, pericholecystic\nfluid, and increased T2 signal—findings consistent with acute calculous cholecystitis. 
The Experience Search retrieved relevant cases from the experience library, providing\nguidance on test-ordering strategies and diagnostic reasoning for similar presentations.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 93, + "total_chunks": 95, + "char_count": 934, + "word_count": 121, + "chunking_strategy": "semantic" + }, + { + "chunk_id": "6685804c-d00c-47ff-9288-a87733448cb2", + "text": "The retrieved experiences noted that acute cholecystitis can present with normal\nliver enzymes and that CBD dilation in the absence of visualized stones reduces\nthe likelihood of active choledocholithiasis. The Guideline Search retrieved the 2020\nWorld Society of Emergency Surgery guidelines on acute calculous cholecystitis, which\ninformed the diagnostic reasoning regarding CBD dilation and the appropriateness of\nMRCP for moderate-risk patients. The PubMed Search provided supporting evidence\nregarding the prevalence of normal liver function tests in acute cholecystitis. 
The final\ndiagnosis of acute calculous cholecystitis was correct, matching the ground truth label.", + "paper_id": "2603.10677", + "title": "Emulating Clinician Cognition via Self-Evolving Deep Clinical Research", + "authors": [ + "Ruiyang Ren", + "Yuhao Wang", + "Yunsen Liang", + "Lan Luo", + "Jing Liu", + "Haifeng Wang", + "Cong Feng", + "Yinan Zhang", + "Chunyan Miao", + "Ji-Rong Wen", + "Wayne Xin Zhao" + ], + "published_date": "2026-03-11", + "primary_category": "", + "arxiv_url": "http://arxiv.org/abs/2603.10677v1", + "chunk_index": 94, + "total_chunks": 95, + "char_count": 675, + "word_count": 92, + "chunking_strategy": "semantic" + } +] \ No newline at end of file