//! ANSI / VT100 Parser — Paul Williams state machine with SIMD fast-path. //! //! Uses `memchr` for ESC byte scanning to skip large text runs, //! then dispatches through a compact state machine. use arrayvec::ArrayVec; use memchr::memchr; const MAX_PARAMS: usize = 32; const MAX_OSC_LEN: usize = 8192; const MAX_DCS_LEN: usize = 4096; const MAX_INTERMEDIATES: usize = 4; /// ANSI parser state machine. #[derive(Clone, Debug, Default)] pub struct Parser { state: State, params: ArrayVec, param: u16, intermediates: ArrayVec, osc_raw: ArrayVec, osc_params: ArrayVec<(usize, usize), 8>, dcs_raw: ArrayVec, ignoring: bool, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] enum State { #[default] Ground, Utf8(u8, u32), Escape, EscapeIntermediate, CsiEntry, CsiParam, CsiIntermediate, CsiIgnore, OscString, DcsEntry, DcsParam, DcsIntermediate, DcsPassthrough, DcsIgnore, SosPmApcString, } /// Trait for handling parsed terminal actions. pub trait Perform { fn print(&mut self, ch: char); fn execute(&mut self, byte: u8); fn csi_dispatch( &mut self, params: &[u16], intermediates: &[u8], ignore: bool, action: char, ); fn esc_dispatch(&mut self, intermediates: &[u8], ignore: bool, byte: u8); fn osc_dispatch(&mut self, params: &[&[u8]]); fn dcs_hook( &mut self, params: &[u16], intermediates: &[u8], ignore: bool, action: char, ); fn dcs_put(&mut self, byte: u8); fn dcs_unhook(&mut self); } impl Parser { pub fn new() -> Self { Self::default() } #[inline] pub fn advance(&mut self, performer: &mut P, bytes: &[u8]) { let mut i = 0; while i < bytes.len() { if matches!(self.state, State::Ground) { match memchr(0x1B, &bytes[i..]) { Some(offset) => { if offset > 0 { self.print_bulk(performer, &bytes[i..i + offset]); } i += offset; self.state = State::Escape; i += 1; continue; } None => { self.print_bulk(performer, &bytes[i..]); break; } } } i += self.advance_one(performer, bytes[i]); } } #[inline(always)] fn print_bulk(&mut self, performer: &mut P, text: &[u8]) { let mut i = 0; while i < text.len() { let b = text[i]; if b.is_ascii() { if b < 0x20 { performer.execute(b); } else { performer.print(b as char); } i += 1; } else { let len = utf8_len(b); if i + len <= text.len() { if let Some(ch) = decode_utf8(&text[i..i + len]) { performer.print(ch); } else { performer.print('\u{FFFD}'); } i += len; } else { self.state = State::Utf8((len - 1) as u8, utf8_acc(b)); return; } } } } #[inline(always)] fn advance_one(&mut self, performer: &mut P, byte: u8) -> usize { match self.state { State::Ground => self.state_ground(performer, byte), State::Utf8(rem, acc) => self.state_utf8(performer, byte, rem, acc), State::Escape => self.state_escape(performer, byte), State::EscapeIntermediate => self.state_escape_intermediate(performer, byte), State::CsiEntry => self.state_csi_entry(performer, byte), State::CsiParam => self.state_csi_param(performer, byte), State::CsiIntermediate => self.state_csi_intermediate(performer, byte), State::CsiIgnore => self.state_csi_ignore(performer, byte), State::OscString => self.state_osc_string(performer, byte), State::DcsEntry => self.state_dcs_entry(performer, byte), State::DcsParam => self.state_dcs_param(performer, byte), State::DcsIntermediate => self.state_dcs_intermediate(performer, byte), State::DcsPassthrough => self.state_dcs_passthrough(performer, byte), State::DcsIgnore => self.state_dcs_ignore(performer, byte), State::SosPmApcString => self.state_sos_pm_apc_string(performer, byte), } } #[inline(always)] fn state_ground(&mut self, performer: &mut P, byte: u8) -> usize { if byte.is_ascii_control() { if byte != 0x1B { performer.execute(byte); } return 1; } if byte.is_ascii() { performer.print(byte as char); return 1; } let len = utf8_len(byte); let acc = utf8_acc(byte); if len == 1 { performer.print('\u{FFFD}'); return 1; } self.state = State::Utf8((len - 1) as u8, acc); 1 } #[inline(always)] fn state_utf8(&mut self, performer: &mut P, byte: u8, rem: u8, acc: u32) -> usize { let acc = (acc << 6) | (byte as u32 & 0x3F); let rem = rem - 1; if rem == 0 { if let Some(ch) = char::from_u32(acc) { performer.print(ch); } else { performer.print('\u{FFFD}'); } self.state = State::Ground; } else { self.state = State::Utf8(rem, acc); } 1 } #[inline(always)] fn state_escape(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x5B => { self.state = State::CsiEntry; self.clear_params(); } 0x5D => { self.state = State::OscString; self.osc_raw.clear(); self.osc_params.clear(); } 0x50 => { self.state = State::DcsEntry; self.clear_params(); } 0x58 | 0x5E | 0x5F => { self.state = State::SosPmApcString; } 0x20..=0x2F => { self.intermediates.push(byte); self.state = State::EscapeIntermediate; } 0x30..=0x7E => { performer.esc_dispatch(&self.intermediates, self.ignoring, byte); self.reset(); } _ => { self.reset(); } } 1 } #[inline(always)] fn state_escape_intermediate(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x20..=0x2F => self.intermediates.push(byte), 0x30..=0x7E => { performer.esc_dispatch(&self.intermediates, self.ignoring, byte); self.reset(); } _ => self.reset(), } 1 } #[inline(always)] fn state_csi_entry(&mut self, _performer: &mut P, byte: u8) -> usize { match byte { 0x30..=0x39 | 0x3B => { self.state = State::CsiParam; self.process_csi_byte(byte); } 0x3A => { self.ignoring = true; self.state = State::CsiParam; } 0x3C..=0x3F => { self.intermediates.push(byte); self.state = State::CsiParam; } 0x20..=0x2F => { self.intermediates.push(byte); self.state = State::CsiIntermediate; } 0x40..=0x7E => { self.dispatch_csi(_performer, byte); } _ => { self.state = State::CsiIgnore; } } 1 } #[inline(always)] fn state_csi_param(&mut self, _performer: &mut P, byte: u8) -> usize { match byte { 0x30..=0x39 | 0x3B => self.process_csi_byte(byte), 0x3A => self.ignoring = true, 0x20..=0x2F => { self.intermediates.push(byte); self.state = State::CsiIntermediate; } 0x40..=0x7E => { self.dispatch_csi(_performer, byte); } _ => { self.state = State::CsiIgnore; } } 1 } #[inline(always)] fn state_csi_intermediate(&mut self, _performer: &mut P, byte: u8) -> usize { match byte { 0x20..=0x2F => self.intermediates.push(byte), 0x40..=0x7E => { self.dispatch_csi(_performer, byte); } _ => { self.state = State::CsiIgnore; } } 1 } #[inline(always)] fn state_csi_ignore(&mut self, _performer: &mut P, byte: u8) -> usize { if (0x40..=0x7E).contains(&byte) { self.reset(); } 1 } #[inline(always)] fn process_csi_byte(&mut self, byte: u8) { if byte == 0x3B { self.push_param(); } else { let digit = (byte - 0x30) as u16; self.param = self.param.saturating_mul(10).saturating_add(digit); } } #[inline(always)] fn dispatch_csi(&mut self, performer: &mut P, byte: u8) { self.push_param(); let params: ArrayVec = self.params.clone(); let intermediates: ArrayVec = self.intermediates.clone(); performer.csi_dispatch(¶ms, &intermediates, self.ignoring, byte as char); self.reset(); } #[inline(always)] fn state_osc_string(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x07 => { self.dispatch_osc(performer); self.reset(); } 0x1B => { self.dispatch_osc(performer); self.reset(); } 0x9C => { self.dispatch_osc(performer); self.reset(); } b => { if self.osc_raw.len() < MAX_OSC_LEN { self.osc_raw.push(b); } } } 1 } #[inline(always)] fn dispatch_osc(&mut self, performer: &mut P) { if self.osc_raw.is_empty() { return; } let mut params: Vec<&[u8]> = Vec::new(); let mut start = 0; for (i, &b) in self.osc_raw.iter().enumerate() { if b == 0x3B { params.push(&self.osc_raw[start..i]); start = i + 1; } } if start < self.osc_raw.len() { params.push(&self.osc_raw[start..]); } performer.osc_dispatch(¶ms); } #[inline(always)] fn state_dcs_entry(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x30..=0x39 | 0x3B | 0x3C..=0x3F => { self.state = State::DcsParam; self.process_csi_byte(byte); } 0x20..=0x2F => { self.intermediates.push(byte); self.state = State::DcsIntermediate; } 0x40..=0x7E => { self.state = State::DcsPassthrough; self.dcs_hook(performer, byte); } _ => { self.state = State::DcsIgnore; } } 1 } #[inline(always)] fn state_dcs_param(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x30..=0x39 | 0x3B => self.process_csi_byte(byte), 0x20..=0x2F => { self.intermediates.push(byte); self.state = State::DcsIntermediate; } 0x40..=0x7E => { self.state = State::DcsPassthrough; self.dcs_hook(performer, byte); } _ => { self.state = State::DcsIgnore; } } 1 } #[inline(always)] fn state_dcs_intermediate(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x20..=0x2F => self.intermediates.push(byte), 0x40..=0x7E => { self.state = State::DcsPassthrough; self.dcs_hook(performer, byte); } _ => { self.state = State::DcsIgnore; } } 1 } #[inline(always)] fn state_dcs_passthrough(&mut self, performer: &mut P, byte: u8) -> usize { match byte { 0x1B => {} 0x9C => { performer.dcs_unhook(); self.reset(); } b => { if self.dcs_raw.len() < MAX_DCS_LEN { self.dcs_raw.push(b); performer.dcs_put(b); } } } 1 } #[inline(always)] fn state_dcs_ignore(&mut self, _performer: &mut P, byte: u8) -> usize { if byte == 0x9C || byte == 0x1B { self.reset(); } 1 } #[inline(always)] fn dcs_hook(&mut self, performer: &mut P, byte: u8) { self.push_param(); let params: ArrayVec = self.params.clone(); let intermediates: ArrayVec = self.intermediates.clone(); performer.dcs_hook(¶ms, &intermediates, self.ignoring, byte as char); } #[inline(always)] fn state_sos_pm_apc_string(&mut self, _performer: &mut P, byte: u8) -> usize { if byte == 0x9C || byte == 0x1B { self.reset(); } 1 } #[inline(always)] fn clear_params(&mut self) { self.params.clear(); self.param = 0; } #[inline(always)] fn push_param(&mut self) { if self.params.len() < MAX_PARAMS { self.params.push(self.param); } self.param = 0; } #[inline(always)] fn reset(&mut self) { self.state = State::Ground; self.clear_params(); self.intermediates.clear(); self.ignoring = false; } } #[inline(always)] fn utf8_len(byte: u8) -> usize { match byte.leading_ones() { 0 => 1, 2 => 2, 3 => 3, 4 => 4, _ => 1, } } #[inline(always)] fn utf8_acc(byte: u8) -> u32 { match byte.leading_ones() { 0 => byte as u32, 2 => (byte & 0x1F) as u32, 3 => (byte & 0x0F) as u32, 4 => (byte & 0x07) as u32, _ => byte as u32, } } #[inline(always)] fn decode_utf8(bytes: &[u8]) -> Option { let first = bytes[0]; let len = utf8_len(first); if bytes.len() < len { return None; } let mut acc = utf8_acc(first); for i in 1..len { acc = (acc << 6) | ((bytes[i] & 0x3F) as u32); } char::from_u32(acc) }