
This is an alternative to ee6459d6521cf6a4c2e08b6e13ce3c6ce5d55ed0. That is, it fixes the issue that affects the very long type names in https://docs.rs/async-stripe/0.31.0/stripe/index.html#structs. This is, necessarily, a pile of nasty heuristics. We need to balance a few issues: - Sometimes, there's no real word break. For example, `BTreeMap` should be `BTree<wbr>Map`, not `B<wbr>Tree<wbr>Map`. - Sometimes, there's a legit word break, but the name is tiny and the HTML overhead isn't worth it. For example, if we're typesetting `TyCtx`, writing `Ty<wbr>Ctx` would have an HTML overhead of 50%. Line breaking inside it makes no sense.
120 lines
4.2 KiB
Rust
120 lines
4.2 KiB
Rust
//! HTML escaping.
|
|
//!
|
|
//! This module contains one unit struct, which can be used to HTML-escape a
|
|
//! string of text (for use in a format string).
|
|
|
|
use std::fmt;
|
|
|
|
use unicode_segmentation::UnicodeSegmentation;
|
|
|
|
/// Wrapper struct which will emit the HTML-escaped version of the contained
|
|
/// string when passed to a format string.
|
|
pub(crate) struct Escape<'a>(pub &'a str);
|
|
|
|
impl<'a> fmt::Display for Escape<'a> {
|
|
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
// Because the internet is always right, turns out there's not that many
|
|
// characters to escape: http://stackoverflow.com/questions/7381974
|
|
let Escape(s) = *self;
|
|
let pile_o_bits = s;
|
|
let mut last = 0;
|
|
for (i, ch) in s.char_indices() {
|
|
let s = match ch {
|
|
'>' => ">",
|
|
'<' => "<",
|
|
'&' => "&",
|
|
'\'' => "'",
|
|
'"' => """,
|
|
_ => continue,
|
|
};
|
|
fmt.write_str(&pile_o_bits[last..i])?;
|
|
fmt.write_str(s)?;
|
|
// NOTE: we only expect single byte characters here - which is fine as long as we
|
|
// only match single byte characters
|
|
last = i + 1;
|
|
}
|
|
|
|
if last < s.len() {
|
|
fmt.write_str(&pile_o_bits[last..])?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Wrapper struct which will emit the HTML-escaped version of the contained
|
|
/// string when passed to a format string.
|
|
///
|
|
/// This is only safe to use for text nodes. If you need your output to be
|
|
/// safely contained in an attribute, use [`Escape`]. If you don't know the
|
|
/// difference, use [`Escape`].
|
|
pub(crate) struct EscapeBodyText<'a>(pub &'a str);
|
|
|
|
impl<'a> fmt::Display for EscapeBodyText<'a> {
|
|
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
// Because the internet is always right, turns out there's not that many
|
|
// characters to escape: http://stackoverflow.com/questions/7381974
|
|
let EscapeBodyText(s) = *self;
|
|
let pile_o_bits = s;
|
|
let mut last = 0;
|
|
for (i, ch) in s.char_indices() {
|
|
let s = match ch {
|
|
'>' => ">",
|
|
'<' => "<",
|
|
'&' => "&",
|
|
_ => continue,
|
|
};
|
|
fmt.write_str(&pile_o_bits[last..i])?;
|
|
fmt.write_str(s)?;
|
|
// NOTE: we only expect single byte characters here - which is fine as long as we
|
|
// only match single byte characters
|
|
last = i + 1;
|
|
}
|
|
|
|
if last < s.len() {
|
|
fmt.write_str(&pile_o_bits[last..])?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Wrapper struct which will emit the HTML-escaped version of the contained
|
|
/// string when passed to a format string. This function also word-breaks
|
|
/// CamelCase and snake_case word names.
|
|
///
|
|
/// This is only safe to use for text nodes. If you need your output to be
|
|
/// safely contained in an attribute, use [`Escape`]. If you don't know the
|
|
/// difference, use [`Escape`].
|
|
pub(crate) struct EscapeBodyTextWithWbr<'a>(pub &'a str);
|
|
|
|
impl<'a> fmt::Display for EscapeBodyTextWithWbr<'a> {
|
|
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
let EscapeBodyTextWithWbr(text) = *self;
|
|
if text.len() < 8 {
|
|
return EscapeBodyText(text).fmt(fmt);
|
|
}
|
|
let mut last = 0;
|
|
let mut it = text.grapheme_indices(true).peekable();
|
|
let _ = it.next(); // don't insert wbr before first char
|
|
while let Some((i, s)) = it.next() {
|
|
let pk = it.peek();
|
|
let is_uppercase = || s.chars().any(|c| c.is_uppercase());
|
|
let next_is_uppercase =
|
|
|| pk.map_or(true, |(_, t)| t.chars().any(|c| c.is_uppercase()));
|
|
let next_is_underscore = || pk.map_or(true, |(_, t)| t.contains('_'));
|
|
if (i - last > 3 && is_uppercase() && !next_is_uppercase())
|
|
|| (s.contains('_') && !next_is_underscore())
|
|
{
|
|
EscapeBodyText(&text[last..i]).fmt(fmt)?;
|
|
fmt.write_str("<wbr>")?;
|
|
last = i;
|
|
}
|
|
}
|
|
if last < text.len() {
|
|
EscapeBodyText(&text[last..]).fmt(fmt)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests;
|