jj/cli/src/text_util.rs
2024-03-23 10:43:38 +09:00

633 lines
22 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2022-2023 The Jujutsu Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow;
use std::{cmp, io};
use unicode_width::UnicodeWidthChar as _;
use crate::formatter::{FormatRecorder, Formatter};
/// Returns `s` as an owned `String`, appending a trailing `'\n'` when the text
/// is non-empty and does not already end with one.
///
/// Empty input is returned unchanged (no newline is added).
pub fn complete_newline(s: impl Into<String>) -> String {
    let mut text: String = s.into();
    if text.is_empty() || text.ends_with('\n') {
        return text;
    }
    text.push('\n');
    text
}
/// Splits `email` at the first `'@'` into `(username, Some(rest))`.
///
/// When there is no `'@'`, the whole input is returned as the username and
/// the second element is `None`.
pub fn split_email(email: &str) -> (&str, Option<&str>) {
    match email.split_once('@') {
        Some((username, domain)) => (username, Some(domain)),
        None => (email, None),
    }
}
/// Fits `text` into `max_width` display columns by dropping characters from
/// its start, prepending `ellipsis` whenever something had to be dropped.
///
/// Returns the resulting string together with its display width. The result
/// (`ellipsis` included) is never wider than `max_width`; if even `ellipsis`
/// alone is too wide, only the tail of `ellipsis` that fits is returned.
pub fn elide_start<'a>(
    text: &'a str,
    ellipsis: &'a str,
    max_width: usize,
) -> (Cow<'a, str>, usize) {
    // Fast path: the whole text fits, so it is borrowed untouched.
    let (tail_start, tail_width) = truncate_start_pos(text, max_width);
    if tail_start == 0 {
        return (Cow::Borrowed(text), tail_width);
    }

    // The ellipsis itself may not fit; then only its trailing part is emitted.
    let (marker_start, marker_width) = truncate_start_pos(ellipsis, max_width);
    if marker_start != 0 {
        let marker_tail = trim_start_zero_width_chars(&ellipsis[marker_start..]);
        return (Cow::Borrowed(marker_tail), marker_width);
    }

    // Make room for the ellipsis by skipping more leading characters of the
    // already truncated tail.
    let tail = &text[tail_start..];
    let room_for_text = max_width - marker_width;
    let excess_width = tail_width.saturating_sub(room_for_text);
    let (skip, skipped_width) = skip_start_pos(tail, excess_width);
    let kept = trim_start_zero_width_chars(&tail[skip..]);
    let total_width = marker_width + (tail_width - skipped_width);
    assert!(total_width <= max_width);

    let mut elided = String::with_capacity(ellipsis.len() + kept.len());
    elided.push_str(ellipsis);
    elided.push_str(kept);
    (Cow::Owned(elided), total_width)
}
/// Finds where the widest suffix of `text` that fits in `max_width` display
/// columns begins, returning `(start_index, width)` of that suffix.
///
/// Characters without a defined width count as 0 columns, so the suffix may
/// begin with 0-width decomposed characters.
fn truncate_start_pos(text: &str, max_width: usize) -> (usize, usize) {
    let mut kept_width = 0;
    // Byte index of the earliest character accepted so far.
    let mut suffix_start = text.len();
    for (index, ch) in text.char_indices().rev() {
        let candidate_width = kept_width + ch.width().unwrap_or(0);
        if candidate_width > max_width {
            return (suffix_start, kept_width);
        }
        kept_width = candidate_width;
        suffix_start = index;
    }
    // Every character was accepted: the whole text fits.
    (0, kept_width)
}
/// Consumes leading characters of `text` until at least `width` display
/// columns have been skipped, returning `(start_index, skipped_width)`.
///
/// `skipped_width` can be larger than `width` when `width` falls in the middle
/// of a wide character, and smaller when `text` is narrower than `width`.
///
/// The remaining string may have 0-width decomposed characters at start.
fn skip_start_pos(text: &str, width: usize) -> (usize, usize) {
    let mut remaining = text.chars();
    let mut skipped_width = 0;
    while skipped_width < width {
        match remaining.next() {
            Some(ch) => skipped_width += ch.width().unwrap_or(0),
            None => break,
        }
    }
    // Whatever the iterator has not yielded yet is the part that is kept.
    let start_index = text.len() - remaining.as_str().len();
    (start_index, skipped_width)
}
/// Strips every leading character whose display width is 0 (characters with
/// no defined width are treated as 0-width as well).
fn trim_start_zero_width_chars(text: &str) -> &str {
    let is_zero_width = |ch: char| ch.width().map_or(true, |columns| columns == 0);
    text.trim_start_matches(is_zero_width)
}
/// Replays `recorded_content` into `formatter`, calling `write_prefix` at the
/// start of each line so that every line gets indented. Labels are preserved.
///
/// Lines consisting solely of `"\n"` are written without the prefix.
pub fn write_indented(
    formatter: &mut dyn Formatter,
    recorded_content: &FormatRecorder,
    mut write_prefix: impl FnMut(&mut dyn Formatter) -> io::Result<()>,
) -> io::Result<()> {
    let bytes = recorded_content.data();
    // Carried across replayed ranges: a line may span several of them.
    let mut at_line_start = true;
    recorded_content.replay_with(formatter, |formatter, range| {
        for line in bytes[range].split_inclusive(|&byte| byte == b'\n') {
            let is_blank = matches!(line, [b'\n']);
            if at_line_start && !is_blank {
                // Prefix inherits the current labels. This is implementation detail
                // and may be fixed later.
                write_prefix(formatter)?;
            }
            formatter.write_all(line)?;
            at_line_start = line.last() == Some(&b'\n');
        }
        Ok(())
    })
}
/// Word with trailing whitespace.
///
/// Borrowed view into a line of bytes, used as the unit of line wrapping; it
/// implements `textwrap::core::Fragment`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct ByteFragment<'a> {
/// Bytes of the word itself, without the trailing whitespace.
word: &'a [u8],
/// Number of whitespace bytes that follow the word.
whitespace_len: usize,
/// Display width of `word`, as computed by `ByteFragment::new()`.
word_width: usize,
}
impl<'a> ByteFragment<'a> {
    /// Builds a fragment for `word` followed by `whitespace_len` whitespace
    /// bytes, computing the display width of the word up front.
    fn new(word: &'a [u8], whitespace_len: usize) -> Self {
        // We don't care about the width of non-UTF-8 bytes, but should not panic.
        let lossy_word = String::from_utf8_lossy(word);
        Self {
            word,
            whitespace_len,
            word_width: textwrap::core::display_width(&lossy_word),
        }
    }

    /// Byte offset of this fragment's word within `text`; `word` must be a
    /// sub-slice of `text` (see `byte_offset_from()`).
    fn offset_in(&self, text: &[u8]) -> usize {
        let word = self.word;
        byte_offset_from(text, word)
    }
}
/// Lets the `textwrap` wrapping algorithms measure a `ByteFragment`.
impl<'a> textwrap::core::Fragment for ByteFragment<'a> {
    /// Display width of the word itself.
    fn width(&self) -> f64 {
        let columns = self.word_width;
        columns as f64
    }

    /// Width of the whitespace after the word: one column per whitespace byte.
    fn whitespace_width(&self) -> f64 {
        let columns = self.whitespace_len;
        columns as f64
    }

    /// No penalty is added when a line is broken after this fragment.
    fn penalty_width(&self) -> f64 {
        0.0
    }
}
/// Returns the byte offset of `inner` from the start of `outer`.
///
/// # Panics
///
/// Panics if `inner` does not lie entirely within the memory range of `outer`.
fn byte_offset_from(outer: &[u8], inner: &[u8]) -> usize {
    let outer_range = outer.as_ptr_range();
    let inner_range = inner.as_ptr_range();
    assert!(outer_range.start <= inner_range.start);
    assert!(inner_range.end <= outer_range.end);
    inner_range.start as usize - outer_range.start as usize
}
/// Splits `line` into words, each paired with the run of `b' '` bytes that
/// follows it.
///
/// Only the space byte is treated as whitespace. A line starting with spaces
/// yields an empty first word; a trailing word without following spaces gets
/// `whitespace_len == 0`.
fn split_byte_line_to_words(line: &[u8]) -> Vec<ByteFragment<'_>> {
    let mut fragments = Vec::new();
    let mut rest = line;
    while let Some(word_len) = rest.iter().position(|&byte| byte == b' ') {
        // `after_word` starts at the first space following the word.
        let (word, after_word) = rest.split_at(word_len);
        let space_len = after_word.iter().take_while(|&&byte| byte == b' ').count();
        fragments.push(ByteFragment::new(word, space_len));
        rest = &after_word[space_len..];
    }
    if !rest.is_empty() {
        fragments.push(ByteFragment::new(rest, 0));
    }
    fragments
}
/// Wraps `text` at the given display `width` and returns the resulting lines
/// (without the "\n" terminators.)
///
/// Newline characters already present in `text` are always kept as line
/// breaks. For `str` content, `textwrap::refill()` can be used to refill a
/// pre-formatted text instead.
///
/// Every returned line is a sub-slice of `text`, including empty lines.
///
/// The wrapping logic is more restricted than the default of the `textwrap`:
/// there is no hyphenation and no unicode line breaking. The display width is
/// calculated based on unicode property in the same manner as
/// `textwrap::wrap()`.
pub fn wrap_bytes(text: &[u8], width: usize) -> Vec<&[u8]> {
    let mut wrapped_lines = Vec::new();
    for line in text.split(|&byte| byte == b'\n') {
        let fragments = split_byte_line_to_words(line);
        let groups = textwrap::wrap_algorithms::wrap_first_fit(&fragments, &[width as f64]);
        for group in groups {
            let piece = match (group.first(), group.last()) {
                // Span from the first word to the end of the last word; the
                // whitespace trailing the last word is left out.
                (Some(first), Some(last)) => {
                    let start = first.offset_in(line);
                    let end = last.offset_in(line) + last.word.len();
                    &line[start..end]
                }
                // Empty line
                _ => &line[..0],
            };
            wrapped_lines.push(piece);
        }
    }
    wrapped_lines
}
/// Replays `recorded_content` into `formatter`, wrapped at the given display
/// `width`, preserving labels.
///
/// `textwrap::wrap()` can also process text containing ANSI escape sequences.
/// The main difference is that this function will reset the style for each line
/// and recreate it on the following line if the output `formatter` is
/// a `ColorFormatter`.
pub fn write_wrapped(
    formatter: &mut dyn Formatter,
    recorded_content: &FormatRecorder,
    width: usize,
) -> io::Result<()> {
    let bytes = recorded_content.data();
    // Byte ranges (within `bytes`) of each wrapped line.
    let mut wrapped_lines = wrap_bytes(bytes, width)
        .into_iter()
        .map(|line| {
            let offset = byte_offset_from(bytes, line);
            offset..offset + line.len()
        })
        .peekable();
    // The recorded data ranges are contiguous, and the line ranges are increasing
    // sequence (with some holes.) Both ranges should start from data[0].
    recorded_content.replay_with(formatter, |formatter, chunk| {
        while let Some(current_line) = wrapped_lines.peek() {
            let (line_start, line_end) = (current_line.start, current_line.end);
            // Write only the part of the current line covered by this chunk.
            let overlap_start = cmp::max(chunk.start, line_start);
            let overlap_end = cmp::min(chunk.end, line_end);
            if overlap_start < overlap_end {
                formatter.write_all(&bytes[overlap_start..overlap_end])?;
            }
            if chunk.end <= line_end {
                break; // No more lines in this data range
            }
            let _ = wrapped_lines.next();
            if wrapped_lines.peek().is_some() {
                writeln!(formatter)?; // Not the last line
            }
        }
        Ok(())
    })
}
#[cfg(test)]
mod tests {
use std::io::Write as _;
use super::*;
use crate::formatter::{ColorFormatter, PlainTextFormatter};
/// Runs `write` against a `ColorFormatter` configured with the "cyan" and
/// "red" colors, and returns everything it emitted as a `String`.
fn format_colored(write: impl FnOnce(&mut dyn Formatter) -> io::Result<()>) -> String {
    let builder = config::Config::builder()
        .set_override("colors.cyan", "cyan")
        .unwrap()
        .set_override("colors.red", "red")
        .unwrap();
    let config = builder.build().unwrap();
    let mut buffer = Vec::new();
    {
        // The formatter is dropped at the end of this scope, before the buffer
        // is read back.
        let mut formatter = ColorFormatter::for_config(&mut buffer, &config).unwrap();
        write(&mut formatter).unwrap();
    }
    String::from_utf8(buffer).unwrap()
}
/// Runs `write` against a `PlainTextFormatter` and returns everything it
/// emitted as a `String`.
fn format_plain_text(write: impl FnOnce(&mut dyn Formatter) -> io::Result<()>) -> String {
    let mut buffer = Vec::new();
    {
        let mut formatter = PlainTextFormatter::new(&mut buffer);
        write(&mut formatter).unwrap();
    }
    String::from_utf8(buffer).unwrap()
}
/// Exercises `elide_start()` with ASCII, double-width, multi-byte, and
/// decomposed (combining) characters at the truncation boundary.
#[test]
fn test_elide_start() {
    // Empty string
    assert_eq!(elide_start("", "", 1), ("".into(), 0));

    // Basic truncation
    assert_eq!(elide_start("abcdef", "", 6), ("abcdef".into(), 6));
    assert_eq!(elide_start("abcdef", "", 5), ("bcdef".into(), 5));
    assert_eq!(elide_start("abcdef", "", 1), ("f".into(), 1));
    assert_eq!(elide_start("abcdef", "", 0), ("".into(), 0));
    assert_eq!(elide_start("abcdef", "-=~", 6), ("abcdef".into(), 6));
    assert_eq!(elide_start("abcdef", "-=~", 5), ("-=~ef".into(), 5));
    assert_eq!(elide_start("abcdef", "-=~", 4), ("-=~f".into(), 4));
    assert_eq!(elide_start("abcdef", "-=~", 3), ("-=~".into(), 3));
    assert_eq!(elide_start("abcdef", "-=~", 2), ("=~".into(), 2));
    assert_eq!(elide_start("abcdef", "-=~", 1), ("~".into(), 1));
    assert_eq!(elide_start("abcdef", "-=~", 0), ("".into(), 0));

    // East Asian characters (char.width() == 2)
    assert_eq!(elide_start("一二三", "", 6), ("一二三".into(), 6));
    assert_eq!(elide_start("一二三", "", 5), ("二三".into(), 4));
    assert_eq!(elide_start("一二三", "", 4), ("二三".into(), 4));
    assert_eq!(elide_start("一二三", "", 1), ("".into(), 0));
    assert_eq!(elide_start("一二三", "-=~", 6), ("一二三".into(), 6));
    assert_eq!(elide_start("一二三", "-=~", 5), ("-=~三".into(), 5));
    assert_eq!(elide_start("一二三", "-=~", 4), ("-=~".into(), 3));
    // Double-width ellipsis: the expected "略" prefix can only come from the
    // ellipsis argument, so it must be passed explicitly.
    assert_eq!(elide_start("一二三", "略", 6), ("一二三".into(), 6));
    assert_eq!(elide_start("一二三", "略", 5), ("略三".into(), 4));
    assert_eq!(elide_start("一二三", "略", 4), ("略三".into(), 4));
    assert_eq!(elide_start("一二三", "略", 2), ("略".into(), 2));
    assert_eq!(elide_start("一二三", "略", 1), ("".into(), 0));
    assert_eq!(elide_start("一二三", ".", 5), (".二三".into(), 5));
    assert_eq!(elide_start("一二三", ".", 4), (".三".into(), 3));
    assert_eq!(elide_start("一二三", "略.", 5), ("略.三".into(), 5));
    assert_eq!(elide_start("一二三", "略.", 4), ("略.".into(), 3));

    // Multi-byte character at boundary
    assert_eq!(elide_start("àbcdè", "", 5), ("àbcdè".into(), 5));
    assert_eq!(elide_start("àbcdè", "", 4), ("bcdè".into(), 4));
    assert_eq!(elide_start("àbcdè", "", 1), ("è".into(), 1));
    assert_eq!(elide_start("àbcdè", "", 0), ("".into(), 0));
    assert_eq!(elide_start("àbcdè", "ÀÇÈ", 4), ("ÀÇÈè".into(), 4));
    assert_eq!(elide_start("àbcdè", "ÀÇÈ", 3), ("ÀÇÈ".into(), 3));
    assert_eq!(elide_start("àbcdè", "ÀÇÈ", 2), ("ÇÈ".into(), 2));

    // Decomposed character at boundary
    assert_eq!(
        elide_start("a\u{300}bcde\u{300}", "", 5),
        ("a\u{300}bcde\u{300}".into(), 5)
    );
    assert_eq!(
        elide_start("a\u{300}bcde\u{300}", "", 4),
        ("bcde\u{300}".into(), 4)
    );
    assert_eq!(
        elide_start("a\u{300}bcde\u{300}", "", 1),
        ("e\u{300}".into(), 1)
    );
    assert_eq!(elide_start("a\u{300}bcde\u{300}", "", 0), ("".into(), 0));
    assert_eq!(
        elide_start("a\u{300}bcde\u{300}", "A\u{300}CE\u{300}", 4),
        ("A\u{300}CE\u{300}e\u{300}".into(), 4)
    );
    assert_eq!(
        elide_start("a\u{300}bcde\u{300}", "A\u{300}CE\u{300}", 3),
        ("A\u{300}CE\u{300}".into(), 3)
    );
    assert_eq!(
        elide_start("a\u{300}bcde\u{300}", "A\u{300}CE\u{300}", 2),
        ("CE\u{300}".into(), 2)
    );
}
/// Checks how `split_byte_line_to_words()` pairs each word with the run of
/// spaces that follows it, including leading and trailing space runs.
#[test]
fn test_split_byte_line_to_words() {
    assert_eq!(split_byte_line_to_words(b""), vec![]);
    assert_eq!(
        split_byte_line_to_words(b"foo"),
        vec![ByteFragment {
            word: b"foo",
            whitespace_len: 0,
            word_width: 3
        }],
    );
    // Two leading spaces become an empty word followed by 2 whitespace bytes.
    assert_eq!(
        split_byte_line_to_words(b"  foo"),
        vec![
            ByteFragment {
                word: b"",
                whitespace_len: 2,
                word_width: 0
            },
            ByteFragment {
                word: b"foo",
                whitespace_len: 0,
                word_width: 3
            },
        ],
    );
    // Two trailing spaces are attached to the preceding word.
    assert_eq!(
        split_byte_line_to_words(b"foo  "),
        vec![ByteFragment {
            word: b"foo",
            whitespace_len: 2,
            word_width: 3
        }],
    );
    // Mixed single and double spaces between words.
    assert_eq!(
        split_byte_line_to_words(b"a b  foo bar "),
        vec![
            ByteFragment {
                word: b"a",
                whitespace_len: 1,
                word_width: 1
            },
            ByteFragment {
                word: b"b",
                whitespace_len: 2,
                word_width: 1
            },
            ByteFragment {
                word: b"foo",
                whitespace_len: 1,
                word_width: 3,
            },
            ByteFragment {
                word: b"bar",
                whitespace_len: 1,
                word_width: 3,
            },
        ],
    );
}
/// Checks line splitting of `wrap_bytes()`: basic wrapping, empty input,
/// whitespace runs, existing newlines, overflowing words, and invalid UTF-8.
#[test]
fn test_wrap_bytes() {
    assert_eq!(wrap_bytes(b"foo", 10), [b"foo".as_ref()]);
    assert_eq!(wrap_bytes(b"foo bar", 10), [b"foo bar".as_ref()]);
    assert_eq!(
        wrap_bytes(b"foo bar baz", 10),
        [b"foo bar".as_ref(), b"baz".as_ref()],
    );

    // Empty text is represented as [""]
    assert_eq!(wrap_bytes(b"", 10), [b"".as_ref()]);
    assert_eq!(wrap_bytes(b" ", 10), [b"".as_ref()]);

    // Whitespace in the middle should be preserved. The inputs use runs of
    // several spaces so that the whitespace width affects where lines break.
    assert_eq!(
        wrap_bytes(b"foo  bar   baz", 8),
        [b"foo  bar".as_ref(), b"baz".as_ref()],
    );
    assert_eq!(
        wrap_bytes(b"foo  bar   x", 7),
        [b"foo".as_ref(), b"bar   x".as_ref()],
    );
    assert_eq!(
        wrap_bytes(b"foo bar \nx", 7),
        [b"foo bar".as_ref(), b"x".as_ref()],
    );
    assert_eq!(
        wrap_bytes(b"foo bar\n x", 7),
        [b"foo bar".as_ref(), b" x".as_ref()],
    );
    assert_eq!(
        wrap_bytes(b"foo bar x", 4),
        [b"foo".as_ref(), b"bar".as_ref(), b"x".as_ref()],
    );

    // Ends with "\n"
    assert_eq!(wrap_bytes(b"foo\n", 10), [b"foo".as_ref(), b"".as_ref()]);
    assert_eq!(wrap_bytes(b"foo\n", 3), [b"foo".as_ref(), b"".as_ref()]);
    assert_eq!(wrap_bytes(b"\n", 10), [b"".as_ref(), b"".as_ref()]);

    // Overflow
    assert_eq!(wrap_bytes(b"foo x", 2), [b"foo".as_ref(), b"x".as_ref()]);
    assert_eq!(wrap_bytes(b"x y", 0), [b"x".as_ref(), b"y".as_ref()]);

    // Invalid UTF-8 bytes should not cause panic
    assert_eq!(wrap_bytes(b"foo\x80", 10), [b"foo\x80".as_ref()]);
}
/// Verifies that every line returned by `wrap_bytes()` points into the source
/// text, including the empty lines.
#[test]
fn test_wrap_bytes_slice_ptr() {
    let text = b"\nfoo\n\nbar baz\n";
    let lines = wrap_bytes(text, 10);
    let expected_lines: [&[u8]; 5] = [b"", b"foo", b"", b"bar baz", b""];
    assert_eq!(lines, expected_lines);

    // Each line should be a sub-slice of the source text
    let expected_offsets = [0, 1, 5, 6, 14];
    for (line, offset) in lines.iter().zip(expected_offsets) {
        assert_eq!(line.as_ptr(), text[offset..].as_ptr());
    }
}
// Checks `write_wrapped()` on labeled content rendered through
// `format_colored()`, using inline `insta` snapshots.
//
// NOTE(review): `format_colored()` renders through `ColorFormatter`, yet the
// inline snapshots below show no color escape sequences. They may have been
// lost from this copy of the file - confirm against the recorded snapshots
// before relying on these expectations.
#[test]
fn test_write_wrapped() {
// Split single label chunk
let mut recorder = FormatRecorder::new();
recorder.push_label("red").unwrap();
write!(recorder, "foo bar baz\nqux quux\n").unwrap();
recorder.pop_label().unwrap();
insta::assert_snapshot!(
format_colored(|formatter| write_wrapped(formatter, &recorder, 7)),
@r###"
foo bar
baz
qux
quux
"###
);
// Multiple label chunks in a line
let mut recorder = FormatRecorder::new();
for (i, word) in ["foo ", "bar ", "baz\n", "qux ", "quux"].iter().enumerate() {
// Alternate between the "red" (even index) and "cyan" (odd index) labels.
recorder.push_label(["red", "cyan"][i & 1]).unwrap();
write!(recorder, "{word}").unwrap();
recorder.pop_label().unwrap();
}
insta::assert_snapshot!(
format_colored(|formatter| write_wrapped(formatter, &recorder, 7)),
@r###"
foo bar
baz
qux
quux
"###
);
// Empty lines should not cause panic
let mut recorder = FormatRecorder::new();
for (i, word) in ["", "foo", "", "bar baz", ""].iter().enumerate() {
// Each word is written as its own labeled line (`writeln!` appends "\n").
recorder.push_label(["red", "cyan"][i & 1]).unwrap();
writeln!(recorder, "{word}").unwrap();
recorder.pop_label().unwrap();
}
insta::assert_snapshot!(
format_colored(|formatter| write_wrapped(formatter, &recorder, 10)),
@r###"

foo

bar baz

"###
);
// Split at label boundary
let mut recorder = FormatRecorder::new();
recorder.push_label("red").unwrap();
write!(recorder, "foo bar").unwrap();
recorder.pop_label().unwrap();
// The separating whitespace is written without any label.
write!(recorder, " ").unwrap();
recorder.push_label("cyan").unwrap();
writeln!(recorder, "baz").unwrap();
recorder.pop_label().unwrap();
insta::assert_snapshot!(
format_colored(|formatter| write_wrapped(formatter, &recorder, 10)),
@r###"
foo bar
baz
"###
);
// Do not split at label boundary "ba|z" (since it's a single word)
let mut recorder = FormatRecorder::new();
recorder.push_label("red").unwrap();
write!(recorder, "foo bar ba").unwrap();
recorder.pop_label().unwrap();
recorder.push_label("cyan").unwrap();
writeln!(recorder, "z").unwrap();
recorder.pop_label().unwrap();
insta::assert_snapshot!(
format_colored(|formatter| write_wrapped(formatter, &recorder, 10)),
@r###"
foo bar
baz
"###
);
}
// Checks `write_wrapped()` when the content starts with whitespace written
// under a label, followed by an unlabeled word.
//
// NOTE(review): the output is rendered through `format_colored()`, but the
// inline snapshot shows no color escape sequences; they may have been lost
// from this copy of the file - confirm against the recorded snapshot.
#[test]
fn test_write_wrapped_leading_labeled_whitespace() {
let mut recorder = FormatRecorder::new();
// Leading whitespace carries the "red" label.
recorder.push_label("red").unwrap();
write!(recorder, " ").unwrap();
recorder.pop_label().unwrap();
// The word itself is written without any label.
write!(recorder, "foo").unwrap();
insta::assert_snapshot!(
format_colored(|formatter| write_wrapped(formatter, &recorder, 10)),
@" foo"
);
}
/// Checks `write_wrapped()` when the recorded content ends with labeled
/// whitespace or a labeled newline.
#[test]
fn test_write_wrapped_trailing_labeled_whitespace() {
    let render = |recorder: &FormatRecorder| {
        format_plain_text(|formatter| write_wrapped(formatter, recorder, 10))
    };

    // data: "foo" then a labeled " "
    // The only wrapped line is "foo"; the trailing space is not written.
    let mut trailing_space = FormatRecorder::new();
    write!(trailing_space, "foo").unwrap();
    trailing_space.push_label("red").unwrap();
    write!(trailing_space, " ").unwrap();
    trailing_space.pop_label().unwrap();
    assert_eq!(render(&trailing_space), "foo");

    // data: "foo" then a labeled "\n"
    // Lines are "foo" and an empty line, so the newline is written.
    let mut trailing_newline = FormatRecorder::new();
    write!(trailing_newline, "foo").unwrap();
    trailing_newline.push_label("red").unwrap();
    writeln!(trailing_newline).unwrap();
    trailing_newline.pop_label().unwrap();
    assert_eq!(render(&trailing_newline), "foo\n");

    // data: "foo\n" then a labeled " "
    // Lines are "foo" and an empty line; the trailing space is not written.
    let mut space_after_newline = FormatRecorder::new();
    writeln!(space_after_newline, "foo").unwrap();
    space_after_newline.push_label("red").unwrap();
    write!(space_after_newline, " ").unwrap();
    space_after_newline.pop_label().unwrap();
    assert_eq!(render(&space_after_newline), "foo\n");
}
}