diff: Treat multi-byte UTF-8 runes as word characters

Inline diffs on multi-byte UTF-8 characters would match individual bytes, causing garbled diffs in some cases. For example, replacing `⊢` with `⊣`, which differ in the final byte only, caused the inline diff to highlight the changed byte instead of the whole character. This commit uses a workaround present in Mercurial: all bytes 0x80 and above are treated as word characters, so the bytes of a multi-byte character are always kept together in one word and never segmented. See https://www.mercurial-scm.org/repo/hg/file/6.3.3/mercurial/patch.py#l51
2023-03-29 12:49:23 +09:00 · 2023-03-29 12:49:23 +09:00 · 01a9ce0c71
commit 01a9ce0c71
parent 7aad2aea8a
1 changed files with 11 additions and 1 deletions
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@ -44,7 +44,12 @@ pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {

 fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
-    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
+    matches!(
+        b,
+        // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are
+        // treated as a single unit.
+        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff'
+    )
 }

 pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
@ -675,6 +680,11 @@ mod tests {
        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
    }

+    #[test]
+    fn test_find_word_ranges_multibyte() {
+        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3])
+    }
+
    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];