WIP: Switch to byte-oriented indexing

Co-Authored-By: Nathan Sobo <nathan@zed.dev>
This commit is contained in:
Antonio Scandurra 2021-05-17 18:01:08 +02:00
parent 529a42e78f
commit f7691fc00c
2 changed files with 80 additions and 70 deletions

View file

@ -1,6 +1,5 @@
use super::Point; use super::Point;
use crate::sum_tree::{self, SeekBias, SumTree}; use crate::sum_tree::{self, SeekBias, SumTree};
use crate::util::byte_range_for_char_range;
use anyhow::{anyhow, Result}; use anyhow::{anyhow, Result};
use arrayvec::ArrayString; use arrayvec::ArrayString;
use smallvec::SmallVec; use smallvec::SmallVec;
@ -125,31 +124,54 @@ impl Rope {
ChunksIter::new(self, range) ChunksIter::new(self, range)
} }
pub fn to_point(&self, offset: usize) -> Result<Point> { pub fn to_point(&self, offset: usize) -> Point {
if offset <= self.summary().chars { assert!(offset <= self.summary().bytes);
let mut cursor = self.chunks.cursor::<usize, TextSummary>(); let mut cursor = self.chunks.cursor::<usize, TextSummary>();
cursor.seek(&offset, SeekBias::Left, &()); cursor.seek(&offset, SeekBias::Left, &());
let overshoot = offset - cursor.start().chars; let overshoot = offset - cursor.start().bytes;
Ok(cursor.start().lines cursor.start().lines
+ cursor + cursor
.item() .item()
.map_or(Point::zero(), |chunk| chunk.to_point(overshoot))) .map_or(Point::zero(), |chunk| chunk.to_point(overshoot))
} else {
Err(anyhow!("offset out of bounds"))
}
} }
pub fn to_offset(&self, point: Point) -> Result<usize> { pub fn to_offset(&self, point: Point) -> usize {
if point <= self.summary().lines { assert!(point <= self.summary().lines);
let mut cursor = self.chunks.cursor::<Point, TextSummary>(); let mut cursor = self.chunks.cursor::<Point, TextSummary>();
cursor.seek(&point, SeekBias::Left, &()); cursor.seek(&point, SeekBias::Left, &());
let overshoot = point - cursor.start().lines; let overshoot = point - cursor.start().lines;
Ok(cursor.start().chars cursor.start().bytes + cursor.item().map_or(0, |chunk| chunk.to_offset(overshoot))
+ cursor }
.item()
.map_or(Ok(0), |chunk| chunk.to_offset(overshoot))?) pub fn next_char_boundary(&self, mut offset: usize) -> usize {
assert!(offset <= self.summary().bytes);
let mut cursor = self.chunks.cursor::<usize, usize>();
cursor.seek(&offset, SeekBias::Left, &());
if let Some(chunk) = cursor.item() {
let ix = offset - cursor.start();
while !chunk.0.is_char_boundary(ix) {
ix += 1;
offset += 1;
}
offset
} else { } else {
Err(anyhow!("offset out of bounds")) offset
}
}
pub fn prev_char_boundary(&self, offset: usize) -> usize {
assert!(offset <= self.summary().bytes);
let mut cursor = self.chunks.cursor::<usize, usize>();
cursor.seek(&offset, SeekBias::Left, &());
if let Some(chunk) = cursor.item() {
let ix = offset - cursor.start();
while !chunk.0.is_char_boundary(ix) {
ix -= 1;
offset -= 1;
}
offset
} else {
offset
} }
} }
} }
@ -193,8 +215,7 @@ impl<'a> Cursor<'a> {
if let Some(start_chunk) = self.chunks.item() { if let Some(start_chunk) = self.chunks.item() {
let start_ix = self.offset - self.chunks.start(); let start_ix = self.offset - self.chunks.start();
let end_ix = cmp::min(end_offset, self.chunks.end()) - self.chunks.start(); let end_ix = cmp::min(end_offset, self.chunks.end()) - self.chunks.start();
let byte_range = byte_range_for_char_range(start_chunk.0, start_ix..end_ix); slice.push(&start_chunk.0[start_ix..end_ix]);
slice.push(&start_chunk.0[byte_range]);
} }
if end_offset > self.chunks.end() { if end_offset > self.chunks.end() {
@ -204,8 +225,7 @@ impl<'a> Cursor<'a> {
}); });
if let Some(end_chunk) = self.chunks.item() { if let Some(end_chunk) = self.chunks.item() {
let end_ix = end_offset - self.chunks.start(); let end_ix = end_offset - self.chunks.start();
let byte_range = byte_range_for_char_range(end_chunk.0, 0..end_ix); slice.push(&end_chunk.0[..end_ix]);
slice.push(&end_chunk.0[byte_range]);
} }
} }
@ -220,8 +240,7 @@ impl<'a> Cursor<'a> {
if let Some(start_chunk) = self.chunks.item() { if let Some(start_chunk) = self.chunks.item() {
let start_ix = self.offset - self.chunks.start(); let start_ix = self.offset - self.chunks.start();
let end_ix = cmp::min(end_offset, self.chunks.end()) - self.chunks.start(); let end_ix = cmp::min(end_offset, self.chunks.end()) - self.chunks.start();
let byte_range = byte_range_for_char_range(start_chunk.0, start_ix..end_ix); summary = TextSummary::from(&start_chunk.0[start_ix..end_ix]);
summary = TextSummary::from(&start_chunk.0[byte_range]);
} }
if end_offset > self.chunks.end() { if end_offset > self.chunks.end() {
@ -229,8 +248,7 @@ impl<'a> Cursor<'a> {
summary += &self.chunks.summary(&end_offset, SeekBias::Right, &()); summary += &self.chunks.summary(&end_offset, SeekBias::Right, &());
if let Some(end_chunk) = self.chunks.item() { if let Some(end_chunk) = self.chunks.item() {
let end_ix = end_offset - self.chunks.start(); let end_ix = end_offset - self.chunks.start();
let byte_range = byte_range_for_char_range(end_chunk.0, 0..end_ix); summary += TextSummary::from(&end_chunk.0[..end_ix]);
summary += TextSummary::from(&end_chunk.0[byte_range]);
} }
} }
@ -268,9 +286,8 @@ impl<'a> Iterator for ChunksIter<'a> {
} else if let Some(chunk) = self.chunks.item() { } else if let Some(chunk) = self.chunks.item() {
let start = self.range.start.saturating_sub(*self.chunks.start()); let start = self.range.start.saturating_sub(*self.chunks.start());
let end = self.range.end - self.chunks.start(); let end = self.range.end - self.chunks.start();
let byte_range = byte_range_for_char_range(&chunk.0, start..end);
self.chunks.next(); self.chunks.next();
Some(&chunk.0[byte_range]) Some(&chunk.0[start..end])
} else { } else {
None None
} }
@ -293,14 +310,14 @@ impl Chunk {
point.row += 1; point.row += 1;
point.column = 0; point.column = 0;
} else { } else {
point.column += 1; point.column += ch.len_utf8() as u32;
} }
offset += 1; offset += ch.len_utf8();
} }
point point
} }
fn to_offset(&self, target: Point) -> Result<usize> { fn to_offset(&self, target: Point) -> usize {
let mut offset = 0; let mut offset = 0;
let mut point = Point::new(0, 0); let mut point = Point::new(0, 0);
for ch in self.0.chars() { for ch in self.0.chars() {
@ -312,16 +329,13 @@ impl Chunk {
point.row += 1; point.row += 1;
point.column = 0; point.column = 0;
} else { } else {
point.column += 1; point.column += ch.len_utf8() as u32;
} }
offset += 1; offset += ch.len_utf8();
} }
if point == target { assert_eq!(point, target);
Ok(offset) offset
} else {
Err(anyhow!("point out of bounds"))
}
} }
} }
@ -335,7 +349,6 @@ impl sum_tree::Item for Chunk {
#[derive(Clone, Debug, Default, Eq, PartialEq)] #[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct TextSummary { pub struct TextSummary {
pub chars: usize,
pub bytes: usize, pub bytes: usize,
pub lines: Point, pub lines: Point,
pub first_line_len: u32, pub first_line_len: u32,
@ -345,18 +358,15 @@ pub struct TextSummary {
impl<'a> From<&'a str> for TextSummary { impl<'a> From<&'a str> for TextSummary {
fn from(text: &'a str) -> Self { fn from(text: &'a str) -> Self {
let mut chars = 0; let mut chars = 0;
let mut bytes = 0;
let mut lines = Point::new(0, 0); let mut lines = Point::new(0, 0);
let mut first_line_len = 0; let mut first_line_len = 0;
let mut rightmost_point = Point::new(0, 0); let mut rightmost_point = Point::new(0, 0);
for c in text.chars() { for c in text.chars() {
chars += 1;
bytes += c.len_utf8();
if c == '\n' { if c == '\n' {
lines.row += 1; lines.row += 1;
lines.column = 0; lines.column = 0;
} else { } else {
lines.column += 1; lines.column += c.len_utf8() as u32;
if lines.row == 0 { if lines.row == 0 {
first_line_len = lines.column; first_line_len = lines.column;
} }
@ -367,8 +377,7 @@ impl<'a> From<&'a str> for TextSummary {
} }
TextSummary { TextSummary {
chars, bytes: text.len(),
bytes,
lines, lines,
first_line_len, first_line_len,
rightmost_point, rightmost_point,
@ -398,7 +407,6 @@ impl<'a> std::ops::AddAssign<&'a Self> for TextSummary {
self.first_line_len += other.first_line_len; self.first_line_len += other.first_line_len;
} }
self.chars += other.chars;
self.bytes += other.bytes; self.bytes += other.bytes;
self.lines += &other.lines; self.lines += &other.lines;
} }
@ -418,7 +426,7 @@ impl<'a> sum_tree::Dimension<'a, TextSummary> for TextSummary {
impl<'a> sum_tree::Dimension<'a, TextSummary> for usize { impl<'a> sum_tree::Dimension<'a, TextSummary> for usize {
fn add_summary(&mut self, summary: &'a TextSummary) { fn add_summary(&mut self, summary: &'a TextSummary) {
*self += summary.chars; *self += summary.bytes;
} }
} }
@ -439,6 +447,7 @@ impl<'a> Chars<'a> {
cursor.seek(&start, SeekBias::Left, &()); cursor.seek(&start, SeekBias::Left, &());
let chars = if let Some(chunk) = cursor.item() { let chars = if let Some(chunk) = cursor.item() {
let ix = start - cursor.start(); let ix = start - cursor.start();
assert_char_boundary(&chunk.0, ix);
cursor.next(); cursor.next();
chunk.0.chars().skip(ix) chunk.0.chars().skip(ix)
} else { } else {
@ -484,6 +493,23 @@ fn find_split_ix(text: &str) -> usize {
ix ix
} }
#[inline(always)]
fn assert_char_boundary(s: &str, ix: usize) {
if !s.is_char_boundary(ix) {
let mut char_start = ix;
while !s.is_char_boundary(char_start) {
char_start -= 1;
}
let ch = s[char_start..].chars().next().unwrap();
let char_range = char_start..char_start + ch.len_utf8();
panic!(
"byte index {} is not a char boundary; it is inside {:?} (bytes {:?}) of `{}`",
ix, ch, char_range, s
);
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::util::RandomCharIter; use crate::util::RandomCharIter;
@ -521,8 +547,8 @@ mod tests {
let mut expected = String::new(); let mut expected = String::new();
let mut actual = Rope::new(); let mut actual = Rope::new();
for _ in 0..operations { for _ in 0..operations {
let end_ix = rng.gen_range(0..=expected.chars().count()); let end_ix = actual.next_char_boundary(rng.gen_range(0..=expected.len()));
let start_ix = rng.gen_range(0..=end_ix); let start_ix = actual.prev_char_boundary(rng.gen_range(0..=end_ix));
let len = rng.gen_range(0..=64); let len = rng.gen_range(0..=64);
let new_text: String = RandomCharIter::new(&mut rng).take(len).collect(); let new_text: String = RandomCharIter::new(&mut rng).take(len).collect();
@ -559,8 +585,8 @@ mod tests {
let mut point = Point::new(0, 0); let mut point = Point::new(0, 0);
let mut offset = 0; let mut offset = 0;
for ch in expected.chars() { for ch in expected.chars() {
assert_eq!(actual.to_point(offset).unwrap(), point); assert_eq!(actual.to_point(offset), point);
assert_eq!(actual.to_offset(point).unwrap(), offset); assert_eq!(actual.to_offset(point), offset);
if ch == '\n' { if ch == '\n' {
assert!(actual assert!(actual
.to_offset(Point::new(point.row, point.column + 1)) .to_offset(Point::new(point.row, point.column + 1))
@ -581,7 +607,6 @@ mod tests {
for _ in 0..5 { for _ in 0..5 {
let end_ix = rng.gen_range(0..=expected.chars().count()); let end_ix = rng.gen_range(0..=expected.chars().count());
let start_ix = rng.gen_range(0..=end_ix); let start_ix = rng.gen_range(0..=end_ix);
let byte_range = byte_range_for_char_range(&expected, start_ix..end_ix);
assert_eq!( assert_eq!(
actual.cursor(start_ix).summary(end_ix), actual.cursor(start_ix).summary(end_ix),
TextSummary::from(&expected[byte_range]) TextSummary::from(&expected[byte_range])

View file

@ -1,21 +1,6 @@
use rand::prelude::*; use rand::prelude::*;
use std::{cmp::Ordering, ops::Range}; use std::{cmp::Ordering, ops::Range};
pub fn byte_range_for_char_range(text: impl AsRef<str>, char_range: Range<usize>) -> Range<usize> {
let text = text.as_ref();
let mut result = text.len()..text.len();
for (i, (offset, _)) in text.char_indices().enumerate() {
if i == char_range.start {
result.start = offset;
}
if i == char_range.end {
result.end = offset;
break;
}
}
result
}
pub fn post_inc(value: &mut usize) -> usize { pub fn post_inc(value: &mut usize) -> usize {
let prev = *value; let prev = *value;
*value += 1; *value += 1;