gitignores: add own implementation and stop using libgit2's

This is to address issue #8. I haven't added the optimization to avoid
walking all the files in `target/` yet. Even so, this patch still
speeds up `jj st` in this repo, with ~13k files in `target/`, from
~320 ms to ~100 ms (-5.1dB). The time actually checking if paths match
gitignores seems to go down from 116 ms to 6 ms. I think that's mostly
because libgit2 has to look for `.gitignore` files in every parent
directory every time we ask it about a file, while the rewritten code
looks for a `.gitignore` file only when visiting a new directory.
This commit is contained in:
Martin von Zweigbergk 2021-05-12 09:43:58 -07:00
parent 31eff96cb7
commit 88f7f4732b
5 changed files with 466 additions and 43 deletions

27
Cargo.lock generated
View file

@ -4,9 +4,9 @@ version = 3
[[package]]
name = "aho-corasick"
version = "0.7.13"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
@ -588,6 +588,7 @@ dependencies = [
"protobuf",
"protobuf-codegen-pure",
"rand 0.8.0",
"regex",
"serde_json",
"tempfile",
"test-case",
@ -707,9 +708,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]]
name = "memchr"
version = "2.3.3"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]]
name = "memoffset"
@ -1050,14 +1051,13 @@ dependencies = [
[[package]]
name = "regex"
version = "1.4.2"
version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c"
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
@ -1071,9 +1071,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.6.21"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "remove_dir_all"
@ -1309,15 +1309,6 @@ dependencies = [
"syn",
]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static",
]
[[package]]
name = "time"
version = "0.1.44"

View file

@ -30,6 +30,7 @@ pest = "2.1.3"
pest_derive = "2.1.0"
protobuf = { version = "2.22.1", features = ["with-bytes"] }
rand = "0.8.0"
regex = "1.5.4"
serde_json = "1.0.60"
tempfile = "3.1.0"
thiserror = "1.0.22"

413
lib/src/gitignore.rs Normal file
View file

@ -0,0 +1,413 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use regex::{escape as regex_escape, Regex};
pub enum GitIgnoreParseError {}
struct GitIgnoreLine {
is_negative: bool,
regex: Regex,
}
impl GitIgnoreLine {
// Remove trailing spaces (unless backslash-escaped)
fn remove_trailing_space(input: &str) -> &str {
let mut trimmed_len = 0;
let mut non_space_seen = false;
let mut prev_was_space = false;
let mut in_escape = false;
for (i, c) in input.char_indices() {
if !prev_was_space && non_space_seen {
trimmed_len = i;
}
if c == ' ' {
if in_escape {
in_escape = false;
} else {
prev_was_space = true;
}
} else {
non_space_seen = true;
prev_was_space = false;
in_escape = c == '\\'
}
}
if !prev_was_space && non_space_seen {
trimmed_len = input.len();
}
&input.split_at(trimmed_len).0
}
fn parse(prefix: &str, input: &str) -> Result<Option<GitIgnoreLine>, GitIgnoreParseError> {
if input.starts_with('#') {
return Ok(None);
}
let input = GitIgnoreLine::remove_trailing_space(input);
// Remove leading "!" before checking for empty to match git's implementation
// (i.e. just "!" matching nothing, not everything).
let (is_negative, input) = match input.strip_prefix('!') {
None => (false, input),
Some(rest) => (true, rest),
};
if input.is_empty() {
return Ok(None);
}
let (matches_only_directory, input) = match input.strip_suffix('/') {
None => (false, input),
Some(rest) => (true, rest),
};
let (mut is_rooted, input) = match input.strip_prefix('/') {
None => (false, input),
Some(rest) => (true, rest),
};
is_rooted |= input.contains('/');
let mut regex = String::new();
if is_rooted {
regex.insert_str(0, prefix);
regex.insert(0, '^');
} else {
regex.insert_str(0, "(^|/)");
}
let components: Vec<_> = input.split('/').collect();
for (i, component) in components.iter().enumerate() {
if *component == "**" {
if i == components.len() - 1 {
regex.push_str(".*");
} else {
regex.push_str("(.*/)?");
}
} else {
let mut in_escape = false;
let mut character_class: Option<String> = None;
for c in component.chars() {
if in_escape {
in_escape = false;
if !matches!(c, ' ' | '#' | '!' | '?' | '\\' | '*') {
regex.push_str(&regex_escape("\\"));
}
regex.push_str(&regex_escape(&c.to_string()));
} else if c == '\\' {
in_escape = true;
} else if let Some(characters) = &mut character_class {
if c == ']' {
regex.push('[');
regex.push_str(&characters);
regex.push(']');
character_class = None;
} else {
characters.push(c);
}
} else {
in_escape = false;
if c == '?' {
regex.push_str("[^/]");
} else if c == '*' {
regex.push_str("[^/]*");
} else if c == '[' {
character_class = Some(String::new());
} else {
regex.push_str(&regex_escape(&c.to_string()));
}
}
}
if in_escape {
regex.push_str(&regex_escape("\\"));
}
if i < components.len() - 1 {
regex.push('/');
}
}
}
if matches_only_directory {
regex.push_str("/.*");
} else {
regex.push_str("(/.*|$)");
}
let regex = Regex::new(&regex).unwrap();
Ok(Some(GitIgnoreLine { is_negative, regex }))
}
fn matches_file(&self, path: &str) -> bool {
self.regex.is_match(path)
}
}
pub struct GitIgnoreFile {
parent: Option<Arc<GitIgnoreFile>>,
lines: Vec<GitIgnoreLine>,
}
impl GitIgnoreFile {
pub fn empty() -> Arc<GitIgnoreFile> {
Arc::new(GitIgnoreFile {
parent: None,
lines: vec![],
})
}
pub fn chain(
self: &Arc<GitIgnoreFile>,
prefix: &str,
input: &[u8],
) -> Result<Arc<GitIgnoreFile>, GitIgnoreParseError> {
let mut lines = vec![];
for input_line in input.split(|b| *b == b'\n') {
// Skip non-utf8 lines
if let Ok(line_string) = String::from_utf8(input_line.to_vec()) {
if let Some(line) = GitIgnoreLine::parse(prefix, &line_string)? {
lines.push(line);
}
}
}
Ok(Arc::new(GitIgnoreFile {
parent: Some(self.clone()),
lines,
}))
}
fn all_lines_reversed<'a>(&'a self) -> Box<dyn Iterator<Item = &GitIgnoreLine> + 'a> {
if let Some(parent) = &self.parent {
Box::new(self.lines.iter().rev().chain(parent.all_lines_reversed()))
} else {
Box::new(self.lines.iter().rev())
}
}
pub fn matches_file(&self, path: &str) -> bool {
// Later lines take precedence, so check them in reverse
for line in self.all_lines_reversed() {
if line.matches_file(path) {
return !line.is_negative;
}
}
false
}
}
#[cfg(test)]
mod tests {
use super::*;
fn matches_file(input: &[u8], path: &str) -> bool {
let file = GitIgnoreFile::empty().chain("", input).ok().unwrap();
file.matches_file(path)
}
#[test]
fn test_gitignore_empty_file() {
let file = GitIgnoreFile::empty();
assert!(!file.matches_file("foo"));
}
#[test]
fn test_gitignore_empty_file_with_prefix() {
let file = GitIgnoreFile::empty().chain("dir/", b"").ok().unwrap();
assert!(!file.matches_file("dir/foo"));
}
#[test]
fn test_gitignore_literal() {
let file = GitIgnoreFile::empty().chain("", b"foo\n").ok().unwrap();
assert!(file.matches_file("foo"));
assert!(file.matches_file("dir/foo"));
assert!(file.matches_file("dir/subdir/foo"));
assert!(!file.matches_file("food"));
assert!(!file.matches_file("dir/food"));
}
#[test]
fn test_gitignore_literal_with_prefix() {
let file = GitIgnoreFile::empty().chain("dir/", b"foo\n").ok().unwrap();
// I consider it undefined whether a file in a parent directory matches, but
// let's test it anyway
assert!(file.matches_file("foo"));
assert!(file.matches_file("dir/foo"));
assert!(file.matches_file("dir/subdir/foo"));
}
#[test]
fn test_gitignore_rooted_literal() {
let file = GitIgnoreFile::empty().chain("", b"/foo\n").ok().unwrap();
assert!(file.matches_file("foo"));
assert!(!file.matches_file("dir/foo"));
}
#[test]
fn test_gitignore_rooted_literal_with_prefix() {
let file = GitIgnoreFile::empty()
.chain("dir/", b"/foo\n")
.ok()
.unwrap();
// I consider it undefined whether a file in a parent directory matches, but
// let's test it anyway
assert!(!file.matches_file("foo"));
assert!(file.matches_file("dir/foo"));
assert!(!file.matches_file("dir/subdir/foo"));
}
#[test]
fn test_gitignore_deep_dir() {
let file = GitIgnoreFile::empty()
.chain("", b"/dir1/dir2/dir3\n")
.ok()
.unwrap();
assert!(!file.matches_file("foo"));
assert!(!file.matches_file("dir1/foo"));
assert!(!file.matches_file("dir1/dir2/foo"));
assert!(file.matches_file("dir1/dir2/dir3/foo"));
assert!(file.matches_file("dir1/dir2/dir3/dir4/foo"));
}
#[test]
fn test_gitignore_match_only_dir() {
let file = GitIgnoreFile::empty().chain("", b"/dir/\n").ok().unwrap();
assert!(!file.matches_file("dir"));
assert!(file.matches_file("dir/foo"));
assert!(file.matches_file("dir/subdir/foo"));
}
#[test]
fn test_gitignore_unusual_symbols() {
assert!(matches_file(b"\\*\n", "*"));
assert!(!matches_file(b"\\*\n", "foo"));
assert!(matches_file(b"\\\n", "\\"));
assert!(matches_file(b"\\!\n", "!"));
assert!(matches_file(b"\\?\n", "?"));
assert!(!matches_file(b"\\?\n", "x"));
// Invalid escapes are treated like literal backslashes
assert!(matches_file(b"\\w\n", "\\w"));
assert!(!matches_file(b"\\w\n", "w"));
}
#[test]
fn test_gitignore_whitespace() {
assert!(!matches_file(b" \n", " "));
assert!(matches_file(b"\\ \n", " "));
assert!(matches_file(b" a\n", " a"));
assert!(matches_file(b"a b\n", "a b"));
assert!(matches_file(b"a b \n", "a b"));
assert!(!matches_file(b"a b \n", "a b "));
assert!(matches_file(b"a b\\ \\ \n", "a b "));
// It's unclear how this should be interpreted, but we count spaces before
// escaped spaces
assert!(matches_file(b"a b \\ \n", "a b "));
}
#[test]
fn test_gitignore_glob() {
assert!(!matches_file(b"*.o\n", "foo"));
assert!(matches_file(b"*.o\n", "foo.o"));
assert!(!matches_file(b"foo.?\n", "foo"));
assert!(!matches_file(b"foo.?\n", "foo."));
assert!(matches_file(b"foo.?\n", "foo.o"));
}
#[test]
fn test_gitignore_range() {
assert!(!matches_file(b"foo.[az]\n", "foo"));
assert!(matches_file(b"foo.[az]\n", "foo.a"));
assert!(!matches_file(b"foo.[az]\n", "foo.g"));
assert!(matches_file(b"foo.[az]\n", "foo.z"));
assert!(!matches_file(b"foo.[a-z]\n", "foo"));
assert!(matches_file(b"foo.[a-z]\n", "foo.a"));
assert!(matches_file(b"foo.[a-z]\n", "foo.g"));
assert!(matches_file(b"foo.[a-z]\n", "foo.z"));
assert!(matches_file(b"foo.[0-9a-fA-F]\n", "foo.5"));
assert!(matches_file(b"foo.[0-9a-fA-F]\n", "foo.c"));
assert!(matches_file(b"foo.[0-9a-fA-F]\n", "foo.E"));
assert!(!matches_file(b"foo.[0-9a-fA-F]\n", "foo._"));
}
#[test]
fn test_gitignore_leading_dir_glob() {
assert!(matches_file(b"**/foo\n", "foo"));
assert!(matches_file(b"**/foo\n", "dir1/dir2/foo"));
assert!(matches_file(b"**/foo\n", "foo/file"));
assert!(matches_file(b"**/dir/foo\n", "dir/foo"));
assert!(matches_file(b"**/dir/foo\n", "dir1/dir2/dir/foo"));
}
#[test]
fn test_gitignore_leading_dir_glob_with_prefix() {
let file = GitIgnoreFile::empty()
.chain("dir1/dir2/", b"**/foo\n")
.ok()
.unwrap();
// I consider it undefined whether a file in a parent directory matches, but
// let's test it anyway
assert!(!file.matches_file("foo"));
assert!(file.matches_file("dir1/dir2/foo"));
assert!(!file.matches_file("dir1/dir2/bar"));
assert!(file.matches_file("dir1/dir2/sub1/sub2/foo"));
assert!(!file.matches_file("dir1/dir2/sub1/sub2/bar"));
}
#[test]
fn test_gitignore_trailing_dir_glob() {
assert!(!matches_file(b"abc/**\n", "abc"));
assert!(matches_file(b"abc/**\n", "abc/file"));
assert!(matches_file(b"abc/**\n", "abc/dir/file"));
}
#[test]
fn test_gitignore_internal_dir_glob() {
assert!(matches_file(b"a/**/b\n", "a/b"));
assert!(matches_file(b"a/**/b\n", "a/x/b"));
assert!(matches_file(b"a/**/b\n", "a/x/y/b"));
assert!(!matches_file(b"a/**/b\n", "ax/y/b"));
assert!(!matches_file(b"a/**/b\n", "a/x/yb"));
assert!(!matches_file(b"a/**/b\n", "ab"));
}
#[test]
fn test_gitignore_internal_dir_glob_not_really() {
assert!(!matches_file(b"a/x**y/b\n", "a/b"));
assert!(matches_file(b"a/x**y/b\n", "a/xy/b"));
assert!(matches_file(b"a/x**y/b\n", "a/xzzzy/b"));
}
#[test]
fn test_gitignore_line_ordering() {
assert!(matches_file(b"foo\n!foo/bar\n", "foo"));
assert!(!matches_file(b"foo\n!foo/bar\n", "foo/bar"));
assert!(matches_file(b"foo\n!foo/bar\n", "foo/baz"));
assert!(matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo"));
assert!(!matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar"));
assert!(matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar/baz"));
assert!(!matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar/quux"));
}
#[test]
fn test_gitignore_file_ordering() {
let file1 = GitIgnoreFile::empty().chain("", b"foo\n").ok().unwrap();
let file2 = file1.chain("foo/", b"!bar").ok().unwrap();
let file3 = file2.chain("foo/bar/", b"baz").ok().unwrap();
assert!(file1.matches_file("foo"));
assert!(file1.matches_file("foo/bar"));
assert!(!file2.matches_file("foo/bar"));
assert!(file2.matches_file("foo/baz"));
assert!(file3.matches_file("foo/bar/baz"));
assert!(!file3.matches_file("foo/bar/qux"));
}
}

View file

@ -33,6 +33,7 @@ pub mod evolution;
pub mod files;
pub mod git;
pub mod git_store;
pub mod gitignore;
pub mod index;
pub mod index_store;
pub mod local_store;

View file

@ -17,6 +17,7 @@ use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Read;
#[cfg(unix)]
use std::os::unix::fs::symlink;
#[cfg(unix)]
@ -27,13 +28,13 @@ use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use git2::{Repository, RepositoryInitOptions};
use protobuf::Message;
use tempfile::NamedTempFile;
use thiserror::Error;
use crate::commit::Commit;
use crate::commit_builder::CommitBuilder;
use crate::gitignore::GitIgnoreFile;
use crate::lock::FileLock;
use crate::repo::ReadonlyRepo;
use crate::repo_path::{
@ -282,6 +283,25 @@ impl TreeState {
self.store.write_symlink(path, str_target).unwrap()
}
fn try_chain_gitignore(
base: &Arc<GitIgnoreFile>,
prefix: &str,
file: PathBuf,
) -> Arc<GitIgnoreFile> {
if file.is_file() {
let mut file = File::open(file).unwrap();
let mut buf = Vec::new();
file.read_to_end(&mut buf).unwrap();
if let Ok(chained) = base.chain(prefix, &buf) {
chained
} else {
base.clone()
}
} else {
base.clone()
}
}
// Look for changes to the working copy. If there are any changes, create
// a new tree from it and return it, and also update the dirstate on disk.
// TODO: respect ignores
@ -289,33 +309,30 @@ impl TreeState {
// We create a temporary git repo with the working copy shared with ours only
// so we can use libgit2's .gitignore check.
// TODO: We should probably have the caller pass in the home directory to the
// library crate instead of depending on $HOME directly here (as we do because
// git2::Repository::status_should_ignore() reads the .gitignore there).
// TODO: Do this more cleanly, perhaps by reading .gitignore files ourselves.
let git_repo_dir = tempfile::tempdir().unwrap();
let mut git_repo_options = RepositoryInitOptions::new();
git_repo_options.workdir_path(&self.working_copy_path);
// Repository::init_opts creates a ".git" file in the working copy,
// which is undesired. On Windows it's worse because that ".git" makes
// the next Repository::init_opts fail with "Permission Denied".
// Automatically remove it.
let _cleanup_dot_git = {
struct Cleanup(PathBuf);
impl Drop for Cleanup {
fn drop(&mut self) {
let _ = fs::remove_file(&self.0);
}
}
Cleanup(self.working_copy_path.join(".git"))
};
let git_repo = Repository::init_opts(git_repo_dir.path(), &git_repo_options).unwrap();
// library crate instead of depending on $HOME directly here. We should also
// have the caller (within the library crate) chain that the
// .jj/git/info/exclude file if we're inside a git-backed repo.
let home_dir = std::env::var("HOME");
let home_dir_path = PathBuf::from(home_dir.unwrap());
let mut git_ignore = GitIgnoreFile::empty();
git_ignore =
TreeState::try_chain_gitignore(&git_ignore, "", home_dir_path.join(".gitignore"));
let mut work = vec![(DirRepoPath::root(), self.working_copy_path.clone())];
let mut work = vec![(
DirRepoPath::root(),
self.working_copy_path.clone(),
git_ignore,
)];
let mut tree_builder = self.store.tree_builder(self.tree_id.clone());
let mut deleted_files: HashSet<&FileRepoPath> = self.file_states.keys().collect();
let mut modified_files = BTreeMap::new();
while !work.is_empty() {
let (dir, disk_dir) = work.pop().unwrap();
let (dir, disk_dir, git_ignore) = work.pop().unwrap();
let git_ignore = TreeState::try_chain_gitignore(
&git_ignore,
&dir.to_internal_string(),
disk_dir.join(".gitignore"),
);
for maybe_entry in disk_dir.read_dir().unwrap() {
let entry = maybe_entry.unwrap();
let file_type = entry.file_type().unwrap();
@ -327,7 +344,7 @@ impl TreeState {
if file_type.is_dir() {
let subdir = dir.join(&DirRepoPathComponent::from(name));
let disk_subdir = disk_dir.join(file_name);
work.push((subdir, disk_subdir));
work.push((subdir, disk_subdir, git_ignore.clone()));
} else {
let file = dir.join(&FileRepoPathComponent::from(name));
let disk_file = disk_dir.join(file_name);
@ -338,7 +355,7 @@ impl TreeState {
match self.file_states.get(&file) {
None => {
// untracked
if git_repo.status_should_ignore(&disk_file).unwrap() {
if git_ignore.matches_file(&file.to_internal_string()) {
continue;
}
clean = false;