revset: add diff_contains(text[, files]) to search diffs

The text pattern is applied prior to comparison as we do in Mercurial. This
might affect hunk selection, but is much faster than computing diff of full
file contents. For example, the following hunk wouldn't be caught by
diff_contains("a") because the line "b\n" is filtered out:

    - a
      b
    + a

Closes 
This commit is contained in:
Yuya Nishihara 2024-07-08 20:20:41 +09:00
parent eabff4c0b4
commit 895eead4b8
5 changed files with 268 additions and 2 deletions

View file

@ -63,6 +63,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
* The `file()` revset function now accepts fileset as argument.
* New `diff_contains()` revset function can be used to search diffs.
### Fixed bugs
* `jj diff --git` no longer shows the contents of binary files.

View file

@ -280,6 +280,16 @@ given [string pattern](#string-patterns).
Some file patterns might need quoting because the `expression` must also be
parsable as a revset. For example, `.` has to be quoted in `file(".")`.
* `diff_contains(text[, files])`: Commits containing diffs matching the given
`text` pattern line by line.
The search paths can be narrowed by the `files` expression. All modified files
are scanned by default, but it is likely to change in future version to
respect the command line path arguments.
For example, `diff_contains("TODO", "src")` will search revisions where "TODO"
is added to or removed from files under "src".
* `conflict()`: Commits with conflicts.
* `present(x)`: Same as `x`, but evaluated to `none()` if any of the commits

View file

@ -20,14 +20,17 @@ use std::collections::{BTreeSet, BinaryHeap, HashSet};
use std::ops::Range;
use std::rc::Rc;
use std::sync::Arc;
use std::{fmt, iter};
use std::{fmt, iter, str};
use futures::StreamExt as _;
use itertools::Itertools;
use pollster::FutureExt as _;
use super::rev_walk::{EagerRevWalk, PeekableRevWalk, RevWalk, RevWalkBuilder};
use super::revset_graph_iterator::RevsetGraphWalk;
use crate::backend::{BackendResult, ChangeId, CommitId, MillisSinceEpoch};
use crate::backend::{BackendError, BackendResult, ChangeId, CommitId, MillisSinceEpoch};
use crate::commit::Commit;
use crate::conflicts::{materialized_diff_stream, MaterializedTreeValue};
use crate::default_index::{AsCompositeIndex, CompositeIndex, IndexPosition};
use crate::graph::GraphEdge;
use crate::matchers::{Matcher, Visit};
@ -37,6 +40,7 @@ use crate::revset::{
RevsetFilterPredicate, GENERATION_RANGE_FULL,
};
use crate::store::Store;
use crate::str_util::StringPattern;
use crate::{rewrite, union_find};
type BoxedPredicateFn<'a> = Box<dyn FnMut(&CompositeIndex, IndexPosition) -> bool + 'a>;
@ -1078,6 +1082,16 @@ fn build_predicate_fn(
has_diff_from_parent(&store, index, &commit, matcher.as_ref()).unwrap()
})
}
RevsetFilterPredicate::DiffContains { text, files } => {
let text_pattern = text.clone();
let files_matcher: Rc<dyn Matcher> = files.to_matcher().into();
box_pure_predicate_fn(move |index, pos| {
let entry = index.entry_by_pos(pos);
let commit = store.get_commit(&entry.commit_id()).unwrap();
matches_diff_from_parent(&store, index, &commit, &text_pattern, &*files_matcher)
.unwrap()
})
}
RevsetFilterPredicate::HasConflict => box_pure_predicate_fn(move |index, pos| {
let entry = index.entry_by_pos(pos);
let commit = store.get_commit(&entry.commit_id()).unwrap();
@ -1115,6 +1129,75 @@ fn has_diff_from_parent(
Ok(from_tree.diff(&to_tree, matcher).next().is_some())
}
fn matches_diff_from_parent(
store: &Arc<Store>,
index: &CompositeIndex,
commit: &Commit,
text_pattern: &StringPattern,
files_matcher: &dyn Matcher,
) -> BackendResult<bool> {
let parents: Vec<_> = commit.parents().try_collect()?;
let from_tree = rewrite::merge_commit_trees_without_repo(store, &index, &parents)?;
let to_tree = commit.tree()?;
let tree_diff = from_tree.diff_stream(&to_tree, files_matcher);
// Conflicts are compared in materialized form. Alternatively, conflict
// pairs can be compared one by one. #4062
let mut diff_stream = materialized_diff_stream(store, tree_diff);
async {
while let Some((path, diff)) = diff_stream.next().await {
let (left_value, right_value) = diff?;
let left_content = to_file_content(&path, left_value)?;
let right_content = to_file_content(&path, right_value)?;
// Filter lines prior to comparison. This might produce inferior
// hunks due to lack of contexts, but is way faster than full diff.
let left_lines = match_lines(&left_content, text_pattern);
let right_lines = match_lines(&right_content, text_pattern);
if left_lines.ne(right_lines) {
return Ok(true);
}
}
Ok(false)
}
.block_on()
}
fn match_lines<'a: 'b, 'b>(
text: &'a [u8],
pattern: &'b StringPattern,
) -> impl Iterator<Item = &'a [u8]> + 'b {
// The pattern is matched line by line so that it can be anchored to line
// start/end. For example, exact:"" will match blank lines.
text.split_inclusive(|b| *b == b'\n').filter(|line| {
let line = line.strip_suffix(b"\n").unwrap_or(line);
// TODO: add .matches_bytes() or .to_bytes_matcher()
str::from_utf8(line).map_or(false, |line| pattern.matches(line))
})
}
fn to_file_content(path: &RepoPath, value: MaterializedTreeValue) -> BackendResult<Vec<u8>> {
match value {
MaterializedTreeValue::Absent => Ok(vec![]),
MaterializedTreeValue::AccessDenied(_) => Ok(vec![]),
MaterializedTreeValue::File { id, mut reader, .. } => {
let mut content = vec![];
reader
.read_to_end(&mut content)
.map_err(|err| BackendError::ReadFile {
path: path.to_owned(),
id: id.clone(),
source: err.into(),
})?;
Ok(content)
}
MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into_bytes()),
MaterializedTreeValue::GitSubmodule(_) => Ok(vec![]),
MaterializedTreeValue::Conflict { contents, .. } => Ok(contents),
MaterializedTreeValue::Tree(id) => {
panic!("Unexpected tree with id {id:?} in diff at path {path:?}");
}
}
}
#[cfg(test)]
mod tests {
use super::*;

View file

@ -134,6 +134,11 @@ pub enum RevsetFilterPredicate {
Committer(StringPattern),
/// Commits modifying the paths specified by the fileset.
File(FilesetExpression),
/// Commits containing diffs matching the `text` pattern within the `files`.
DiffContains {
text: StringPattern,
files: FilesetExpression,
},
/// Commits with conflicts
HasConflict,
/// Custom predicates provided by extensions
@ -714,6 +719,26 @@ static BUILTIN_FUNCTION_MAP: Lazy<HashMap<&'static str, RevsetFunction>> = Lazy:
let expr = FilesetExpression::union_all(file_expressions);
Ok(RevsetExpression::filter(RevsetFilterPredicate::File(expr)))
});
map.insert("diff_contains", |function, context| {
let ([text_arg], [files_opt_arg]) = function.expect_arguments()?;
let text = expect_string_pattern(text_arg)?;
let files = if let Some(files_arg) = files_opt_arg {
let ctx = context.workspace.as_ref().ok_or_else(|| {
RevsetParseError::with_span(
RevsetParseErrorKind::FsPathWithoutWorkspace,
files_arg.span,
)
})?;
expect_fileset_expression(files_arg, ctx.path_converter)?
} else {
// TODO: defaults to CLI path arguments?
// https://github.com/martinvonz/jj/issues/2933#issuecomment-1925870731
FilesetExpression::all()
};
Ok(RevsetExpression::filter(
RevsetFilterPredicate::DiffContains { text, files },
))
});
map.insert("conflict", |function, _context| {
function.expect_no_arguments()?;
Ok(RevsetExpression::filter(RevsetFilterPredicate::HasConflict))

View file

@ -3012,6 +3012,152 @@ fn test_evaluate_expression_file() {
);
}
#[test]
fn test_evaluate_expression_diff_contains() {
let settings = testutils::user_settings();
let test_workspace = TestWorkspace::init(&settings);
let repo = &test_workspace.repo;
let mut tx = repo.start_transaction(&settings);
let mut_repo = tx.mut_repo();
let empty_clean_inserted_deleted =
RepoPath::from_internal_string("empty_clean_inserted_deleted");
let blank_clean_inserted_clean = RepoPath::from_internal_string("blank_clean_inserted_clean");
let noeol_modified_modified_clean =
RepoPath::from_internal_string("noeol_modified_modified_clean");
let normal_inserted_modified_removed =
RepoPath::from_internal_string("normal_inserted_modified_removed");
let tree1 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, ""),
(blank_clean_inserted_clean, "\n"),
(noeol_modified_modified_clean, "1"),
(normal_inserted_modified_removed, "1\n"),
],
);
let tree2 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, ""),
(blank_clean_inserted_clean, "\n"),
(noeol_modified_modified_clean, "2"),
(normal_inserted_modified_removed, "1\n2\n"),
],
);
let tree3 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, "3"),
(blank_clean_inserted_clean, "\n3\n"),
(noeol_modified_modified_clean, "2 3"),
(normal_inserted_modified_removed, "1 3\n2\n"),
],
);
let tree4 = create_tree(
repo,
&[
(empty_clean_inserted_deleted, ""),
(blank_clean_inserted_clean, "\n3\n"),
(noeol_modified_modified_clean, "2 3"),
// normal_inserted_modified_removed
],
);
let commit1 = mut_repo
.new_commit(
&settings,
vec![repo.store().root_commit_id().clone()],
tree1.id(),
)
.write()
.unwrap();
let commit2 = mut_repo
.new_commit(&settings, vec![commit1.id().clone()], tree2.id())
.write()
.unwrap();
let commit3 = mut_repo
.new_commit(&settings, vec![commit2.id().clone()], tree3.id())
.write()
.unwrap();
let commit4 = mut_repo
.new_commit(&settings, vec![commit3.id().clone()], tree4.id())
.write()
.unwrap();
let query = |revset_str: &str| {
resolve_commit_ids_in_workspace(
mut_repo,
revset_str,
&test_workspace.workspace,
Some(test_workspace.workspace.workspace_root()),
)
};
// should match both inserted and deleted lines
assert_eq!(
query("diff_contains('2')"),
vec![
commit4.id().clone(),
commit3.id().clone(),
commit2.id().clone(),
]
);
assert_eq!(
query("diff_contains('3')"),
vec![commit4.id().clone(), commit3.id().clone()]
);
assert_eq!(query("diff_contains('2 3')"), vec![commit3.id().clone()]);
assert_eq!(
query("diff_contains('1 3')"),
vec![commit4.id().clone(), commit3.id().clone()]
);
// should match line with eol
assert_eq!(
query(&format!(
"diff_contains(exact:'1', {normal_inserted_modified_removed:?})",
)),
vec![commit3.id().clone(), commit1.id().clone()]
);
// should match line without eol
assert_eq!(
query(&format!(
"diff_contains(exact:'1', {noeol_modified_modified_clean:?})",
)),
vec![commit2.id().clone(), commit1.id().clone()]
);
// exact:'' should match blank line
assert_eq!(
query(&format!(
"diff_contains(exact:'', {empty_clean_inserted_deleted:?})",
)),
vec![]
);
assert_eq!(
query(&format!(
"diff_contains(exact:'', {blank_clean_inserted_clean:?})",
)),
vec![commit1.id().clone()]
);
// '' should match anything but clean
assert_eq!(
query(&format!(
"diff_contains('', {empty_clean_inserted_deleted:?})",
)),
vec![commit4.id().clone(), commit3.id().clone()]
);
assert_eq!(
query(&format!(
"diff_contains('', {blank_clean_inserted_clean:?})",
)),
vec![commit3.id().clone(), commit1.id().clone()]
);
}
#[test]
fn test_evaluate_expression_conflict() {
let settings = testutils::user_settings();