From 9c28fe954c1d359f6523bd1cea47442b38b447b1 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara
Date: Sun, 7 Apr 2024 21:14:17 +0900
Subject: [PATCH] fileset: add grammar and implement parser (without name
 resolution)

The fileset grammar is basically a stripped-down version of the revset grammar,
with a few adjustments:

* extract function call to "function" rule (like templater)
* inline "symbol" rule (because "identifier" and "string" should be treated
  differently at the early parsing stage.)

The parser will have a separate name resolution stage. This will help to do
alias substitution properly. I'll probably rewrite the revset parser in the
same way. It will also help if we want to embed fileset expression in file()
revset.
---
 lib/src/fileset.pest      |  60 +++++
 lib/src/fileset.rs        |   1 +
 lib/src/fileset_parser.rs | 553 ++++++++++++++++++++++++++++++++++++++
 lib/src/lib.rs            |   1 +
 4 files changed, 615 insertions(+)
 create mode 100644 lib/src/fileset.pest
 create mode 100644 lib/src/fileset_parser.rs

diff --git a/lib/src/fileset.pest b/lib/src/fileset.pest
new file mode 100644
index 000000000..83e17780c
--- /dev/null
+++ b/lib/src/fileset.pest
@@ -0,0 +1,60 @@
+// Copyright 2021-2024 The Jujutsu Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+whitespace = _{ " " | "\t" | "\r" | "\n" | "\x0c" }
+
+// TODO: adjust identifier rule for file names
+identifier_part = @{ (ASCII_ALPHANUMERIC | "_" | "/")+ }
+identifier = @{
+  identifier_part ~ (("." | "-" | "+") ~ identifier_part)*
+}
+
+string_escape = @{ "\\" ~ ("t" | "r" | "n" | "0" | "\"" | "\\") }
+string_content_char = @{ !("\"" | "\\") ~ ANY }
+string_content = @{ string_content_char+ }
+string_literal = ${ "\"" ~ (string_content | string_escape)* ~ "\"" }
+
+pattern_kind_op = { ":" }
+
+negate_op = { "~" }
+union_op = { "|" }
+intersection_op = { "&" }
+difference_op = { "~" }
+prefix_ops = _{ negate_op }
+infix_ops = _{ union_op | intersection_op | difference_op }
+
+function = { function_name ~ "(" ~ whitespace* ~ function_arguments ~ whitespace* ~ ")" }
+function_name = @{ (ASCII_ALPHANUMERIC | "_")+ }
+function_arguments = {
+  expression ~ (whitespace* ~ "," ~ whitespace* ~ expression)* ~ (whitespace* ~ ",")?
+  | ""
+}
+
+// TODO: change rhs to string_literal to require quoting? #2101
+string_pattern = { identifier ~ pattern_kind_op ~ (identifier | string_literal) }
+
+primary = {
+  "(" ~ whitespace* ~ expression ~ whitespace* ~ ")"
+  | function
+  | string_pattern
+  | identifier
+  | string_literal
+}
+
+expression = {
+  (prefix_ops ~ whitespace*)* ~ primary
+  ~ (whitespace* ~ infix_ops ~ whitespace* ~ (prefix_ops ~ whitespace*)* ~ primary)*
+}
+
+program = _{ SOI ~ whitespace* ~ expression ~ whitespace* ~ EOI }
diff --git a/lib/src/fileset.rs b/lib/src/fileset.rs
index 1074263ba..5c7b29b36 100644
--- a/lib/src/fileset.rs
+++ b/lib/src/fileset.rs
@@ -19,6 +19,7 @@ use std::slice;
 
 use thiserror::Error;
 
+pub use crate::fileset_parser::{FilesetParseError, FilesetParseErrorKind, FilesetParseResult};
 use crate::matchers::{
     DifferenceMatcher, EverythingMatcher, FilesMatcher, IntersectionMatcher, Matcher,
     NothingMatcher, PrefixMatcher, UnionMatcher,
diff --git a/lib/src/fileset_parser.rs b/lib/src/fileset_parser.rs
new file mode 100644
index 000000000..c08fb02cf
--- /dev/null
+++ b/lib/src/fileset_parser.rs
@@ -0,0 +1,553 @@
+// Copyright 2024 The Jujutsu Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Parser for the fileset language.
+
+#![allow(unused)] // TODO
+
+use std::error;
+
+use itertools::Itertools as _;
+use once_cell::sync::Lazy;
+use pest::iterators::Pair;
+use pest::pratt_parser::{Assoc, Op, PrattParser};
+use pest::Parser;
+use pest_derive::Parser;
+use thiserror::Error;
+
+use crate::dsl_util::StringLiteralParser;
+
+#[derive(Parser)]
+#[grammar = "fileset.pest"]
+struct FilesetParser;
+
+const STRING_LITERAL_PARSER: StringLiteralParser<Rule> = StringLiteralParser {
+    content_rule: Rule::string_content,
+    escape_rule: Rule::string_escape,
+};
+
+impl Rule {
+    fn to_symbol(self) -> Option<&'static str> {
+        match self {
+            Rule::EOI => None,
+            Rule::whitespace => None,
+            Rule::identifier_part => None,
+            Rule::identifier => None,
+            Rule::string_escape => None,
+            Rule::string_content_char => None,
+            Rule::string_content => None,
+            Rule::string_literal => None,
+            Rule::pattern_kind_op => Some(":"),
+            Rule::negate_op => Some("~"),
+            Rule::union_op => Some("|"),
+            Rule::intersection_op => Some("&"),
+            Rule::difference_op => Some("~"),
+            Rule::prefix_ops => None,
+            Rule::infix_ops => None,
+            Rule::function => None,
+            Rule::function_name => None,
+            Rule::function_arguments => None,
+            Rule::string_pattern => None,
+            Rule::primary => None,
+            Rule::expression => None,
+            Rule::program => None,
+        }
+    }
+}
+
+/// Result of fileset parsing and name resolution.
+pub type FilesetParseResult<T> = Result<T, FilesetParseError>;
+
+/// Error occurred during fileset parsing and name resolution.
+#[derive(Debug, Error)]
+#[error("{pest_error}")]
+pub struct FilesetParseError {
+    kind: FilesetParseErrorKind,
+    pest_error: Box<pest::error::Error<Rule>>,
+    source: Option<Box<dyn error::Error + Send + Sync>>,
+}
+
+/// Categories of fileset parsing and name resolution error.
+#[allow(missing_docs)]
+#[derive(Clone, Debug, Eq, Error, PartialEq)]
+pub enum FilesetParseErrorKind {
+    #[error("Syntax error")]
+    SyntaxError,
+    #[error(r#"Function "{name}" doesn't exist"#)]
+    NoSuchFunction {
+        name: String,
+        candidates: Vec<String>,
+    },
+    #[error(r#"Function "{name}": {message}"#)]
+    InvalidArguments { name: String, message: String },
+    #[error("{0}")]
+    Expression(String),
+}
+
+impl FilesetParseError {
+    pub(super) fn new(kind: FilesetParseErrorKind, span: pest::Span<'_>) -> Self {
+        let message = kind.to_string();
+        let pest_error = Box::new(pest::error::Error::new_from_span(
+            pest::error::ErrorVariant::CustomError { message },
+            span,
+        ));
+        FilesetParseError {
+            kind,
+            pest_error,
+            source: None,
+        }
+    }
+
+    pub(super) fn with_source(
+        mut self,
+        source: impl Into<Box<dyn error::Error + Send + Sync>>,
+    ) -> Self {
+        self.source = Some(source.into());
+        self
+    }
+
+    /// Unexpected number of arguments, or invalid combination of arguments.
+    pub(super) fn invalid_arguments(
+        function: &FunctionCallNode,
+        message: impl Into<String>,
+    ) -> Self {
+        FilesetParseError::new(
+            FilesetParseErrorKind::InvalidArguments {
+                name: function.name.to_owned(),
+                message: message.into(),
+            },
+            function.args_span,
+        )
+    }
+
+    /// Some other expression error.
+    pub(super) fn expression(message: impl Into<String>, span: pest::Span<'_>) -> Self {
+        FilesetParseError::new(FilesetParseErrorKind::Expression(message.into()), span)
+    }
+
+    /// Category of the underlying error.
+    pub fn kind(&self) -> &FilesetParseErrorKind {
+        &self.kind
+    }
+}
+
+impl From<pest::error::Error<Rule>> for FilesetParseError {
+    fn from(err: pest::error::Error<Rule>) -> Self {
+        FilesetParseError {
+            kind: FilesetParseErrorKind::SyntaxError,
+            pest_error: Box::new(rename_rules_in_pest_error(err)),
+            source: None,
+        }
+    }
+}
+
+fn rename_rules_in_pest_error(err: pest::error::Error<Rule>) -> pest::error::Error<Rule> {
+    err.renamed_rules(|rule| {
+        rule.to_symbol()
+            .map(|sym| format!("`{sym}`"))
+            .unwrap_or_else(|| format!("<{rule:?}>"))
+    })
+}
+
+/// Parsed node without name resolution.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct ExpressionNode<'i> {
+    pub kind: ExpressionKind<'i>,
+    pub span: pest::Span<'i>,
+}
+
+impl<'i> ExpressionNode<'i> {
+    fn new(kind: ExpressionKind<'i>, span: pest::Span<'i>) -> Self {
+        ExpressionNode { kind, span }
+    }
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ExpressionKind<'i> {
+    Identifier(&'i str),
+    String(String),
+    StringPattern { kind: &'i str, value: String },
+    Unary(UnaryOp, Box<ExpressionNode<'i>>),
+    Binary(BinaryOp, Box<ExpressionNode<'i>>, Box<ExpressionNode<'i>>),
+    FunctionCall(FunctionCallNode<'i>),
+}
+
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum UnaryOp {
+    /// `~`
+    Negate,
+}
+
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum BinaryOp {
+    /// `|`
+    Union,
+    /// `&`
+    Intersection,
+    /// `~`
+    Difference,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct FunctionCallNode<'i> {
+    pub name: &'i str,
+    pub name_span: pest::Span<'i>,
+    pub args: Vec<ExpressionNode<'i>>,
+    pub args_span: pest::Span<'i>,
+}
+
+fn parse_function_call_node(pair: Pair<Rule>) -> FilesetParseResult<FunctionCallNode> {
+    assert_eq!(pair.as_rule(), Rule::function);
+    let (name_pair, args_pair) = pair.into_inner().collect_tuple().unwrap();
+    assert_eq!(name_pair.as_rule(), Rule::function_name);
+    assert_eq!(args_pair.as_rule(), Rule::function_arguments);
+    let name_span = name_pair.as_span();
+    let args_span = args_pair.as_span();
+    let name = name_pair.as_str();
+    let args = args_pair
+        .into_inner()
+        .map(parse_expression_node)
+        .try_collect()?;
+    Ok(FunctionCallNode {
+        name,
+        name_span,
+        args,
+        args_span,
+    })
+}
+
+fn parse_primary_node(pair: Pair<Rule>) -> FilesetParseResult<ExpressionNode> {
+    assert_eq!(pair.as_rule(), Rule::primary);
+    let first = pair.into_inner().next().unwrap();
+    let span = first.as_span();
+    let expr = match first.as_rule() {
+        Rule::expression => return parse_expression_node(first),
+        Rule::function => {
+            let function = parse_function_call_node(first)?;
+            ExpressionKind::FunctionCall(function)
+        }
+        Rule::string_pattern => {
+            let (lhs, op, rhs) = first.into_inner().collect_tuple().unwrap();
+            assert_eq!(lhs.as_rule(), Rule::identifier);
+            assert_eq!(op.as_rule(), Rule::pattern_kind_op);
+            let kind = lhs.as_str();
+            let value = match rhs.as_rule() {
+                Rule::identifier => rhs.as_str().to_owned(),
+                Rule::string_literal => STRING_LITERAL_PARSER.parse(rhs.into_inner()),
+                r => panic!("unexpected string pattern rule: {r:?}"),
+            };
+            ExpressionKind::StringPattern { kind, value }
+        }
+        Rule::identifier => ExpressionKind::Identifier(first.as_str()),
+        Rule::string_literal => {
+            let text = STRING_LITERAL_PARSER.parse(first.into_inner());
+            ExpressionKind::String(text)
+        }
+        r => panic!("unexpected primary rule: {r:?}"),
+    };
+    Ok(ExpressionNode::new(expr, span))
+}
+
+fn parse_expression_node(pair: Pair<Rule>) -> FilesetParseResult<ExpressionNode> {
+    assert_eq!(pair.as_rule(), Rule::expression);
+    static PRATT: Lazy<PrattParser<Rule>> = Lazy::new(|| {
+        PrattParser::new()
+            .op(Op::infix(Rule::union_op, Assoc::Left))
+            .op(Op::infix(Rule::intersection_op, Assoc::Left)
+                | Op::infix(Rule::difference_op, Assoc::Left))
+            .op(Op::prefix(Rule::negate_op))
+    });
+    PRATT
+        .map_primary(parse_primary_node)
+        .map_prefix(|op, rhs| {
+            let op_kind = match op.as_rule() {
+                Rule::negate_op => UnaryOp::Negate,
+                r => panic!("unexpected prefix operator rule {r:?}"),
+            };
+            let rhs = Box::new(rhs?);
+            let span = op.as_span().start_pos().span(&rhs.span.end_pos());
+            let expr = ExpressionKind::Unary(op_kind, rhs);
+            Ok(ExpressionNode::new(expr, span))
+        })
+        .map_infix(|lhs, op, rhs| {
+            let op_kind = match op.as_rule() {
+                Rule::union_op => BinaryOp::Union,
+                Rule::intersection_op => BinaryOp::Intersection,
+                Rule::difference_op => BinaryOp::Difference,
+                r => panic!("unexpected infix operator rule {r:?}"),
+            };
+            let lhs = Box::new(lhs?);
+            let rhs = Box::new(rhs?);
+            let span = lhs.span.start_pos().span(&rhs.span.end_pos());
+            let expr = ExpressionKind::Binary(op_kind, lhs, rhs);
+            Ok(ExpressionNode::new(expr, span))
+        })
+        .parse(pair.into_inner())
+}
+
+/// Parses text into expression tree. No name resolution is made at this stage.
+pub fn parse_program(text: &str) -> FilesetParseResult<ExpressionNode> {
+    let mut pairs = FilesetParser::parse(Rule::program, text)?;
+    let first = pairs.next().unwrap();
+    parse_expression_node(first)
+}
+
+#[cfg(test)]
+mod tests {
+    use assert_matches::assert_matches;
+
+    use super::*;
+
+    fn parse_into_kind(text: &str) -> Result<ExpressionKind, FilesetParseErrorKind> {
+        parse_program(text)
+            .map(|node| node.kind)
+            .map_err(|err| err.kind)
+    }
+
+    fn parse_normalized(text: &str) -> FilesetParseResult<ExpressionNode> {
+        parse_program(text).map(normalize_tree)
+    }
+
+    /// Drops auxiliary data from parsed tree so it can be compared with other.
+    fn normalize_tree(node: ExpressionNode) -> ExpressionNode {
+        fn empty_span() -> pest::Span<'static> {
+            pest::Span::new("", 0, 0).unwrap()
+        }
+
+        fn normalize_list(nodes: Vec<ExpressionNode>) -> Vec<ExpressionNode> {
+            nodes.into_iter().map(normalize_tree).collect()
+        }
+
+        fn normalize_function_call(function: FunctionCallNode) -> FunctionCallNode {
+            FunctionCallNode {
+                name: function.name,
+                name_span: empty_span(),
+                args: normalize_list(function.args),
+                args_span: empty_span(),
+            }
+        }
+
+        let normalized_kind = match node.kind {
+            ExpressionKind::Identifier(_)
+            | ExpressionKind::String(_)
+            | ExpressionKind::StringPattern { .. } => node.kind,
+            ExpressionKind::Unary(op, arg) => {
+                let arg = Box::new(normalize_tree(*arg));
+                ExpressionKind::Unary(op, arg)
+            }
+            ExpressionKind::Binary(op, lhs, rhs) => {
+                let lhs = Box::new(normalize_tree(*lhs));
+                let rhs = Box::new(normalize_tree(*rhs));
+                ExpressionKind::Binary(op, lhs, rhs)
+            }
+            ExpressionKind::FunctionCall(function) => {
+                ExpressionKind::FunctionCall(normalize_function_call(function))
+            }
+        };
+        ExpressionNode {
+            kind: normalized_kind,
+            span: empty_span(),
+        }
+    }
+
+    #[test]
+    fn test_parse_tree_eq() {
+        assert_eq!(
+            parse_normalized(r#" foo( x ) | ~bar:"baz" "#).unwrap(),
+            parse_normalized(r#"(foo(x))|(~(bar:"baz"))"#).unwrap()
+        );
+        assert_ne!(
+            parse_normalized(r#" foo "#).unwrap(),
+            parse_normalized(r#" "foo" "#).unwrap()
+        );
+    }
+
+    #[test]
+    fn test_parse_whitespace() {
+        let ascii_whitespaces: String = ('\x00'..='\x7f')
+            .filter(char::is_ascii_whitespace)
+            .collect();
+        assert_eq!(
+            parse_normalized(&format!("{ascii_whitespaces}f()")).unwrap(),
+            parse_normalized("f()").unwrap()
+        );
+    }
+
+    #[test]
+    fn test_parse_identifier() {
+        assert_eq!(
+            parse_into_kind("dir/foo-bar_0.baz"),
+            Ok(ExpressionKind::Identifier("dir/foo-bar_0.baz"))
+        );
+    }
+
+    #[test]
+    fn test_parse_string_literal() {
+        // "\<char>" escapes
+        assert_eq!(
+            parse_into_kind(r#" "\t\r\n\"\\\0" "#),
+            Ok(ExpressionKind::String("\t\r\n\"\\\0".to_owned()))
+        );
+
+        // Invalid "\<char>" escape
+        assert_eq!(
+            parse_into_kind(r#" "\y" "#),
+            Err(FilesetParseErrorKind::SyntaxError)
+        );
+    }
+
+    #[test]
+    fn test_parse_string_pattern() {
+        assert_eq!(
+            parse_into_kind(r#" foo:bar "#),
+            Ok(ExpressionKind::StringPattern {
+                kind: "foo",
+                value: "bar".to_owned()
+            })
+        );
+        assert_eq!(
+            parse_into_kind(r#" foo:"bar" "#),
+            Ok(ExpressionKind::StringPattern {
+                kind: "foo",
+                value: "bar".to_owned()
+            })
+        );
+        assert_eq!(
+            parse_into_kind(r#" foo:"" "#),
+            Ok(ExpressionKind::StringPattern {
+                kind: "foo",
+                value: "".to_owned()
+            })
+        );
+        assert_eq!(
+            parse_into_kind(r#" foo: "#),
+            Err(FilesetParseErrorKind::SyntaxError)
+        );
+        assert_eq!(
+            parse_into_kind(r#" foo: "" "#),
+            Err(FilesetParseErrorKind::SyntaxError)
+        );
+        assert_eq!(
+            parse_into_kind(r#" foo :"" "#),
+            Err(FilesetParseErrorKind::SyntaxError)
+        );
+    }
+
+    #[test]
+    fn test_parse_operator() {
+        assert_matches!(
+            parse_into_kind("~x"),
+            Ok(ExpressionKind::Unary(UnaryOp::Negate, _))
+        );
+        assert_matches!(
+            parse_into_kind("x|y"),
+            Ok(ExpressionKind::Binary(BinaryOp::Union, _, _))
+        );
+        assert_matches!(
+            parse_into_kind("x&y"),
+            Ok(ExpressionKind::Binary(BinaryOp::Intersection, _, _))
+        );
+        assert_matches!(
+            parse_into_kind("x~y"),
+            Ok(ExpressionKind::Binary(BinaryOp::Difference, _, _))
+        );
+
+        // Set operator associativity/precedence
+        assert_eq!(
+            parse_normalized("~x|y").unwrap(),
+            parse_normalized("(~x)|y").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x&~y").unwrap(),
+            parse_normalized("x&(~y)").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x~~y").unwrap(),
+            parse_normalized("x~(~y)").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x~~~y").unwrap(),
+            parse_normalized("x~(~(~y))").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x|y|z").unwrap(),
+            parse_normalized("(x|y)|z").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x&y|z").unwrap(),
+            parse_normalized("(x&y)|z").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x|y&z").unwrap(),
+            parse_normalized("x|(y&z)").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x|y~z").unwrap(),
+            parse_normalized("x|(y~z)").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("~x:y").unwrap(),
+            parse_normalized("~(x:y)").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("x|y:z").unwrap(),
+            parse_normalized("x|(y:z)").unwrap()
+        );
+
+        // Expression span
+        assert_eq!(parse_program(" ~ x ").unwrap().span.as_str(), "~ x");
+        assert_eq!(parse_program(" x |y ").unwrap().span.as_str(), "x |y");
+    }
+
+    #[test]
+    fn test_parse_function_call() {
+        assert_matches!(
+            parse_into_kind("foo()"),
+            Ok(ExpressionKind::FunctionCall(_))
+        );
+
+        // Trailing comma isn't allowed for empty argument
+        assert!(parse_normalized("foo(,)").is_err());
+
+        // Trailing comma is allowed for the last argument
+        assert_eq!(
+            parse_normalized("foo(a,)").unwrap(),
+            parse_normalized("foo(a)").unwrap()
+        );
+        assert_eq!(
+            parse_normalized("foo(a , )").unwrap(),
+            parse_normalized("foo(a)").unwrap()
+        );
+        assert!(parse_normalized("foo(,a)").is_err());
+        assert!(parse_normalized("foo(a,,)").is_err());
+        assert!(parse_normalized("foo(a , , )").is_err());
+        assert_eq!(
+            parse_normalized("foo(a,b,)").unwrap(),
+            parse_normalized("foo(a,b)").unwrap()
+        );
+        assert!(parse_normalized("foo(a,,b)").is_err());
+    }
+
+    #[test]
+    fn test_parse_error() {
+        insta::assert_snapshot!(parse_program("foo|").unwrap_err().to_string(), @r###"
+         --> 1:5
+          |
+        1 | foo|
+          |     ^---
+          |
+          = expected `~` or <primary>
+        "###);
+    }
+}
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index a9d2396cf..6db22c9f6 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -41,6 +41,7 @@ pub mod extensions_map;
 pub mod file_util;
 pub mod files;
 pub mod fileset;
+mod fileset_parser;
 pub mod fmt_util;
 pub mod fsmonitor;
 pub mod git;