update treesitter parsing to accommodate collapsed nested functions

Co-authored-by: maxbrunsfeld <max@zed.dev>
This commit is contained in:
KCaverly 2023-07-19 15:47:05 -04:00
parent 0e071919a0
commit 9809ec3d70
9 changed files with 773 additions and 584 deletions

3
Cargo.lock generated
View file

@ -6486,6 +6486,7 @@ dependencies = [
"parking_lot 0.11.2",
"picker",
"postage",
"pretty_assertions",
"project",
"rand 0.8.5",
"rpc",
@ -7991,7 +7992,7 @@ dependencies = [
[[package]]
name = "tree-sitter"
version = "0.20.10"
source = "git+https://github.com/tree-sitter/tree-sitter?rev=49226023693107fba9a1191136a4f47f38cdca73#49226023693107fba9a1191136a4f47f38cdca73"
source = "git+https://github.com/tree-sitter/tree-sitter?rev=1c65ca24bc9a734ab70115188f465e12eecf224e#1c65ca24bc9a734ab70115188f465e12eecf224e"
dependencies = [
"cc",
"regex",

View file

@ -130,7 +130,7 @@ tree-sitter-yaml = { git = "https://github.com/zed-industries/tree-sitter-yaml",
tree-sitter-lua = "0.0.14"
[patch.crates-io]
tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "49226023693107fba9a1191136a4f47f38cdca73" }
tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "1c65ca24bc9a734ab70115188f465e12eecf224e" }
async-task = { git = "https://github.com/zed-industries/async-task", rev = "341b57d6de98cdfd7b418567b8de2022ca993a6e" }
# TODO - Remove when a version is released with this PR: https://github.com/servo/core-foundation-rs/pull/457

View file

@ -339,6 +339,8 @@ pub struct LanguageConfig {
#[serde(default)]
pub line_comment: Option<Arc<str>>,
#[serde(default)]
pub collapsed_placeholder: String,
#[serde(default)]
pub block_comment: Option<(Arc<str>, Arc<str>)>,
#[serde(default)]
pub overrides: HashMap<String, LanguageConfigOverride>,
@ -408,6 +410,7 @@ impl Default for LanguageConfig {
line_comment: Default::default(),
block_comment: Default::default(),
overrides: Default::default(),
collapsed_placeholder: Default::default(),
}
}
}
@ -525,6 +528,8 @@ pub struct EmbeddingConfig {
pub item_capture_ix: u32,
pub name_capture_ix: u32,
pub context_capture_ix: Option<u32>,
pub collapse_capture_ix: Option<u32>,
pub keep_capture_ix: Option<u32>,
}
struct InjectionConfig {
@ -1246,12 +1251,16 @@ impl Language {
let mut item_capture_ix = None;
let mut name_capture_ix = None;
let mut context_capture_ix = None;
let mut collapse_capture_ix = None;
let mut keep_capture_ix = None;
get_capture_indices(
&query,
&mut [
("item", &mut item_capture_ix),
("name", &mut name_capture_ix),
("context", &mut context_capture_ix),
("keep", &mut keep_capture_ix),
("collapse", &mut collapse_capture_ix),
],
);
if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) {
@ -1260,6 +1269,8 @@ impl Language {
item_capture_ix,
name_capture_ix,
context_capture_ix,
collapse_capture_ix,
keep_capture_ix,
});
}
Ok(self)
@ -1544,9 +1555,20 @@ impl Language {
/// Borrows this language's grammar, or returns `None` when the language has none.
pub fn grammar(&self) -> Option<&Arc<Grammar>> {
self.grammar.as_ref()
}
/// Builds a `LanguageScope` for this language with no override selected
/// (`override_id` is `None`). The scope holds its own `Arc` handle to the language.
pub fn default_scope(self: &Arc<Self>) -> LanguageScope {
    let language = Arc::clone(self);
    LanguageScope {
        language,
        override_id: None,
    }
}
}
impl LanguageScope {
/// Returns the `collapsed_placeholder` string from the language's config.
///
/// The value is read straight from the language config; any config override
/// active for this scope is not consulted.
pub fn collapsed_placeholder(&self) -> &str {
    self.language.config.collapsed_placeholder.as_str()
}
pub fn line_comment_prefix(&self) -> Option<&Arc<str>> {
Override::as_option(
self.config_override().map(|o| &o.line_comment),

View file

@ -46,6 +46,7 @@ rpc = { path = "../rpc", features = ["test-support"] }
workspace = { path = "../workspace", features = ["test-support"] }
settings = { path = "../settings", features = ["test-support"]}
pretty_assertions.workspace = true
rand.workspace = true
unindent.workspace = true
tempdir.workspace = true

View file

@ -1,6 +1,6 @@
use anyhow::{anyhow, Ok, Result};
use language::Language;
use std::{ops::Range, path::Path, sync::Arc};
use language::{Grammar, Language};
use std::{cmp, collections::HashSet, ops::Range, path::Path, sync::Arc};
use tree_sitter::{Parser, QueryCursor};
#[derive(Debug, PartialEq, Clone)]
@ -22,6 +22,20 @@ pub struct CodeContextRetriever {
pub cursor: QueryCursor,
}
// One embedding-query match, reduced to the byte ranges of its captures.
//
// Every match has an item; this is the fundamental treesitter symbol and anchors the search.
// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication.
// If there are preceding comments, we track them with a context capture.
// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture.
// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture.
#[derive(Debug, Clone)]
pub struct CodeContextMatch {
/// Column of the item node's start position (0 if no item capture set it).
/// Passed along when the item text is emitted so that up to this many
/// leading spaces can be stripped from each line.
pub start_col: usize,
/// Byte range of the `item` capture within the file content.
pub item_range: Range<usize>,
/// Byte range of the `name` capture. When a match carries several name
/// captures, the last one seen is the one stored here.
pub name_range: Range<usize>,
/// Byte ranges of all `context` captures, in capture order.
pub context_ranges: Vec<Range<usize>>,
/// Byte ranges of the `collapse` captures with the `keep` capture ranges
/// subtracted out.
pub collapse_ranges: Vec<Range<usize>>,
}
impl CodeContextRetriever {
pub fn new() -> Self {
Self {
@ -49,6 +63,82 @@ impl CodeContextRetriever {
}])
}
/// Parses `content` with the grammar's tree-sitter language, runs the
/// grammar's embedding query over the tree, and returns one
/// `CodeContextMatch` for every query match that captured both an item and
/// a name.
///
/// Returns an error when the grammar has no embedding config or when the
/// parser produces no tree. Panics if the parser rejects the grammar's
/// language (the `set_language` result is unwrapped).
fn get_matches_in_file(
    &mut self,
    content: &str,
    grammar: &Arc<Grammar>,
) -> Result<Vec<CodeContextMatch>> {
    let config = grammar
        .embedding_config
        .as_ref()
        .ok_or_else(|| anyhow!("no embedding queries"))?;

    self.parser.set_language(grammar.ts_language).unwrap();

    let tree = self
        .parser
        .parse(&content, None)
        .ok_or_else(|| anyhow!("parsing failed"))?;

    let mut found: Vec<CodeContextMatch> = Vec::new();

    // Scratch buffers shared by all matches; emptied at the top of each iteration.
    let mut collapse_ranges: Vec<Range<usize>> = Vec::new();
    let mut keep_ranges: Vec<Range<usize>> = Vec::new();

    for query_match in
        self.cursor
            .matches(&config.query, tree.root_node(), content.as_bytes())
    {
        let mut start_col = 0;
        let mut item_range: Option<Range<usize>> = None;
        let mut name_range: Option<Range<usize>> = None;
        let mut context_ranges: Vec<Range<usize>> = Vec::new();
        collapse_ranges.clear();
        keep_ranges.clear();

        // Sort each capture into its bucket according to the capture index.
        for capture in query_match.captures {
            let index = capture.index;
            let byte_range = capture.node.byte_range();

            if index == config.item_capture_ix {
                start_col = capture.node.start_position().column;
                item_range = Some(byte_range);
            } else if index == config.name_capture_ix {
                name_range = Some(byte_range);
            } else if config.context_capture_ix == Some(index) {
                context_ranges.push(byte_range);
            } else if config.collapse_capture_ix == Some(index) {
                collapse_ranges.push(byte_range);
            } else if config.keep_capture_ix == Some(index) {
                keep_ranges.push(byte_range);
            }
        }

        // A match is only usable when it has both an item and a name.
        if let Some((item_range, name_range)) = item_range.zip(name_range) {
            found.push(CodeContextMatch {
                start_col,
                item_range,
                name_range,
                context_ranges,
                // Kept pieces are carved out of the collapsed regions.
                collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges),
            });
        }
    }

    Ok(found)
}
/// Parses `content` with `parse_file` and then wraps every resulting
/// document's content in `CODE_CONTEXT_TEMPLATE`, substituting the
/// `<path>`, `<language>` and `<item>` markers.
///
/// `relative_path` is rendered lossily (invalid UTF-8 is replaced) rather
/// than failing. Any error from `parse_file` is returned unchanged.
pub fn parse_file_with_template(
    &mut self,
    relative_path: &Path,
    content: &str,
    language: Arc<Language>,
) -> Result<Vec<Document>> {
    // Grab the name before `language` is moved into `parse_file`.
    let language_name = language.name();
    let path = relative_path.to_string_lossy();

    let mut documents = self.parse_file(relative_path, content, language)?;
    for document in &mut documents {
        document.content = CODE_CONTEXT_TEMPLATE
            .replace("<path>", path.as_ref())
            .replace("<language>", language_name.as_ref())
            // Replace the whole `<item>` marker. Matching the bare word
            // "item" would leave the angle brackets behind and would also
            // rewrite any "item" substring already substituted in from the
            // path or the language name.
            .replace("<item>", &document.content);
    }
    Ok(documents)
}
pub fn parse_file(
&mut self,
relative_path: &Path,
@ -62,78 +152,131 @@ impl CodeContextRetriever {
let grammar = language
.grammar()
.ok_or_else(|| anyhow!("no grammar for language"))?;
let embedding_config = grammar
.embedding_config
.as_ref()
.ok_or_else(|| anyhow!("no embedding queries"))?;
self.parser.set_language(grammar.ts_language).unwrap();
let tree = self
.parser
.parse(&content, None)
.ok_or_else(|| anyhow!("parsing failed"))?;
let mut documents = Vec::new();
// Iterate through query matches
let mut name_ranges: Vec<Range<usize>> = vec![];
for mat in self.cursor.matches(
&embedding_config.query,
tree.root_node(),
content.as_bytes(),
) {
let mut name: Vec<&str> = vec![];
let mut item: Option<&str> = None;
let mut byte_range: Option<Range<usize>> = None;
let mut context_spans: Vec<&str> = vec![];
for capture in mat.captures {
if capture.index == embedding_config.item_capture_ix {
byte_range = Some(capture.node.byte_range());
item = content.get(capture.node.byte_range());
} else if capture.index == embedding_config.name_capture_ix {
let name_range = capture.node.byte_range();
if name_ranges.contains(&name_range) {
continue;
}
name_ranges.push(name_range.clone());
if let Some(name_content) = content.get(name_range.clone()) {
name.push(name_content);
}
}
let matches = self.get_matches_in_file(content, grammar)?;
if let Some(context_capture_ix) = embedding_config.context_capture_ix {
if capture.index == context_capture_ix {
if let Some(context) = content.get(capture.node.byte_range()) {
context_spans.push(context);
}
}
let language_scope = language.default_scope();
let placeholder = language_scope.collapsed_placeholder();
let mut documents = Vec::new();
let mut collapsed_ranges_within = Vec::new();
let mut parsed_name_ranges = HashSet::new();
for (i, context_match) in matches.iter().enumerate() {
if parsed_name_ranges.contains(&context_match.name_range) {
continue;
}
collapsed_ranges_within.clear();
for remaining_match in &matches[(i + 1)..] {
if context_match
.item_range
.contains(&remaining_match.item_range.start)
&& context_match
.item_range
.contains(&remaining_match.item_range.end)
{
collapsed_ranges_within.extend(remaining_match.collapse_ranges.iter().cloned());
} else {
break;
}
}
if let Some((item, byte_range)) = item.zip(byte_range) {
if !name.is_empty() {
let item = if context_spans.is_empty() {
item.to_string()
} else {
format!("{}\n{}", context_spans.join("\n"), item)
};
let mut document_content = String::new();
for context_range in &context_match.context_ranges {
document_content.push_str(&content[context_range.clone()]);
document_content.push_str("\n");
}
let document_text = CODE_CONTEXT_TEMPLATE
.replace("<path>", relative_path.to_str().unwrap())
.replace("<language>", &language.name().to_lowercase())
.replace("<item>", item.as_str());
documents.push(Document {
range: byte_range,
content: document_text,
embedding: Vec::new(),
name: name.join(" ").to_string(),
});
let mut offset = context_match.item_range.start;
for collapsed_range in &collapsed_ranges_within {
if collapsed_range.start > offset {
add_content_from_range(
&mut document_content,
content,
offset..collapsed_range.start,
context_match.start_col,
);
}
document_content.push_str(placeholder);
offset = collapsed_range.end;
}
if offset < context_match.item_range.end {
add_content_from_range(
&mut document_content,
content,
offset..context_match.item_range.end,
context_match.start_col,
);
}
if let Some(name) = content.get(context_match.name_range.clone()) {
parsed_name_ranges.insert(context_match.name_range.clone());
documents.push(Document {
name: name.to_string(),
content: document_content,
range: context_match.item_range.clone(),
embedding: vec![],
})
}
}
return Ok(documents);
}
}
/// Removes every part of `ranges` that is covered by `ranges_to_subtract`
/// and returns the remaining pieces in order.
///
/// Both slices are walked once, front to back, so each is expected to be
/// in ascending order. A range that is only partially covered is split
/// into its uncovered pieces; a fully covered range contributes nothing.
/// Subtract ranges that lie entirely before the current position (for
/// example in a gap between two input ranges) are skipped and have no
/// effect on the output.
pub(crate) fn subtract_ranges(
    ranges: &[Range<usize>],
    ranges_to_subtract: &[Range<usize>],
) -> Vec<Range<usize>> {
    let mut result = Vec::new();
    let mut ranges_to_subtract = ranges_to_subtract.iter().peekable();

    for range in ranges {
        let mut offset = range.start;
        while offset < range.end {
            // Discard subtract ranges that end at or before `offset`: they
            // cannot remove anything from here on, and using their `end`
            // below would move `offset` backwards and emit bytes that lie
            // outside `range`.
            while ranges_to_subtract
                .peek()
                .map_or(false, |stale| stale.end <= offset)
            {
                ranges_to_subtract.next();
            }

            match ranges_to_subtract.peek().copied() {
                Some(range_to_subtract) => {
                    if offset < range_to_subtract.start {
                        // Keep the uncovered stretch up to the subtract range
                        // (or to the end of `range`, whichever comes first).
                        let next_offset = cmp::min(range_to_subtract.start, range.end);
                        result.push(offset..next_offset);
                        offset = next_offset;
                    } else {
                        // Inside the subtract range: jump past the covered part.
                        offset = cmp::min(range_to_subtract.end, range.end);
                    }
                    // Only advance once the subtract range is fully consumed;
                    // it may still overlap the next input range.
                    if offset >= range_to_subtract.end {
                        ranges_to_subtract.next();
                    }
                }
                None => {
                    // Nothing left to subtract: the rest of `range` survives.
                    result.push(offset..range.end);
                    offset = range.end;
                }
            }
        }
    }

    result
}
/// Appends the text of `content[range]` to `output`, line by line, removing
/// up to `start_col` leading spaces from each line.
///
/// Lines are joined with `'\n'` and no newline is left after the last line.
/// If `range` is empty, out of bounds, or not on a character boundary,
/// nothing is appended and `output` is left exactly as it was.
fn add_content_from_range(
    output: &mut String,
    content: &str,
    range: Range<usize>,
    start_col: usize,
) {
    let mut appended_any = false;

    for line in content.get(range).unwrap_or("").lines() {
        // Count at most `start_col` leading spaces; spaces are single-byte,
        // so slicing at this count always lands on a character boundary.
        let indent = line
            .bytes()
            .take(start_col)
            .take_while(|byte| *byte == b' ')
            .count();
        output.push_str(&line[indent..]);
        output.push('\n');
        appended_any = true;
    }

    // Drop the newline added after the final line. Only do so when a line
    // was actually appended, otherwise this would delete a character that
    // was already in `output` before the call.
    if appended_any {
        output.pop();
    }
}

View file

@ -409,7 +409,11 @@ impl SemanticIndex {
) {
if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() {
if let Some(documents) = retriever
.parse_file(&pending_file.relative_path, &content, pending_file.language)
.parse_file_with_template(
&pending_file.relative_path,
&content,
pending_file.language,
)
.log_err()
{
log::trace!(
@ -657,6 +661,8 @@ impl SemanticIndex {
})
.await?;
dbg!(&documents);
let mut tasks = Vec::new();
let mut ranges = Vec::new();
let weak_project = project.downgrade();

File diff suppressed because it is too large Load diff

View file

@ -10,3 +10,4 @@ brackets = [
{ start = "\"", end = "\"", close = true, newline = false, not_in = ["string"] },
{ start = "/*", end = " */", close = true, newline = false, not_in = ["string", "comment"] },
]
collapsed_placeholder = " /* ... */ "

View file

@ -1,50 +1,28 @@
(
(line_comment)* @context
[(line_comment) (attribute_item)]* @context
.
(enum_item
name: (_) @name) @item
)
[
(struct_item
name: (_) @name)
(
(line_comment)* @context
.
(struct_item
name: (_) @name) @item
)
(enum_item
name: (_) @name)
(
(line_comment)* @context
.
(impl_item
trait: (_)? @name
"for"? @name
type: (_) @name) @item
)
(impl_item
trait: (_)? @name
"for"? @name
type: (_) @name)
(
(line_comment)* @context
.
(trait_item
name: (_) @name) @item
)
(trait_item
name: (_) @name)
(
(line_comment)* @context
.
(function_item
name: (_) @name) @item
)
(function_item
name: (_) @name
body: (block
"{" @keep
"}" @keep) @collapse)
(
(line_comment)* @context
.
(macro_definition
name: (_) @name) @item
)
(
(line_comment)* @context
.
(function_signature_item
name: (_) @name) @item
)
(macro_definition
name: (_) @name)
] @item
)