From 189cece03efadc6cbec87eac53a71b4553c7a092 Mon Sep 17 00:00:00 2001 From: Nate Butler Date: Mon, 22 Apr 2024 11:51:06 -0400 Subject: [PATCH] Add analyze highlights script (#10855) Adds a script to print all unique highlight keys for building syntax themes. Usage: - `python script/analyze_highlights.py` OR - `python script/analyze_highlights.py -v` - Using the `-v` or `--verbose` arg will print each language that uses each key. Example output: ``` @attribute (6) @boolean (5) @charset (1) @comment (19) @comment.doc (3) @comment.unused (2) @constant (27) @constant.builtin (15) @constant.character (1) @constructor (4) @embedded (10) @emphasis (1) @emphasis.strong (1) @escape (4) @function (44) @function.builtin (2) @function.definition (2) @function.method (22) @function.method.builtin (3) @function.special (4) @function.special.definition (1) @import (1) @keyframes (1) @keyword (32) @label (2) @link_text (1) @link_uri (1) @media (1) @module (1) @namespace (1) @number (16) @operator (24) @property (11) @property.json_key (1) @punctuation (1) @punctuation.bracket (28) @punctuation.delimiter (12) @punctuation.list_marker (1) @punctuation.special (17) @string (23) @string.doc (1) @string.escape (5) @string.regex (7) @string.special (4) @string.special.symbol (2) @supports (1) @tag (14) @text.literal (2) @title (1) @type (28) @type.builtin (4) @type.super (3) @variable (5) @variable.member (3) @variable.parameter (4) @variable.special (12) Extension-only: @tag.delimiter (1) ``` Verbose example output: ``` Shared: @attribute (6) - [css, heex, javascript, tsx] @boolean (5) - [javascript, proto, tsx, typescript, yaml] @charset (1) - [css] @comment (19) - [bash, c, cpp, css, elixir, erb, go, gomod, gowork, heex, javascript, json, proto, python, ruby, rust, tsx, typescript, yaml] @comment.doc (3) - [elixir] @comment.unused (2) - [elixir] @constant (27) - [bash, c, cpp, elixir, heex, javascript, json, proto, python, ruby, rust, tsx, typescript] @constant.builtin (15) - 
[elixir, go, javascript, python, ruby, tsx, typescript, yaml] @constant.character (1) - [regex] @constructor (4) - [tsx, typescript] @embedded (10) - [bash, elixir, javascript, python, ruby, tsx, typescript] @emphasis (1) - [markdown] @emphasis.strong (1) - [markdown] @escape (4) - [go, python, regex, ruby] @function (44) - [bash, c, cpp, css, elixir, go, heex, javascript, python, rust, tsx, typescript] @function.builtin (2) - [python] @function.definition (2) - [rust] @function.method (22) - [go, javascript, python, ruby, rust, tsx, typescript] @function.method.builtin (3) - [ruby] @function.special (4) - [c, cpp, rust] @function.special.definition (1) - [rust] @import (1) - [css] @keyframes (1) - [css] @keyword (32) - [bash, c, cpp, css, elixir, erb, go, gomod, gowork, heex, javascript, jsdoc, proto, python, ruby, rust, tsx, typescript] @label (2) - [c, cpp] @link_text (1) - [markdown] @link_uri (1) - [markdown] @media (1) - [css] @module (1) - [heex] @namespace (1) - [css] @number (16) - [bash, c, cpp, css, elixir, go, javascript, json, proto, python, regex, ruby, rust, tsx, typescript, yaml] @operator (24) - [bash, c, cpp, css, elixir, go, gomod, gowork, heex, javascript, proto, python, regex, ruby, tsx, typescript] @property (11) - [bash, c, cpp, css, javascript, python, regex, rust, tsx, typescript, yaml] @property.json_key (1) - [json] @punctuation (1) - [elixir] @punctuation.bracket (28) - [c, cpp, elixir, go, heex, javascript, json, proto, regex, ruby, rust, tsx, typescript, yaml] @punctuation.delimiter (12) - [c, cpp, css, elixir, heex, javascript, proto, regex, ruby, tsx, typescript, yaml] @punctuation.list_marker (1) - [markdown] @punctuation.special (17) - [elixir, javascript, python, ruby, tsx, typescript, yaml] @string (23) - [bash, c, cpp, css, elixir, go, gomod, gowork, heex, javascript, json, proto, python, regex, ruby, rust, tsx, typescript, yaml] @string.doc (1) - [python] @string.escape (5) - [elixir, javascript, tsx, typescript, yaml] 
@string.regex (7) - [elixir, javascript, ruby, tsx, typescript] @string.special (4) - [css, elixir] @string.special.symbol (2) - [elixir, ruby] @supports (1) - [css] @tag (14) - [css, heex, javascript, tsx] @text.literal (2) - [markdown] @title (1) - [markdown] @type (28) - [c, cpp, css, elixir, go, javascript, jsdoc, proto, python, ruby, rust, tsx, typescript, yaml] @type.builtin (4) - [javascript, rust, tsx, typescript] @type.super (3) - [ruby] @variable (5) - [c, cpp, javascript, tsx, typescript] @variable.member (3) - [go, ruby] @variable.parameter (4) - [ruby] @variable.special (12) - [cpp, css, javascript, ruby, rust, tsx, typescript] Extension-only: @tag.delimiter (1) - [astro] ``` Release Notes: - N/A --------- Co-authored-by: Joseph T. Lyons --- script/analyze_highlights.py | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 script/analyze_highlights.py diff --git a/script/analyze_highlights.py b/script/analyze_highlights.py new file mode 100644 index 0000000000..968264a7c7 --- /dev/null +++ b/script/analyze_highlights.py @@ -0,0 +1,68 @@ +""" +This script analyzes all the highlight.scm files in our embedded languages and extensions. +It counts the number of unique instances of @{name} and the languages in which they are used. + +This is useful to help avoid accidentally introducing new tags when appropriate ones already exist when adding new languages. + +Flags: +-v, --verbose: Include a detailed list of languages for each tag found in the highlight.scm files. 
"""
Analyze the highlights.scm files of the embedded languages and of the astro extension.

For every `@{name}` capture found, the script counts how many times it occurs
and records the languages (the name of the directory holding each file) that
use it. This is useful to help avoid accidentally introducing new tags when
appropriate ones already exist when adding new languages.

Flags:
-v, --verbose: Include a detailed list of languages for each tag found in the highlights.scm files.
"""

from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Iterable, Iterator, Optional, Sequence
import argparse
import re

# Matches "@name" captures. The negative lookahead skips names that start with
# an underscore (e.g. "@_private").
# NOTE(review): the character class contains no digits or hyphens, so a capture
# such as "@heading.1" would be recorded as "@heading." - confirm that no
# query file relies on such names before widening the pattern.
pattern = re.compile(r'@(?!_)[a-zA-Z_.]+')


def parse_arguments(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line flags.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Returns:
        A namespace with a boolean ``verbose`` attribute.
    """
    parser = argparse.ArgumentParser(
        description='Analyze highlights.scm files for unique instances and their languages.'
    )
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='Include a list of languages for each tag.',
    )
    return parser.parse_args(argv)


def find_highlight_files(root_dir) -> Iterator[Path]:
    """Yield every file named ``highlights.scm`` found recursively under ``root_dir``."""
    yield from Path(root_dir).rglob('highlights.scm')


def count_instances(files: Iterable[Path]) -> DefaultDict[str, Dict[str, Any]]:
    """Count capture names across the given query files.

    Args:
        files: ``Path`` objects of the files to scan. The name of each file's
            parent directory is used as its language name.

    Returns:
        A defaultdict keyed by capture name (including the leading ``@``).
        Each value is a dict with ``'count'`` (total number of matches over
        all files) and ``'languages'`` (set of language names using it).
    """
    instances: DefaultDict[str, Dict[str, Any]] = defaultdict(
        lambda: {'count': 0, 'languages': set()}
    )
    for file_path in files:
        language = file_path.parent.name
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        text = file_path.read_text(encoding='utf-8')
        for match in pattern.findall(text):
            entry = instances[match]
            entry['count'] += 1
            entry['languages'].add(language)
    return instances


def print_instances(instances, verbose=False):
    """Print one line per capture name, sorted by name.

    Args:
        instances: Mapping as returned by ``count_instances``.
        verbose: When true, append the sorted list of languages to each line.
    """
    for item, details in sorted(instances.items(), key=lambda pair: pair[0]):
        if verbose:
            languages = ', '.join(sorted(details['languages']))
            print(f"{item} ({details['count']}) - [{languages}]")
        else:
            print(f"{item} ({details['count']})")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Scan the core and extension query files and print the capture summary.

    Args:
        argv: Optional argument list forwarded to ``parse_arguments``.
    """
    args = parse_arguments(argv)

    # resolve() guards against a relative __file__, where taking .parent twice
    # on a bare filename would stay at "." instead of moving up one directory.
    base_dir = Path(__file__).resolve().parent.parent
    core_path = base_dir / 'crates/languages/src'
    # NOTE(review): only the astro extension is scanned here; confirm whether
    # the other extensions are meant to be included as well.
    extension_path = base_dir / 'extensions/astro/languages'

    core_instances = count_instances(find_highlight_files(core_path))
    extension_instances = count_instances(find_highlight_files(extension_path))

    # Keep only the captures that no core language already uses.
    unique_extension_instances = {
        k: v for k, v in extension_instances.items() if k not in core_instances
    }

    print('Shared:\n')
    print_instances(core_instances, args.verbose)

    if unique_extension_instances:
        print('\nExtension-only:\n')
        print_instances(unique_extension_instances, args.verbose)


if __name__ == '__main__':
    main()