[MLIR][Pygments] Refine the pygments MLIR lexer (#166406)

Recently, the MLIR website added API documentation for the Python
bindings generated via Sphinx
([https://mlir.llvm.org/python-bindings/](https://mlir.llvm.org/python-bindings/)).
In
[https://github.com/llvm/mlir-www/pull/245](https://github.com/llvm/mlir-www/pull/245),
I introduced the Pygments lexer from the MLIR repository to enable
syntax highlighting for MLIR code blocks in these API docs.

However, since the existing Pygments lexer was fairly minimal, it didn’t
fully handle all aspects of the MLIR syntax, leading to imperfect
highlighting in some cases. In this PR, I used ChatGPT to rewrite the
lexer by combining it with the TextMate grammar for MLIR
([https://github.com/llvm/llvm-project/blob/main/mlir/utils/textmate/mlir.json](https://github.com/llvm/llvm-project/blob/main/mlir/utils/textmate/mlir.json)).
After some manual adjustments, the results look good—so I’m submitting
this to improve the syntax highlighting experience in the Python
bindings API documentation.
This commit is contained in:
Twice
2025-11-07 21:12:28 +08:00
committed by GitHub
parent cdc3cb2054
commit 7ac6a95a11

View File

@@ -2,37 +2,132 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from pygments.lexer import RegexLexer
from pygments.lexer import RegexLexer, bygroups, include, using
from pygments.token import *
import re
class MlirLexer(RegexLexer):
"""Pygments lexer for MLIR.
This lexer focuses on accurate tokenization of common MLIR constructs:
- SSA values (%%... / %...)
- attribute and type aliases (#name =, !name =)
- types (builtin and dialect types, parametric types)
- attribute dictionaries and nested containers to a reasonable depth
- numbers (ints, floats with exponents, hex)
- strings with common escapes
- line comments (// ...)
- block labels (^foo) and operations
"""
name = "MLIR"
aliases = ["mlir"]
filenames = ["*.mlir"]
flags = re.MULTILINE
class VariableList(RegexLexer):
"""Lexer for lists of SSA variables separated by commas."""
tokens = {
"root": [
(r"\s+", Text),
(r",", Punctuation),
(r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
]
}
tokens = {
"root": [
(r"%[a-zA-Z0-9_]+", Name.Variable),
(r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
(r"\^[a-zA-Z0-9_]+", Name.Label),
(r"#[a-zA-Z0-9_]+", Name.Constant),
(r"![a-zA-Z0-9_]+", Keyword.Type),
(r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
(r"memref[^.]", Keyword.Type),
(r"index", Keyword.Type),
(r"i[0-9]+", Keyword.Type),
(r"f[0-9]+", Keyword.Type),
# Comments
(r"//.*?$", Comment.Single),
# operation name with assignment: %... = op.name
(
r"^(\s*)(%[\%_A-Za-z0-9\:#\,\s]+)(=)(\s*)([A-Za-z0-9_\.\$\-]+)\b",
bygroups(Text, using(VariableList), Operator, Text, Name.Builtin),
),
# operation name without result
(r"^(\s*)([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", bygroups(Text, Name.Builtin)),
# Attribute alias definition: #name =
(
r"^(\s*)(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
bygroups(Text, Name.Constant, Text, Operator),
),
# Type alias definition: !name =
(
r"^(\s*)(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
bygroups(Text, Keyword.Type, Text, Operator),
),
# SSA values (uses)
(r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
# attribute refs, constants and named attributes
(r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
# symbol refs / function-like names
(r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
# blocks
(r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
# types by exclamation or builtin names
(r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
# NOTE: please sync changes to corresponding builtin type rule in "angled-type"
(r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
# container-like dialect types (tensor<...>, memref<...>, vector<...>)
(
r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
bygroups(Keyword.Type, Punctuation),
"angled-type",
),
# affine constructs
(r"\b(affine_map|affine_set)\b", Keyword.Reserved),
# common builtin operators / functions inside affine_map
(r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Other),
# identifiers / bare words
(r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
# numbers: hex, float (with exponent), integer
(r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
(r"\b([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][+-]?[0-9]+)?\b", Number.Float),
(r"\b[0-9]+\b", Number.Integer),
# strings
(r'"', String.Double, "string"),
# punctuation and arrow-like tokens
(r"->|>=|<=|\>=|\<=|\->|\=>", Operator),
(r"[()\[\]{}<>,.:=]", Punctuation),
# operators
(r"[-+*/%]", Operator),
],
# string state with common escapes
"string": [
(r'\\[ntr"\\]', String.Escape),
(r'[^"\\]+', String.Double),
(r'"', String.Double, "#pop"),
],
# angled-type content
"angled-type": [
# match nested '<' and '>'
(r"<", Punctuation, "#push"),
(r">", Punctuation, "#pop"),
# dimensions like 3x or 3x3x... and standalone numbers:
# - match numbers that are followed by an 'x' (dimension separator)
(r"([0-9]+)(?=(?:x))", Number.Integer),
# - match bare numbers (sizes)
(r"[0-9]+", Number.Integer),
(r"[0-9]*\.[0-9]*", Number.Float),
(r'"[^"]*"', String.Double),
(r"affine_map", Keyword.Reserved),
# TODO: this should be within affine maps only
(r"\+-\*\/", Operator),
(r"floordiv", Operator.Word),
(r"ceildiv", Operator.Word),
(r"mod", Operator.Word),
(r"()\[\]<>,{}", Punctuation),
(r"\/\/.*\n", Comment.Single),
]
# dynamic dimension '?'
(r"\?", Name.Integer),
# the 'x' dimension separator (treat as punctuation)
(r"x", Punctuation),
# element / builtin types inside angle brackets (no word-boundary)
# NOTE: please sync changes to corresponding builtin type rule in "root"
(
r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
Keyword.Type,
),
# also allow nested container-like types to be recognized
(
r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
bygroups(Keyword.Type, Punctuation),
"angled-type",
),
# fall back to root rules for anything else
include("root"),
],
}