[LIT] remove to_unicode, to_string, and to_bytes helpers (#165950)

These helpers, which handled the differences between Python 2.x and Python 3.x string types, are no longer required.

Co-authored-by: Alexander Richardson <mail@alexrichardson.me>
2026-01-14 03:50:17 +08:00 · 2025-12-02 05:06:17 +09:00
parent 33bcde0678
commit c8031c3dd7
7 changed files with 45 additions and 148 deletions
--- a/libcxx/test/selftest/dsl/lit.local.cfg
+++ b/libcxx/test/selftest/dsl/lit.local.cfg
@@ -10,6 +10,6 @@
 # within the test.
 import base64, lit.util, pickle

-base64Encode = lambda s: lit.util.to_string(base64.b64encode(lit.util.to_bytes(s)))
+base64Encode = lambda s: base64.b64encode(s).decode("utf-8")
 escapedSubstitutions = base64Encode(pickle.dumps(config.substitutions))
 config.substitutions.append(("%{substitutions}", escapedSubstitutions))
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -21,7 +21,6 @@ from lit.ShCommands import GlobItem, Command
 import lit.ShUtil as ShUtil
 import lit.Test as Test
 import lit.util
-from lit.util import to_bytes, to_string, to_unicode
 from lit.BooleanExpression import BooleanExpression


@@ -391,18 +390,14 @@ def executeBuiltinEcho(cmd, shenv):
    # Some tests have un-redirected echo commands to help debug test failures.
    # Buffer our output and return it to the caller.
    is_redirected = True
-    encode = lambda x: x
    if stdout == subprocess.PIPE:
        is_redirected = False
        stdout = StringIO()
    elif kIsWindows:
-        # Reopen stdout in binary mode to avoid CRLF translation. The versions
-        # of echo we are replacing on Windows all emit plain LF, and the LLVM
-        # tests now depend on this.
-        # When we open as binary, however, this also means that we have to write
-        # 'bytes' objects to stdout instead of 'str' objects.
-        encode = lit.util.to_bytes
-        stdout = open(stdout.name, stdout.mode + "b")
+        # Reopen stdout with `newline=""` to avoid CRLF translation.
+        # The versions of echo we are replacing on Windows all emit plain LF,
+        # and the LLVM tests now depend on this.
+        stdout = open(stdout.name, stdout.mode, encoding="utf-8", newline="")
        opened_files.append((None, None, stdout, None))

    # Implement echo flags. We only support -e and -n, and not yet in
@@ -423,16 +418,15 @@ def executeBuiltinEcho(cmd, shenv):
        if not interpret_escapes:
            return arg

-        arg = lit.util.to_bytes(arg)
-        return arg.decode("unicode_escape")
+        return arg.encode("utf-8").decode("unicode_escape")

    if args:
        for arg in args[:-1]:
-            stdout.write(encode(maybeUnescape(arg)))
-            stdout.write(encode(" "))
-        stdout.write(encode(maybeUnescape(args[-1])))
+            stdout.write(maybeUnescape(arg))
+            stdout.write(" ")
+        stdout.write(maybeUnescape(args[-1]))
    if write_newline:
-        stdout.write(encode("\n"))
+        stdout.write("\n")

    for (name, mode, f, path) in opened_files:
        f.close()
@@ -463,7 +457,7 @@ def executeBuiltinMkdir(cmd, cmd_shenv):
    exitCode = 0
    for dir in args:
        dir = pathlib.Path(dir)
-        cwd = pathlib.Path(to_unicode(cmd_shenv.cwd))
+        cwd = pathlib.Path(cmd_shenv.cwd)
        if not dir.is_absolute():
            dir = lit.util.abs_path_preserve_drive(cwd / dir)
        if parent:
@@ -508,8 +502,6 @@ def executeBuiltinRm(cmd, cmd_shenv):
    exitCode = 0
    for path in args:
        cwd = cmd_shenv.cwd
-        path = to_unicode(path) if kIsWindows else to_bytes(path)
-        cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd)
        if not os.path.isabs(path):
            path = lit.util.abs_path_preserve_drive(os.path.join(cwd, path))
        if force and not os.path.exists(path):
@@ -718,10 +710,7 @@ def processRedirects(cmd, stdin_source, cmd_shenv, opened_files):
        else:
            # Make sure relative paths are relative to the cwd.
            redir_filename = os.path.join(cmd_shenv.cwd, name)
-            redir_filename = (
-                to_unicode(redir_filename) if kIsWindows else to_bytes(redir_filename)
-            )
-            fd = open(redir_filename, mode)
+            fd = open(redir_filename, mode, encoding="utf-8")
        # Workaround a Win32 and/or subprocess bug when appending.
        #
        # FIXME: Actually, this is probably an instance of PR6753.
@@ -1083,14 +1072,14 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
            if out is None:
                out = ""
            else:
-                out = to_string(out.decode("utf-8", errors="replace"))
+                out = out.decode("utf-8", errors="replace")
        except:
            out = str(out)
        try:
            if err is None:
                err = ""
            else:
-                err = to_string(err.decode("utf-8", errors="replace"))
+                err = err.decode("utf-8", errors="replace")
        except:
            err = str(err)

@@ -1284,7 +1273,7 @@ def executeScriptInternal(

        # Add the command output, if redirected.
        for (name, path, data) in result.outputFiles:
-            data = to_string(data.decode("utf-8", errors="replace"))
+            data = data.decode("utf-8", errors="replace")
            out += formatOutput(f"redirected output from '{name}'", data, limit=1024)
        if result.stdout.strip():
            out += formatOutput("command stdout", result.stdout)
@@ -1340,13 +1329,6 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
        script += ".bat"

    # Write script file
-    mode = "w"
-    open_kwargs = {}
-    if litConfig.isWindows and not isWin32CMDEXE:
-        mode += "b"  # Avoid CRLFs when writing bash scripts.
-    else:
-        open_kwargs["encoding"] = "utf-8"
-    f = open(script, mode, **open_kwargs)
    if isWin32CMDEXE:
        for i, ln in enumerate(commands):
            match = re.fullmatch(kPdbgRegex, ln)
@@ -1355,8 +1337,9 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
                commands[i] = match.expand(
                    "echo '\\1' > nul && " if command else "echo '\\1' > nul"
                )
-        f.write("@echo on\n")
-        f.write("\n@if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands))
+        with open(script, "w", encoding="utf-8") as f:
+            f.write("@echo on\n")
+            f.write("\n@if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands))
    else:
        for i, ln in enumerate(commands):
            match = re.fullmatch(kPdbgRegex, ln)
@@ -1395,8 +1378,6 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
                # seen the latter manage to terminate the shell running lit.
                if command:
                    commands[i] += f" && {{ {command}; }}"
-        if test.config.pipefail:
-            f.write(b"set -o pipefail;" if mode == "wb" else "set -o pipefail;")

        # Manually export any DYLD_* variables used by dyld on macOS because
        # otherwise they are lost when the shell executable is run, before the
@@ -1406,14 +1387,14 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
            for k, v in test.config.environment.items()
            if k.startswith("DYLD_")
        )
-        f.write(bytes(env_str, "utf-8") if mode == "wb" else env_str)
-        f.write(b"set -x;" if mode == "wb" else "set -x;")
-        if mode == "wb":
-            f.write(bytes("{ " + "; } &&\n{ ".join(commands) + "; }", "utf-8"))
-        else:
+
+        with open(script, "w", encoding="utf-8", newline="") as f:
+            if test.config.pipefail:
+                f.write("set -o pipefail;")
+            f.write(env_str)
+            f.write("set -x;")
            f.write("{ " + "; } &&\n{ ".join(commands) + "; }")
-    f.write(b"\n" if mode == "wb" else "\n")
-    f.close()
+            f.write("\n")

    if isWin32CMDEXE:
        command = ["cmd", "/c", script]
@@ -1449,19 +1430,11 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
    (line_number, command_type, line).
    """

-    # This code is carefully written to be dual compatible with Python 2.5+ and
-    # Python 3 without requiring input files to always have valid codings. The
-    # trick we use is to open the file in binary mode and use the regular
-    # expression library to find the commands, with it scanning strings in
-    # Python2 and bytes in Python3.
-    #
-    # Once we find a match, we do require each script line to be decodable to
-    # UTF-8, so we convert the outputs to UTF-8 before returning. This way the
-    # remaining code can work with "strings" agnostic of the executing Python
-    # version.
+    # We use `bytes` for scanning input files to avoid requiring them to always
+    # have valid codings.

    keywords_re = re.compile(
-        to_bytes("(%s)(.*)\n" % ("|".join(re.escape(k) for k in keywords),))
+        b"(%s)(.*)\n" % (b"|".join(re.escape(k.encode("utf-8")) for k in keywords),)
    )

    f = open(source_path, "rb")
@@ -1470,8 +1443,8 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
        data = f.read()

        # Ensure the data ends with a newline.
-        if not data.endswith(to_bytes("\n")):
-            data = data + to_bytes("\n")
+        if not data.endswith(b"\n"):
+            data = data + b"\n"

        # Iterate over the matches.
        line_number = 1
@@ -1480,15 +1453,11 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
            # Compute the updated line number by counting the intervening
            # newlines.
            match_position = match.start()
-            line_number += data.count(
-                to_bytes("\n"), last_match_position, match_position
-            )
+            line_number += data.count(b"\n", last_match_position, match_position)
            last_match_position = match_position

            # Convert the keyword and line to UTF-8 strings and yield the
-            # command. Note that we take care to return regular strings in
-            # Python 2, to avoid other code having to differentiate between the
-            # str and unicode types.
+            # command.
            #
            # Opening the file in binary mode prevented Windows \r newline
            # characters from being converted to Unix \n newlines, so manually
@@ -1496,8 +1465,8 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
            keyword, ln = match.groups()
            yield (
                line_number,
-                to_string(keyword.decode("utf-8")),
-                to_string(ln.decode("utf-8").rstrip("\r")),
+                keyword.decode("utf-8"),
+                ln.decode("utf-8").rstrip("\r"),
            )
    finally:
        f.close()
--- a/llvm/utils/lit/lit/builtin_commands/diff.py
+++ b/llvm/utils/lit/lit/builtin_commands/diff.py
@@ -8,7 +8,6 @@ import re
 import sys

 import util
-from util import to_string


 class DiffFlags:
@@ -67,10 +66,9 @@ def compareTwoBinaryFiles(flags, filepaths, filelines):
        filepaths[1].encode(),
        n=flags.num_context_lines,
    )
-    diffs = [diff.decode(errors="backslashreplace") for diff in diffs]

    for diff in diffs:
-        sys.stdout.write(to_string(diff))
+        sys.stdout.write(diff.decode(errors="backslashreplace"))
        exitCode = 1
    return exitCode

@@ -117,7 +115,7 @@ def compareTwoTextFiles(flags, filepaths, filelines_bin, encoding):
        filepaths[1],
        n=flags.num_context_lines,
    ):
-        sys.stdout.write(to_string(diff))
+        sys.stdout.write(diff)
        exitCode = 1
    return exitCode

--- a/llvm/utils/lit/lit/formats/googletest.py
+++ b/llvm/utils/lit/lit/formats/googletest.py
@@ -43,7 +43,7 @@ class GoogleTest(TestFormat):
            return None
        return sum(
            map(
-                lambda line: lit.util.to_string(line).startswith("  "),
+                lambda line: line.startswith(b"  "),
                out.splitlines(False),
            )
        )
--- a/llvm/utils/lit/lit/llvm/config.py
+++ b/llvm/utils/lit/lit/llvm/config.py
@@ -226,7 +226,7 @@ class LLVMConfig(object):
                        continue

                    # We found it, stop enumerating.
-                    return lit.util.to_string(candidate_path)
+                    return candidate_path
            except:
                continue

@@ -287,8 +287,8 @@ class LLVMConfig(object):
                env=self.config.environment,
            )
            stdout, stderr = cmd.communicate()
-            stdout = lit.util.to_string(stdout)
-            stderr = lit.util.to_string(stderr)
+            stdout = stdout.decode("utf-8", errors="replace")
+            stderr = stderr.decode("utf-8", errors="replace")
            return (stdout, stderr)
        except OSError:
            self.lit_config.fatal("Could not run process %s" % command)
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -29,10 +29,10 @@ class Report(object):
            fd, _ = tempfile.mkstemp(
                suffix=ext, prefix=f"{filename}.", dir=os.path.dirname(self.output_file)
            )
-            report_file = os.fdopen(fd, "w")
+            report_file = os.fdopen(fd, "w", encoding="utf-8")
        else:
            # Overwrite if the results already exist.
-            report_file = open(self.output_file, "w")
+            report_file = open(self.output_file, "w", encoding="utf-8")

        with report_file:
            self._write_results_to_file(tests, elapsed, report_file)
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -33,76 +33,6 @@ def make_word_regex(word):
    return r"\b" + word + r"\b"


-def to_bytes(s):
-    """Return the parameter as type 'bytes', possibly encoding it.
-
-    In Python2, the 'bytes' type is the same as 'str'. In Python3, they
-    are distinct.
-
-    """
-    if isinstance(s, bytes):
-        # In Python2, this branch is taken for both 'str' and 'bytes'.
-        # In Python3, this branch is taken only for 'bytes'.
-        return s
-    # In Python2, 's' is a 'unicode' object.
-    # In Python3, 's' is a 'str' object.
-    # Encode to UTF-8 to get 'bytes' data.
-    return s.encode("utf-8")
-
-
-def to_string(b):
-    """Return the parameter as type 'str', possibly encoding it.
-
-    In Python2, the 'str' type is the same as 'bytes'. In Python3, the
-    'str' type is (essentially) Python2's 'unicode' type, and 'bytes' is
-    distinct.
-
-    """
-    if isinstance(b, str):
-        # In Python2, this branch is taken for types 'str' and 'bytes'.
-        # In Python3, this branch is taken only for 'str'.
-        return b
-    if isinstance(b, bytes):
-        # In Python2, this branch is never taken ('bytes' is handled as 'str').
-        # In Python3, this is true only for 'bytes'.
-        try:
-            return b.decode("utf-8")
-        except UnicodeDecodeError:
-            # If the value is not valid Unicode, return the default
-            # repr-line encoding.
-            return str(b)
-
-    # By this point, here's what we *don't* have:
-    #
-    #  - In Python2:
-    #    - 'str' or 'bytes' (1st branch above)
-    #  - In Python3:
-    #    - 'str' (1st branch above)
-    #    - 'bytes' (2nd branch above)
-    #
-    # The last type we might expect is the Python2 'unicode' type. There is no
-    # 'unicode' type in Python3 (all the Python3 cases were already handled). In
-    # order to get a 'str' object, we need to encode the 'unicode' object.
-    try:
-        return b.encode("utf-8")
-    except AttributeError:
-        raise TypeError("not sure how to convert %s to %s" % (type(b), str))
-
-
-def to_unicode(s):
-    """Return the parameter as type which supports unicode, possibly decoding
-    it.
-
-    In Python2, this is the unicode type. In Python3 it's the str type.
-
-    """
-    if isinstance(s, bytes):
-        # In Python2, this branch is taken for both 'str' and 'bytes'.
-        # In Python3, this branch is taken only for 'bytes'.
-        return s.decode("utf-8")
-    return s
-
-
 def usable_core_count():
    """Return the number of cores the current process can use, if supported.
    Otherwise, return the total number of cores (like `os.cpu_count()`).
@@ -336,7 +266,7 @@ def executeCommand(

    """
    if input is not None:
-        input = to_bytes(input)
+        input = input.encode("utf-8")
    err_out = subprocess.STDOUT if redirect_stderr else subprocess.PIPE
    p = subprocess.Popen(
        command,
@@ -372,8 +302,8 @@ def executeCommand(
            timerObject.cancel()

    # Ensure the resulting output is always of string type.
-    out = to_string(out)
-    err = "" if redirect_stderr else to_string(err)
+    out = out.decode("utf-8", errors="replace")
+    err = "" if redirect_stderr else err.decode("utf-8", errors="replace")

    if hitTimeOut[0]:
        raise ExecuteCommandTimeoutException(