[LIT] remove to_unicode, to_string, and to_bytes helpers (#165950)

These helpers, which handle the difference between Python 2.x and Python
3.x, are no longer required.

Co-authored-by: Alexander Richardson <mail@alexrichardson.me>
This commit is contained in:
Tomohiro Kashiwada
2025-12-02 05:06:17 +09:00
committed by GitHub
parent 33bcde0678
commit c8031c3dd7
7 changed files with 45 additions and 148 deletions

View File

@@ -10,6 +10,6 @@
# within the test.
import base64, lit.util, pickle
base64Encode = lambda s: lit.util.to_string(base64.b64encode(lit.util.to_bytes(s)))
base64Encode = lambda s: base64.b64encode(s).decode("utf-8")
escapedSubstitutions = base64Encode(pickle.dumps(config.substitutions))
config.substitutions.append(("%{substitutions}", escapedSubstitutions))

View File

@@ -21,7 +21,6 @@ from lit.ShCommands import GlobItem, Command
import lit.ShUtil as ShUtil
import lit.Test as Test
import lit.util
from lit.util import to_bytes, to_string, to_unicode
from lit.BooleanExpression import BooleanExpression
@@ -391,18 +390,14 @@ def executeBuiltinEcho(cmd, shenv):
# Some tests have un-redirected echo commands to help debug test failures.
# Buffer our output and return it to the caller.
is_redirected = True
encode = lambda x: x
if stdout == subprocess.PIPE:
is_redirected = False
stdout = StringIO()
elif kIsWindows:
# Reopen stdout in binary mode to avoid CRLF translation. The versions
# of echo we are replacing on Windows all emit plain LF, and the LLVM
# tests now depend on this.
# When we open as binary, however, this also means that we have to write
# 'bytes' objects to stdout instead of 'str' objects.
encode = lit.util.to_bytes
stdout = open(stdout.name, stdout.mode + "b")
# Reopen stdout with `newline=""` to avoid CRLF translation.
# The versions of echo we are replacing on Windows all emit plain LF,
# and the LLVM tests now depend on this.
stdout = open(stdout.name, stdout.mode, encoding="utf-8", newline="")
opened_files.append((None, None, stdout, None))
# Implement echo flags. We only support -e and -n, and not yet in
@@ -423,16 +418,15 @@ def executeBuiltinEcho(cmd, shenv):
if not interpret_escapes:
return arg
arg = lit.util.to_bytes(arg)
return arg.decode("unicode_escape")
return arg.encode("utf-8").decode("unicode_escape")
if args:
for arg in args[:-1]:
stdout.write(encode(maybeUnescape(arg)))
stdout.write(encode(" "))
stdout.write(encode(maybeUnescape(args[-1])))
stdout.write(maybeUnescape(arg))
stdout.write(" ")
stdout.write(maybeUnescape(args[-1]))
if write_newline:
stdout.write(encode("\n"))
stdout.write("\n")
for (name, mode, f, path) in opened_files:
f.close()
@@ -463,7 +457,7 @@ def executeBuiltinMkdir(cmd, cmd_shenv):
exitCode = 0
for dir in args:
dir = pathlib.Path(dir)
cwd = pathlib.Path(to_unicode(cmd_shenv.cwd))
cwd = pathlib.Path(cmd_shenv.cwd)
if not dir.is_absolute():
dir = lit.util.abs_path_preserve_drive(cwd / dir)
if parent:
@@ -508,8 +502,6 @@ def executeBuiltinRm(cmd, cmd_shenv):
exitCode = 0
for path in args:
cwd = cmd_shenv.cwd
path = to_unicode(path) if kIsWindows else to_bytes(path)
cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd)
if not os.path.isabs(path):
path = lit.util.abs_path_preserve_drive(os.path.join(cwd, path))
if force and not os.path.exists(path):
@@ -718,10 +710,7 @@ def processRedirects(cmd, stdin_source, cmd_shenv, opened_files):
else:
# Make sure relative paths are relative to the cwd.
redir_filename = os.path.join(cmd_shenv.cwd, name)
redir_filename = (
to_unicode(redir_filename) if kIsWindows else to_bytes(redir_filename)
)
fd = open(redir_filename, mode)
fd = open(redir_filename, mode, encoding="utf-8")
# Workaround a Win32 and/or subprocess bug when appending.
#
# FIXME: Actually, this is probably an instance of PR6753.
@@ -1083,14 +1072,14 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
if out is None:
out = ""
else:
out = to_string(out.decode("utf-8", errors="replace"))
out = out.decode("utf-8", errors="replace")
except:
out = str(out)
try:
if err is None:
err = ""
else:
err = to_string(err.decode("utf-8", errors="replace"))
err = err.decode("utf-8", errors="replace")
except:
err = str(err)
@@ -1284,7 +1273,7 @@ def executeScriptInternal(
# Add the command output, if redirected.
for (name, path, data) in result.outputFiles:
data = to_string(data.decode("utf-8", errors="replace"))
data = data.decode("utf-8", errors="replace")
out += formatOutput(f"redirected output from '{name}'", data, limit=1024)
if result.stdout.strip():
out += formatOutput("command stdout", result.stdout)
@@ -1340,13 +1329,6 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
script += ".bat"
# Write script file
mode = "w"
open_kwargs = {}
if litConfig.isWindows and not isWin32CMDEXE:
mode += "b" # Avoid CRLFs when writing bash scripts.
else:
open_kwargs["encoding"] = "utf-8"
f = open(script, mode, **open_kwargs)
if isWin32CMDEXE:
for i, ln in enumerate(commands):
match = re.fullmatch(kPdbgRegex, ln)
@@ -1355,8 +1337,9 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
commands[i] = match.expand(
"echo '\\1' > nul && " if command else "echo '\\1' > nul"
)
f.write("@echo on\n")
f.write("\n@if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands))
with open(script, "w", encoding="utf-8") as f:
f.write("@echo on\n")
f.write("\n@if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands))
else:
for i, ln in enumerate(commands):
match = re.fullmatch(kPdbgRegex, ln)
@@ -1395,8 +1378,6 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
# seen the latter manage to terminate the shell running lit.
if command:
commands[i] += f" && {{ {command}; }}"
if test.config.pipefail:
f.write(b"set -o pipefail;" if mode == "wb" else "set -o pipefail;")
# Manually export any DYLD_* variables used by dyld on macOS because
# otherwise they are lost when the shell executable is run, before the
@@ -1406,14 +1387,14 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
for k, v in test.config.environment.items()
if k.startswith("DYLD_")
)
f.write(bytes(env_str, "utf-8") if mode == "wb" else env_str)
f.write(b"set -x;" if mode == "wb" else "set -x;")
if mode == "wb":
f.write(bytes("{ " + "; } &&\n{ ".join(commands) + "; }", "utf-8"))
else:
with open(script, "w", encoding="utf-8", newline="") as f:
if test.config.pipefail:
f.write("set -o pipefail;")
f.write(env_str)
f.write("set -x;")
f.write("{ " + "; } &&\n{ ".join(commands) + "; }")
f.write(b"\n" if mode == "wb" else "\n")
f.close()
f.write("\n")
if isWin32CMDEXE:
command = ["cmd", "/c", script]
@@ -1449,19 +1430,11 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
(line_number, command_type, line).
"""
# This code is carefully written to be dual compatible with Python 2.5+ and
# Python 3 without requiring input files to always have valid codings. The
# trick we use is to open the file in binary mode and use the regular
# expression library to find the commands, with it scanning strings in
# Python2 and bytes in Python3.
#
# Once we find a match, we do require each script line to be decodable to
# UTF-8, so we convert the outputs to UTF-8 before returning. This way the
# remaining code can work with "strings" agnostic of the executing Python
# version.
# We use `bytes` for scanning input files to avoid requiring them to always
# have valid codings.
keywords_re = re.compile(
to_bytes("(%s)(.*)\n" % ("|".join(re.escape(k) for k in keywords),))
b"(%s)(.*)\n" % (b"|".join(re.escape(k.encode("utf-8")) for k in keywords),)
)
f = open(source_path, "rb")
@@ -1470,8 +1443,8 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
data = f.read()
# Ensure the data ends with a newline.
if not data.endswith(to_bytes("\n")):
data = data + to_bytes("\n")
if not data.endswith(b"\n"):
data = data + b"\n"
# Iterate over the matches.
line_number = 1
@@ -1480,15 +1453,11 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
# Compute the updated line number by counting the intervening
# newlines.
match_position = match.start()
line_number += data.count(
to_bytes("\n"), last_match_position, match_position
)
line_number += data.count(b"\n", last_match_position, match_position)
last_match_position = match_position
# Convert the keyword and line to UTF-8 strings and yield the
# command. Note that we take care to return regular strings in
# Python 2, to avoid other code having to differentiate between the
# str and unicode types.
# command.
#
# Opening the file in binary mode prevented Windows \r newline
# characters from being converted to Unix \n newlines, so manually
@@ -1496,8 +1465,8 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
keyword, ln = match.groups()
yield (
line_number,
to_string(keyword.decode("utf-8")),
to_string(ln.decode("utf-8").rstrip("\r")),
keyword.decode("utf-8"),
ln.decode("utf-8").rstrip("\r"),
)
finally:
f.close()

View File

@@ -8,7 +8,6 @@ import re
import sys
import util
from util import to_string
class DiffFlags:
@@ -67,10 +66,9 @@ def compareTwoBinaryFiles(flags, filepaths, filelines):
filepaths[1].encode(),
n=flags.num_context_lines,
)
diffs = [diff.decode(errors="backslashreplace") for diff in diffs]
for diff in diffs:
sys.stdout.write(to_string(diff))
sys.stdout.write(diff.decode(errors="backslashreplace"))
exitCode = 1
return exitCode
@@ -117,7 +115,7 @@ def compareTwoTextFiles(flags, filepaths, filelines_bin, encoding):
filepaths[1],
n=flags.num_context_lines,
):
sys.stdout.write(to_string(diff))
sys.stdout.write(diff)
exitCode = 1
return exitCode

View File

@@ -43,7 +43,7 @@ class GoogleTest(TestFormat):
return None
return sum(
map(
lambda line: lit.util.to_string(line).startswith(" "),
lambda line: line.startswith(b" "),
out.splitlines(False),
)
)

View File

@@ -226,7 +226,7 @@ class LLVMConfig(object):
continue
# We found it, stop enumerating.
return lit.util.to_string(candidate_path)
return candidate_path
except:
continue
@@ -287,8 +287,8 @@ class LLVMConfig(object):
env=self.config.environment,
)
stdout, stderr = cmd.communicate()
stdout = lit.util.to_string(stdout)
stderr = lit.util.to_string(stderr)
stdout = stdout.decode("utf-8", errors="replace")
stderr = stderr.decode("utf-8", errors="replace")
return (stdout, stderr)
except OSError:
self.lit_config.fatal("Could not run process %s" % command)

View File

@@ -29,10 +29,10 @@ class Report(object):
fd, _ = tempfile.mkstemp(
suffix=ext, prefix=f"{filename}.", dir=os.path.dirname(self.output_file)
)
report_file = os.fdopen(fd, "w")
report_file = os.fdopen(fd, "w", encoding="utf-8")
else:
# Overwrite if the results already exist.
report_file = open(self.output_file, "w")
report_file = open(self.output_file, "w", encoding="utf-8")
with report_file:
self._write_results_to_file(tests, elapsed, report_file)

View File

@@ -33,76 +33,6 @@ def make_word_regex(word):
return r"\b" + word + r"\b"
def to_bytes(s):
    """Coerce the argument to ``bytes``.

    A ``bytes`` argument is handed back as the very same object. Any other
    argument is expected to provide an ``encode`` method (as ``str`` does)
    and is encoded as UTF-8.
    """
    # Only non-bytes input needs converting; bytes passes through untouched.
    return s if isinstance(s, bytes) else s.encode("utf-8")
def to_string(b):
    """Coerce the argument to ``str``, decoding ``bytes`` as UTF-8.

    * ``str`` input is returned unchanged.
    * ``bytes`` input is decoded as UTF-8; when the data is not valid UTF-8
      the repr-like text produced by ``str(b)`` is returned instead.
    * Any other input falls through to a legacy path that returns whatever
      ``b.encode("utf-8")`` yields, and raises ``TypeError`` when the object
      has no ``encode`` attribute.
    """
    if isinstance(b, str):
        return b

    if not isinstance(b, bytes):
        # Legacy path kept for objects that are neither str nor bytes:
        # defer to the object's own ``encode`` and report anything that
        # lacks one as an unconvertible type.
        try:
            return b.encode("utf-8")
        except AttributeError:
            raise TypeError("not sure how to convert %s to %s" % (type(b), str))

    try:
        return b.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the repr-like rendering of the bytes.
        return str(b)
def to_unicode(s):
    """Return the argument as text where possible.

    ``bytes`` input is decoded as UTF-8 (a ``UnicodeDecodeError`` from
    invalid data propagates to the caller). Every other input, including
    ``str``, is returned unchanged.
    """
    if not isinstance(s, bytes):
        # Nothing to decode; hand the object back as-is.
        return s
    return s.decode("utf-8")
def usable_core_count():
"""Return the number of cores the current process can use, if supported.
Otherwise, return the total number of cores (like `os.cpu_count()`).
@@ -336,7 +266,7 @@ def executeCommand(
"""
if input is not None:
input = to_bytes(input)
input = input.encode("utf-8")
err_out = subprocess.STDOUT if redirect_stderr else subprocess.PIPE
p = subprocess.Popen(
command,
@@ -372,8 +302,8 @@ def executeCommand(
timerObject.cancel()
# Ensure the resulting output is always of string type.
out = to_string(out)
err = "" if redirect_stderr else to_string(err)
out = out.decode("utf-8", errors="replace")
err = "" if redirect_stderr else err.decode("utf-8", errors="replace")
if hitTimeOut[0]:
raise ExecuteCommandTimeoutException(