suite: add testsuite tool 'test_mc.sh' to compare output of Capstone & LLVM

This commit is contained in:
Nguyen Anh Quynh
2014-11-07 17:24:01 +08:00
parent 278afa3380
commit 8ba7250a14
3 changed files with 198 additions and 0 deletions

View File

@ -2,6 +2,12 @@ This directory contains some tools used by developers of Capstone project.
Average users should ignore all the contents here.
- arm/
Test some ARM's special input.
- MC/
Input used to test various architectures & modes.
- benchmark.py
This script benchmarks the Python binding by disassembling some random code.
@ -17,3 +23,13 @@ Average users should ignore all the contents here.
- fuzz.py
This simple script disassembles random code for all archs (or selected arch)
in order to find segfaults.
- test_mc.sh
This script compares the output of Capstone with LLVM's llvm-mc with the
input coming from MC/. This relies on test_mc.py to do all the hard work.
- x86odd.py
Test some tricky X86 instructions.
- ppcbranch.py
Test some tricky branch PPC instructions.

167
suite/test_mc.py Executable file
View File

@ -0,0 +1,167 @@
#!/usr/bin/python
import array, os.path, sys
from subprocess import Popen, PIPE, STDOUT
from capstone import *
def run_mc(arch, hexcode, option, syntax=None):
    """Disassemble `hexcode` with llvm-mc and return the normalized text.

    arch    -- Capstone architecture constant (CS_ARCH_*); used to add the
               MIPS '-mattr=+msa' flag and to pick X86 comment stripping.
    hexcode -- text fed to llvm-mc on stdin (e.g. '0x0f,0x0e').
    option  -- list of extra llvm-mc arguments (e.g. ['-triple=i386']).
    syntax  -- optional single extra argument placed before `option`
               (e.g. '-output-asm-variant=1'); skipped when falsy.

    Returns the lower-cased, whitespace-collapsed second line of llvm-mc's
    output, or 'FAILED to disassemble' when the first line contains
    'invalid' or when llvm-mc printed fewer than two lines.
    Raises OSError if llvm-mc cannot be launched.
    """
    def normalize(text):
        # Collapse tabs and runs of whitespace into single spaces.
        text = ' '.join(text.split())
        if arch == CS_ARCH_X86:
            # Drop the trailing '# ...' comment. The cut leaves the space
            # that preceded '#', so strip it to keep the text comparable.
            i = text.find('# ')
            if i != -1:
                text = text[:i].rstrip()
        return text.lower()

    # Build the command once; argument order matches the historical layout:
    # base flags, MIPS attribute, optional syntax flag, caller options.
    cmd = ['llvm-mc', '-disassemble', '-print-imm-hex']
    if arch == CS_ARCH_MIPS:
        cmd.append('-mattr=+msa')
    if syntax:
        cmd.append(syntax)
    cmd += option

    # universal_newlines=True makes the pipes text-mode, so the same str
    # input/output handling works on both Python 2 and Python 3.
    p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=STDOUT,
              universal_newlines=True)
    output = p.communicate(input=hexcode)[0]
    lines = output.split('\n')

    # stderr is merged into stdout, so a decoding diagnostic shows up on
    # the first line.
    if 'invalid' in lines[0]:
        return 'FAILED to disassemble'
    # Guard against empty or truncated output instead of raising IndexError.
    if len(lines) < 2:
        return 'FAILED to disassemble'
    # The instruction text is taken from the second line (presumably the
    # first one is an assembler directive -- confirm against llvm-mc).
    return normalize(lines[1].strip())
def test_file(fname):
    """Compare Capstone and llvm-mc output for every entry of one input file.

    The first line of `fname` must look like '# <arch>, <mode>, <option>'.
    Every following line that is neither blank nor a '#' comment is
    expected to start with comma-separated '0x..' bytes, optionally
    followed by ' = <text>'. Each entry is disassembled with Capstone and
    with llvm-mc (through run_mc), and any difference is printed.

    Problems with the header (missing, malformed, or naming an arch/mode
    this script has no mapping for) are reported with an 'ERROR:' line and
    the file is skipped, so a batch run can continue with the next file.
    """
    # Local import: keeps the new dependency inside this function.
    from binascii import unhexlify

    print("Test %s" % fname)
    with open(fname) as f:
        lines = f.readlines()

    if not lines or not lines[0].startswith('# '):
        print("ERROR: decoding information is missing")
        return

    # Skip '# ' at the front, then split the header into its three fields.
    # The third field (option) is parsed for validation only; it is not
    # used by the comparison.
    fields = lines[0][2:].split(', ')
    if len(fields) != 3:
        print("ERROR: decoding information is malformed")
        return
    arch = fields[0].strip()
    mode = fields[1].replace(' ', '')

    archs = {
        "CS_ARCH_ARM": CS_ARCH_ARM,
        "CS_ARCH_ARM64": CS_ARCH_ARM64,
        "CS_ARCH_MIPS": CS_ARCH_MIPS,
        "CS_ARCH_PPC": CS_ARCH_PPC,
        "CS_ARCH_SPARC": CS_ARCH_SPARC,
        "CS_ARCH_SYSZ": CS_ARCH_SYSZ,
        "CS_ARCH_X86": CS_ARCH_X86,
        "CS_ARCH_XCORE": CS_ARCH_XCORE,
    }

    modes = {
        "CS_MODE_16": CS_MODE_16,
        "CS_MODE_32": CS_MODE_32,
        "CS_MODE_64": CS_MODE_64,
        "0": CS_MODE_ARM,
        "CS_MODE_ARM": CS_MODE_ARM,
        "CS_MODE_THUMB": CS_MODE_THUMB,
        "CS_MODE_LITTLE_ENDIAN": CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_BIG_ENDIAN": CS_MODE_BIG_ENDIAN,
        "CS_MODE_32+CS_MODE_BIG_ENDIAN": CS_MODE_32 + CS_MODE_BIG_ENDIAN,
        "CS_MODE_32+CS_MODE_LITTLE_ENDIAN": CS_MODE_32 + CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_64+CS_MODE_LITTLE_ENDIAN": CS_MODE_64 + CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_64+CS_MODE_BIG_ENDIAN": CS_MODE_64 + CS_MODE_BIG_ENDIAN,
        "CS_MODE_32+CS_MODE_MICRO": CS_MODE_32 + CS_MODE_MICRO,
        "CS_MODE_32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN": CS_MODE_32 + CS_MODE_MICRO + CS_MODE_BIG_ENDIAN,
        "CS_MODE_32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO": CS_MODE_32 + CS_MODE_MICRO + CS_MODE_BIG_ENDIAN,
        "CS_MODE_BIG_ENDIAN+CS_MODE_V9": CS_MODE_BIG_ENDIAN + CS_MODE_V9,
    }

    # llvm-mc arguments for each (arch, mode) pair named in a file header.
    mc_modes = {
        ("CS_ARCH_X86", "CS_MODE_32"): ['-triple=i386'],
        ("CS_ARCH_X86", "CS_MODE_64"): ['-triple=x86_64'],
        ("CS_ARCH_ARM", "CS_MODE_ARM"): ['-triple=armv7'],
        # Thumb needs the thumb triple; 'armv7' makes llvm-mc decode ARM.
        ("CS_ARCH_ARM", "CS_MODE_THUMB"): ['-triple=thumbv7'],
        ("CS_ARCH_ARM64", "0"): ['-triple=aarch64'],
        ("CS_ARCH_MIPS", "CS_MODE_32+CS_MODE_BIG_ENDIAN"): ['-triple=mips'],
        ("CS_ARCH_MIPS", "CS_MODE_32+CS_MODE_MICRO"): ['-triple=mipsel', '-mattr=+micromips'],
        ("CS_ARCH_MIPS", "CS_MODE_64"): ['-triple=mips64el'],
        ("CS_ARCH_MIPS", "CS_MODE_32"): ['-triple=mipsel'],
        ("CS_ARCH_MIPS", "CS_MODE_64+CS_MODE_BIG_ENDIAN"): ['-triple=mips64'],
        ("CS_ARCH_MIPS", "CS_MODE_32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN"): ['-triple=mips', '-mattr=+micromips'],
        ("CS_ARCH_MIPS", "CS_MODE_32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO"): ['-triple=mips', '-mattr=+micromips'],
        ("CS_ARCH_PPC", "CS_MODE_BIG_ENDIAN"): ['-triple=powerpc64'],
        ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN'): ['-triple=sparc'],
        ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN+CS_MODE_V9'): ['-triple=sparcv9'],
        ('CS_ARCH_SYSZ', '0'): ['-triple=s390x'],
    }

    # Resolve everything that depends only on the header once, up front,
    # and report unknown combinations instead of raising KeyError.
    if arch not in archs or mode not in modes or (arch, mode) not in mc_modes:
        print("ERROR: unsupported arch/mode: %s, %s" % (arch, mode))
        return
    cs_arch = archs[arch]
    mc_args = mc_modes[(arch, mode)]

    md = Cs(cs_arch, modes[mode])

    mc_option = None
    if arch == 'CS_ARCH_X86':
        # Ask llvm-mc for asm variant 1 (presumably Intel syntax, to match
        # Capstone's default X86 output -- confirm).
        mc_option = '-output-asm-variant=1'

    if arch == 'CS_ARCH_ARM':
        md.syntax = CS_OPT_SYNTAX_NOREGNAME

    for line in lines[1:]:
        # Skip comments and blank lines (a blank line has no bytes to decode).
        if line.startswith('#') or not line.strip():
            continue

        # '0x0f,0x0e = text' -> '0x0f,0x0e' -> '0f0e' -> raw bytes.
        code = line.split(' = ')[0].strip()
        hex_code = code.replace('0x', '').replace(',', '')
        # unhexlify works on both Python 2 and 3, unlike str.decode('hex').
        hex_data = unhexlify(hex_code)

        # Only the first decoded instruction is compared.
        insns = list(md.disasm(hex_data, 0))
        if insns:
            first = insns[0]
            if first.op_str != '':
                cs_output = "%s %s" % (first.mnemonic, first.op_str)
            else:
                cs_output = first.mnemonic
        else:
            cs_output = 'FAILED to disassemble'

        mc_output = run_mc(cs_arch, code, mc_args, mc_option)

        if cs_output != mc_output:
            print("Mismatch: %s" % code)
            print("\tMC = %s" % mc_output)
            print("\tCS = %s" % cs_output)
if __name__ == '__main__':
    # Usage:
    #   find MC/ -name '*.cs' | ./test_mc.py      (file names on stdin)
    #   ./test_mc.py <input-file.s.cs> [...]      (file names as arguments)
    if len(sys.argv) == 1:
        for fname in sys.stdin:
            fname = fname.strip()
            # Ignore blank lines so an empty name never reaches open().
            if fname:
                test_file(fname)
    else:
        # Process every file given on the command line, not only the first.
        for fname in sys.argv[1:]:
            test_file(fname)

15
suite/test_mc.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/sh
# This script tests all architectures by default.
# The output lists all the mismatches between Capstone (CS) & LLVM (MC).
# While most differences come from the fact that Capstone uses a more friendly
# number format, some mismatches might be because Capstone is based on an older
# version of LLVM (which should be fixed in the next release).

# The pattern is quoted so that the shell hands it to find verbatim instead of
# expanding it against files in the current directory.
find MC/ -name '*.cs' | ./test_mc.py

# To test just one architecture, specify the corresponding dir:
# $ find MC/X86 -name '*.cs' | ./test_mc.py

# To test just one input file, run test_mc.py with that file:
# $ ./test_mc.py MC/X86/x86-32-fma3.s.cs