diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/python/parso/py3/tests/fuzz_diff_parser.py | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/python/parso/py3/tests/fuzz_diff_parser.py')
-rw-r--r-- | contrib/python/parso/py3/tests/fuzz_diff_parser.py | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/contrib/python/parso/py3/tests/fuzz_diff_parser.py b/contrib/python/parso/py3/tests/fuzz_diff_parser.py new file mode 100644 index 00000000000..39b93f21d57 --- /dev/null +++ b/contrib/python/parso/py3/tests/fuzz_diff_parser.py @@ -0,0 +1,307 @@ +""" +A script to find bugs in the diff parser. + +This script is extremely useful if changes are made to the diff parser. By +running a few thousand iterations, we can assure that the diff parser is in +good shape. + +Usage: + fuzz_diff_parser.py [--pdb|--ipdb] [-l] [-n=<nr>] [-x=<nr>] random [<path>] + fuzz_diff_parser.py [--pdb|--ipdb] [-l] redo [-o=<nr>] [-p] + fuzz_diff_parser.py -h | --help + +Options: + -h --help Show this screen + -n, --maxtries=<nr> Maximum of random tries [default: 1000] + -x, --changes=<nr> Amount of changes to be done to a file per try [default: 5] + -l, --logging Prints all the logs + -o, --only-last=<nr> Only runs the last n iterations; Defaults to running all + -p, --print-code Print all test diffs + --pdb Launch pdb when error is raised + --ipdb Launch ipdb when error is raised +""" + +from __future__ import print_function +import logging +import sys +import os +import random +import pickle + +import parso +from parso.utils import split_lines +from test.test_diff_parser import _check_error_leaves_nodes + +_latest_grammar = parso.load_grammar(version='3.8') +_python_reserved_strings = tuple( + # Keywords are ususally only interesting in combination with spaces after + # them. We don't put a space before keywords, to avoid indentation errors. + s + (' ' if s.isalpha() else '') + for s in _latest_grammar._pgen_grammar.reserved_syntax_strings.keys() +) +_random_python_fragments = _python_reserved_strings + ( + ' ', '\t', '\n', '\r', '\f', 'f"', 'F"""', "fr'", "RF'''", '"', '"""', "'", + "'''", ';', ' some_random_word ', '\\', '#', +) + + +def find_python_files_in_tree(file_path): + if not os.path.isdir(file_path): + yield file_path + return + for root, dirnames, filenames in os.walk(file_path): + if 'chardet' in root: + # Stuff like chardet/langcyrillicmodel.py is just very slow to + # parse and machine generated, so ignore those. + continue + + for name in filenames: + if name.endswith('.py'): + yield os.path.join(root, name) + + +def _print_copyable_lines(lines): + for line in lines: + line = repr(line)[1:-1] + if line.endswith(r'\n'): + line = line[:-2] + '\n' + print(line, end='') + + +def _get_first_error_start_pos_or_none(module): + error_leaf = _check_error_leaves_nodes(module) + return None if error_leaf is None else error_leaf.start_pos + + +class LineReplacement: + def __init__(self, line_nr, new_line): + self._line_nr = line_nr + self._new_line = new_line + + def apply(self, code_lines): + # print(repr(self._new_line)) + code_lines[self._line_nr] = self._new_line + + +class LineDeletion: + def __init__(self, line_nr): + self.line_nr = line_nr + + def apply(self, code_lines): + del code_lines[self.line_nr] + + +class LineCopy: + def __init__(self, copy_line, insertion_line): + self._copy_line = copy_line + self._insertion_line = insertion_line + + def apply(self, code_lines): + code_lines.insert( + self._insertion_line, + # Use some line from the file. This doesn't feel totally + # random, but for the diff parser it will feel like it. + code_lines[self._copy_line] + ) + + +class FileModification: + @classmethod + def generate(cls, code_lines, change_count, previous_file_modification=None): + if previous_file_modification is not None and random.random() > 0.5: + # We want to keep the previous modifications in some cases to make + # more complex parser issues visible. + code_lines = previous_file_modification.apply(code_lines) + added_modifications = previous_file_modification.modification_list + else: + added_modifications = [] + return cls( + added_modifications + + list(cls._generate_line_modifications(code_lines, change_count)), + # work with changed trees more than with normal ones. + check_original=random.random() > 0.8, + ) + + @staticmethod + def _generate_line_modifications(lines, change_count): + def random_line(include_end=False): + return random.randint(0, len(lines) - (not include_end)) + + lines = list(lines) + for _ in range(change_count): + rand = random.randint(1, 4) + if rand == 1: + if len(lines) == 1: + # We cannot delete every line, that doesn't make sense to + # fuzz and it would be annoying to rewrite everything here. + continue + ld = LineDeletion(random_line()) + elif rand == 2: + # Copy / Insertion + # Make it possible to insert into the first and the last line + ld = LineCopy(random_line(), random_line(include_end=True)) + elif rand in (3, 4): + # Modify a line in some weird random ways. + line_nr = random_line() + line = lines[line_nr] + column = random.randint(0, len(line)) + random_string = '' + for _ in range(random.randint(1, 3)): + if random.random() > 0.8: + # The lower characters cause way more issues. + unicode_range = 0x1f if random.randint(0, 1) else 0x3000 + random_string += chr(random.randint(0, unicode_range)) + else: + # These insertions let us understand how random + # keyword/operator insertions work. Theoretically this + # could also be done with unicode insertions, but the + # fuzzer is just way more effective here. + random_string += random.choice(_random_python_fragments) + if random.random() > 0.5: + # In this case we insert at a very random place that + # probably breaks syntax. + line = line[:column] + random_string + line[column:] + else: + # Here we have better chances to not break syntax, because + # we really replace the line with something that has + # indentation. + line = ' ' * random.randint(0, 12) + random_string + '\n' + ld = LineReplacement(line_nr, line) + ld.apply(lines) + yield ld + + def __init__(self, modification_list, check_original): + self.modification_list = modification_list + self._check_original = check_original + + def apply(self, code_lines): + changed_lines = list(code_lines) + for modification in self.modification_list: + modification.apply(changed_lines) + return changed_lines + + def run(self, grammar, code_lines, print_code): + code = ''.join(code_lines) + modified_lines = self.apply(code_lines) + modified_code = ''.join(modified_lines) + + if print_code: + if self._check_original: + print('Original:') + _print_copyable_lines(code_lines) + + print('\nModified:') + _print_copyable_lines(modified_lines) + print() + + if self._check_original: + m = grammar.parse(code, diff_cache=True) + start1 = _get_first_error_start_pos_or_none(m) + + grammar.parse(modified_code, diff_cache=True) + + if self._check_original: + # Also check if it's possible to "revert" the changes. + m = grammar.parse(code, diff_cache=True) + start2 = _get_first_error_start_pos_or_none(m) + assert start1 == start2, (start1, start2) + + +class FileTests: + def __init__(self, file_path, test_count, change_count): + self._path = file_path + with open(file_path, errors='replace') as f: + code = f.read() + self._code_lines = split_lines(code, keepends=True) + self._test_count = test_count + self._code_lines = self._code_lines + self._change_count = change_count + self._file_modifications = [] + + def _run(self, grammar, file_modifications, debugger, print_code=False): + try: + for i, fm in enumerate(file_modifications, 1): + fm.run(grammar, self._code_lines, print_code=print_code) + print('.', end='') + sys.stdout.flush() + print() + except Exception: + print("Issue in file: %s" % self._path) + if debugger: + einfo = sys.exc_info() + pdb = __import__(debugger) + pdb.post_mortem(einfo[2]) + raise + + def redo(self, grammar, debugger, only_last, print_code): + mods = self._file_modifications + if only_last is not None: + mods = mods[-only_last:] + self._run(grammar, mods, debugger, print_code=print_code) + + def run(self, grammar, debugger): + def iterate(): + fm = None + for _ in range(self._test_count): + fm = FileModification.generate( + self._code_lines, self._change_count, + previous_file_modification=fm + ) + self._file_modifications.append(fm) + yield fm + + self._run(grammar, iterate(), debugger) + + +def main(arguments): + debugger = 'pdb' if arguments['--pdb'] else \ + 'ipdb' if arguments['--ipdb'] else None + redo_file = os.path.join(os.path.dirname(__file__), 'fuzz-redo.pickle') + + if arguments['--logging']: + root = logging.getLogger() + root.setLevel(logging.DEBUG) + + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.DEBUG) + root.addHandler(ch) + + grammar = parso.load_grammar() + parso.python.diff.DEBUG_DIFF_PARSER = True + if arguments['redo']: + with open(redo_file, 'rb') as f: + file_tests_obj = pickle.load(f) + only_last = arguments['--only-last'] and int(arguments['--only-last']) + file_tests_obj.redo( + grammar, + debugger, + only_last=only_last, + print_code=arguments['--print-code'] + ) + elif arguments['random']: + # A random file is used to do diff parser checks if no file is given. + # This helps us to find errors in a lot of different files. + file_paths = list(find_python_files_in_tree(arguments['<path>'] or '.')) + max_tries = int(arguments['--maxtries']) + tries = 0 + try: + while tries < max_tries: + path = random.choice(file_paths) + print("Checking %s: %s tries" % (path, tries)) + now_tries = min(1000, max_tries - tries) + file_tests_obj = FileTests(path, now_tries, int(arguments['--changes'])) + file_tests_obj.run(grammar, debugger) + tries += now_tries + except Exception: + with open(redo_file, 'wb') as f: + pickle.dump(file_tests_obj, f) + raise + else: + raise NotImplementedError('Command is not implemented') + + +if __name__ == '__main__': + from docopt import docopt + + arguments = docopt(__doc__) + main(arguments) |