CoCalc -- flag-bisect.py

GitHub Repository: Roblox/luau
Path: blob/master/tools/flag-bisect.py
²⁷²³ views
1
#!/usr/bin/python3
2
# This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
3

4
import argparse
5
import asyncio
6
import copy
7
import json
8
import math
9
import os
10
import platform
11
import re
12
import subprocess
13
import sys
14
import textwrap
15
from enum import Enum
16

17
def add_parser(subparsers):
18
    flag_bisect_command = subparsers.add_parser('flag-bisect',
19
        help=help(),
20
        description=help(),
21
        epilog=epilog(),
22
        formatter_class=argparse.RawDescriptionHelpFormatter,
23
        )
24

25
    add_argument_parsers(flag_bisect_command)
26
    flag_bisect_command.set_defaults(func=flag_bisect_main)
27
    return flag_bisect_command
28

29
def help():
30
    return 'Search for a set of flags triggering the faulty behavior in unit tests'
31

32
def get_terminal_width():
33
    try:
34
        return os.get_terminal_size().columns
35
    except:
36
        # Return a reasonable default when a terminal is not available
37
        return 80
38
def wrap_text(text, width):
39
    leading_whitespace_re = re.compile('( *)')
40

41
    def get_paragraphs_and_indent(string):
42
        lines = string.split('\n')
43
        result = ''
44
        line_count = 0
45
        initial_indent = ''
46
        subsequent_indent = ''
47
        for line in lines:
48
            if len(line.strip()) == 0:
49
                if line_count > 0:
50
                    yield result, initial_indent, subsequent_indent
51
                    result = ''
52
                    line_count = 0
53
            else:
54
                line_count += 1
55
                if line_count == 1:
56
                    initial_indent = leading_whitespace_re.match(line).group(1)
57
                    subsequent_indent = initial_indent
58
                elif line_count == 2:
59
                    subsequent_indent = leading_whitespace_re.match(line).group(1)
60
                result += line.strip() + '\n'
61

62
    result = ''
63
    for paragraph, initial_indent, subsequent_indent in get_paragraphs_and_indent(text):
64
        result += textwrap.fill(paragraph, width=width, initial_indent=initial_indent, subsequent_indent=subsequent_indent, break_on_hyphens=False) + '\n\n'
65
    return result
66

67
def wrap_text_for_terminal(text):
68
    right_margin = 2 # This margin matches what argparse uses when formatting argument documentation
69
    min_width = 20
70
    width = max(min_width, get_terminal_width() - right_margin)
71
    return wrap_text(text, width)
72

73
def epilog():
74
    return wrap_text_for_terminal('''
75
    This tool uses the delta debugging algorithm to minimize the set of flags to the ones that are faulty in your unit tests,
76
    and the usage is trivial. Just provide a path to the unit test and you're done, the tool will do the rest.
77

78
    There are many use cases with flag-bisect. Included but not limited to:
79

80
      1: If your test is failing when you omit `--fflags=true` but it works when passing `--fflags=true`, then you can
81
         use this tool to find that set of flag requirements to see which flags are missing that will help to fix it. Ditto
82
         for the opposite too, this tool is generalized for that case.
83

84
      2: If you happen to run into a problem on production, and you're not sure which flags is the problem and you can easily
85
         create a unit test, you can run flag-bisect on that unit test to rapidly find the set of flags.
86

87
      3: If you have a flag that causes a performance regression, there's also the `--timeout=N` where `N` is in seconds.
88

89
      4: If you have tests that are demonstrating flakiness behavior, you can also use `--tries=N` where `N` is the number of
90
         attempts to run the same set of flags before moving on to the new set. This will eventually drill down to the flaky flag(s).
91
         Generally 8 tries should be more than enough, but it depends on the rarity. The more rare it is, the higher the attempts count
92
         needs to be. Note that this comes with a performance cost the higher you go, but certainly still faster than manual search.
93
         This argument will disable parallel mode by default. If this is not desired, explicitly write `--parallel=on`.
94

95
      5: By default flag-bisect runs in parallel mode which uses a slightly modified version of delta debugging algorithm to support
96
         trying multiple sets of flags concurrently. This means that the number of sets the algorithm will try at once is equal to the
97
         number of concurrent jobs. There is currently no upper bound to that, so heed this warning that your machine may slow down
98
         significantly. In this mode, we display the number of jobs it is running in parallel. Use `--parallel=off` to disable parallel
99
         mode.
100

101
         Be aware that this introduces some level of *non-determinism*, and it is fundamental due to the interaction with flag dependencies
102
         and the fact one job may finish faster than another job that got ran in the same cycle. However, it generally shouldn't matter
103
         if your test is deterministic and has no implicit flag dependencies in the codebase.
104

105
    The tool will try to automatically figure out which of `--pass` or `--fail` to use if you omit them or use `--auto` by applying
106
    heuristics. For example, if the tests works using `--fflags=true` and crashes if omitting `--fflags=true`, then it knows
107
    to use `--pass` to give you set of flags that will cause that crash. As usual, vice versa is also true. Since this is a
108
    heuristic, if it gets that guess wrong, you can override with `--pass` or `--fail`.
109

110
    You can speed this process up by scoping it to as few tests as possible, for example if you're using doctest then you'd
111
    pass `--tc=my_test` as an argument after `--`, so `flag-bisect ./path/to/binary -- --tc=my_test`.
112
    ''')
113

114
class InterestnessMode(Enum):
115
    AUTO = 0,
116
    FAIL = 1,
117
    PASS = 2,
118

119
def add_argument_parsers(parser):
120
    parser.add_argument('binary_path', help='Path to the unit test binary that will be bisected for a set of flags')
121

122
    parser.add_argument('--tries', dest='attempts', type=int, default=1, metavar='N',
123
        help='If the tests are flaky, flag-bisect will try again with the same set by N amount of times before moving on')
124

125
    parser.add_argument('--parallel', dest='parallel', choices=['on', 'off'], default='default',
126
        help='Test multiple sets of flags in parallel, useful when the test takes a while to run.')
127

128
    parser.add_argument('--explicit', dest='explicit', action='store_true', default=False, help='Explicitly set flags to false')
129

130
    parser.add_argument('--filter', dest='filter', default=None, help='Regular expression to filter for a subset of flags to test')
131

132
    parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Show stdout and stderr of the program being run')
133

134
    interestness_parser = parser.add_mutually_exclusive_group()
135
    interestness_parser.add_argument('--auto', dest='mode', action='store_const', const=InterestnessMode.AUTO,
136
        default=InterestnessMode.AUTO, help='Automatically figure out which one of --pass or --fail should be used')
137
    interestness_parser.add_argument('--fail', dest='mode', action='store_const', const=InterestnessMode.FAIL,
138
        help='You want this if passing --fflags=true causes tests to fail')
139
    interestness_parser.add_argument('--pass', dest='mode', action='store_const', const=InterestnessMode.PASS,
140
        help='You want this if passing --fflags=true causes tests to pass')
141
    interestness_parser.add_argument('--timeout', dest='timeout', type=int, default=0, metavar='SECONDS',
142
        help='Find the flag(s) causing performance regression if time to run exceeds the timeout in seconds')
143

144
class Options:
145
    def __init__(self, args, other_args, sense):
146
        self.path = args.binary_path
147
        self.explicit = args.explicit
148
        self.sense = sense
149
        self.timeout = args.timeout
150
        self.interested_in_timeouts = args.timeout != 0
151
        self.attempts = args.attempts
152
        self.parallel = (args.parallel == 'on' or args.parallel == 'default') if args.attempts == 1 else args.parallel == 'on'
153
        self.filter = re.compile(".*" + args.filter + ".*") if args.filter else None
154
        self.verbose = args.verbose
155
        self.other_args = [arg for arg in other_args if arg != '--'] # Useless to have -- here, discard.
156

157
    def copy_with_sense(self, sense):
158
        new_copy = copy.copy(self)
159
        new_copy.sense = sense
160
        return new_copy
161

162
class InterestnessResult(Enum):
163
    FAIL = 0,
164
    PASS = 1,
165
    TIMED_OUT = 2,
166

167
class Progress:
168
    def __init__(self, count, n_of_jobs=None):
169
        self.count = count
170
        self.steps = 0
171
        self.n_of_jobs = n_of_jobs
172
        self.buffer = None
173

174
    def show(self):
175
        # remaining is actually the height of the current search tree.
176
        remain = int(math.log2(self.count))
177
        flag_plural = 'flag' if self.count == 1 else 'flags'
178
        node_plural = 'node' if remain == 1 else 'nodes'
179
        jobs_info = f', running {self.n_of_jobs} jobs' if self.n_of_jobs is not None else ''
180
        return f'flag bisection: testing {self.count} {flag_plural} (step {self.steps}, {remain} {node_plural} remain{jobs_info})'
181

182
    def hide(self):
183
        if self.buffer:
184
            sys.stdout.write('\b \b' * len(self.buffer))
185

186
    def update(self, len, n_of_jobs=None):
187
        self.hide()
188
        self.count = len
189
        self.steps += 1
190
        self.n_of_jobs = n_of_jobs
191
        self.buffer = self.show()
192
        sys.stdout.write(self.buffer)
193
        sys.stdout.flush()
194

195
def list_fflags(options):
196
    try:
197
        out = subprocess.check_output([options.path, '--list-fflags'], encoding='UTF-8')
198
        flag_names = []
199

200
        # It's unlikely that a program we're going to test has no flags.
201
        # So if the output doesn't start with FFlag, assume it doesn't support --list-fflags and therefore cannot be bisected.
202
        if not out.startswith('FFlag') and not out.startswith('DFFlag') and not out.startswith('SFFlag'):
203
            return None
204

205
        flag_names = out.split('\n')[:-1]
206

207
        subset = [flag for flag in flag_names if options.filter.match(flag) is not None] if options.filter else flag_names
208
        return subset if subset else None
209
    except:
210
        return None
211

212
def mk_flags_argument(options, flags, initial_flags):
213
    lst = [flag + '=true' for flag in flags]
214

215
    # When --explicit is provided, we'd like to find the set of flags from initial_flags that's not in active flags.
216
    # This is so that we can provide a =false value instead of leaving them out to be the default value.
217
    if options.explicit:
218
        for flag in initial_flags:
219
            if flag not in flags:
220
                lst.append(flag + '=false')
221

222
    return '--fflags=' + ','.join(lst)
223

224
def mk_command_line(options, flags_argument):
225
    arguments = [options.path, *options.other_args]
226
    if flags_argument is not None:
227
        arguments.append(flags_argument)
228
    return arguments
229

230
async def get_interestness(options, flags_argument):
231
    try:
232
        timeout = options.timeout if options.interested_in_timeouts else None
233
        cmd = mk_command_line(options, flags_argument)
234
        stdout = subprocess.PIPE if not options.verbose else None
235
        stderr = subprocess.PIPE if not options.verbose else None
236
        process = subprocess.run(cmd, stdout=stdout, stderr=stderr, timeout=timeout)
237
        return InterestnessResult.PASS if process.returncode == 0 else InterestnessResult.FAIL
238
    except subprocess.TimeoutExpired:
239
        return InterestnessResult.TIMED_OUT
240

241
async def is_hot(options, flags_argument, pred=any):
242
    results = await asyncio.gather(*[get_interestness(options, flags_argument) for _ in range(options.attempts)])
243

244
    if options.interested_in_timeouts:
245
        return pred([InterestnessResult.TIMED_OUT == x for x in results])
246
    else:
247
        return pred([(InterestnessResult.PASS if options.sense else InterestnessResult.FAIL) == x for x in results])
248

249
def pairwise_disjoints(flags, granularity):
250
    offset = 0
251
    per_slice_len = len(flags) // granularity
252
    while offset < len(flags):
253
        yield flags[offset:offset + per_slice_len]
254
        offset += per_slice_len
255

256
def subsets_and_complements(flags, granularity):
257
    for disjoint_set in pairwise_disjoints(flags, granularity):
258
        yield disjoint_set, [flag for flag in flags if flag not in disjoint_set]
259

260
# https://www.cs.purdue.edu/homes/xyzhang/fall07/Papers/delta-debugging.pdf
261
async def ddmin(options, initial_flags):
262
    current = initial_flags
263
    granularity = 2
264

265
    progress = Progress(len(current))
266
    progress.update(len(current))
267

268
    while len(current) >= 2:
269
        changed = False
270

271
        for (subset, complement) in subsets_and_complements(current, granularity):
272
            progress.update(len(current))
273
            if await is_hot(options, mk_flags_argument(options, complement, initial_flags)):
274
                current = complement
275
                granularity = max(granularity - 1, 2)
276
                changed = True
277
                break
278
            elif await is_hot(options, mk_flags_argument(options, subset, initial_flags)):
279
                current = subset
280
                granularity = 2
281
                changed = True
282
                break
283

284
        if not changed:
285
            if granularity == len(current):
286
                break
287
            granularity = min(granularity * 2, len(current))
288

289
    progress.hide()
290
    return current
291

292
async def ddmin_parallel(options, initial_flags):
293
    current = initial_flags
294
    granularity = 2
295

296
    progress = Progress(len(current))
297
    progress.update(len(current), granularity)
298

299
    while len(current) >= 2:
300
        changed = False
301

302
        subset_jobs = []
303
        complement_jobs = []
304

305
        def advance(task):
306
            nonlocal current
307
            nonlocal granularity
308
            nonlocal changed
309
            # task.cancel() calls the callback passed to add_done_callback...
310
            if task.cancelled():
311
                return
312
            hot, new_delta, new_granularity = task.result()
313
            if hot and not changed:
314
                current = new_delta
315
                granularity = new_granularity
316
                changed = True
317
                for job in subset_jobs:
318
                    job.cancel()
319
                for job in complement_jobs:
320
                    job.cancel()
321

322
        for (subset, complement) in subsets_and_complements(current, granularity):
323
            async def work(flags, new_granularity):
324
                hot = await is_hot(options, mk_flags_argument(options, flags, initial_flags))
325
                return (hot, flags, new_granularity)
326

327
            # We want to run subset jobs in parallel first.
328
            subset_job = asyncio.create_task(work(subset, 2))
329
            subset_job.add_done_callback(advance)
330
            subset_jobs.append(subset_job)
331

332
            # Then the complements afterwards, but only if we didn't find a new subset.
333
            complement_job = asyncio.create_task(work(complement, max(granularity - 1, 2)))
334
            complement_job.add_done_callback(advance)
335
            complement_jobs.append(complement_job)
336

337
        # When we cancel jobs, the asyncio.gather will be waiting pointlessly.
338
        # In that case, we'd like to return the control to this routine.
339
        await asyncio.gather(*subset_jobs, return_exceptions=True)
340
        if not changed:
341
            await asyncio.gather(*complement_jobs, return_exceptions=True)
342
        progress.update(len(current), granularity)
343

344
        if not changed:
345
            if granularity == len(current):
346
                break
347
            granularity = min(granularity * 2, len(current))
348

349
    progress.hide()
350
    return current
351

352
def search(options, initial_flags):
353
    if options.parallel:
354
        return ddmin_parallel(options, initial_flags)
355
    else:
356
        return ddmin(options, initial_flags)
357

358
async def do_work(args, other_args):
359
    sense = None
360

361
    # If --timeout isn't used, try to apply a heuristic to figure out which of --pass or --fail we want.
362
    if args.timeout == 0 and args.mode == InterestnessMode.AUTO:
363
        inner_options = Options(args, other_args, sense)
364

365
        # We aren't interested in timeout for this heuristic. It just makes no sense to assume timeouts.
366
        # This actually cannot happen by this point, but if we make timeout a non-exclusive switch to --auto, this will go wrong.
367
        inner_options.timeout = 0
368
        inner_options.interested_in_timeouts = False
369

370
        all_tasks = asyncio.gather(
371
            is_hot(inner_options.copy_with_sense(True), '--fflags=true', all),
372
            is_hot(inner_options.copy_with_sense(False), '--fflags=false' if inner_options.explicit else None, all),
373
        )
374

375
        # If it times out, we can print a message saying that this is still working. We intentionally want to continue doing work.
376
        done, pending = await asyncio.wait([all_tasks], timeout=1.5)
377
        if all_tasks not in done:
378
            print('Hang on! I\'m running your program to try and figure out which of --pass or --fail to use!')
379
            print('Need to find out faster? Cancel the work and explicitly write --pass or --fail')
380

381
        is_pass_hot, is_fail_hot = await all_tasks
382

383
        # This is a bit counter-intuitive, but the following table tells us which of the sense we want.
384
        # Because when you omit --fflags=true argument and it fails, then is_fail_hot is True.
385
        # Consequently, you need to use --pass to find out what that set of flags is. And vice versa.
386
        #
387
        # Also, when is_pass_hot is True and is_fail_hot is False, then that program is working as expected.
388
        # There should be no reason to run flag bisection.
389
        # However, this can be ambiguous in the opposite of the aforementioned outcome!
390
        #
391
        # is_pass_hot | is_fail_hot | is ambiguous?
392
        #-------------|-------------|---------------
393
        # True        | True        | No! Pick --pass.
394
        # False       | False       | No! Pick --fail.
395
        # True        | False       | No! But this is the exact situation where you shouldn't need to flag-bisect. Raise an error.
396
        # False       | True        | Yes! But we'll pragmatically pick --fail here in the hope it gives the correct set of flags.
397

398
        if is_pass_hot and not is_fail_hot:
399
            print('The tests seems to be working fine for me. If you really need to flag-bisect, please try again with an explicit --pass or --fail', file=sys.stderr)
400
            return 1
401

402
        if not is_pass_hot and is_fail_hot:
403
            print('I couldn\'t quite figure out which of --pass or --fail to use, but I\'ll carry on anyway')
404

405
        sense = is_pass_hot
406
        argument = '--pass' if sense else '--fail'
407
        print(f'I\'m bisecting flags as if {argument} was used')
408
    else:
409
        sense = True if args.mode == InterestnessMode.PASS else False
410

411
    options = Options(args, other_args, sense)
412

413
    initial_flags = list_fflags(options)
414
    if initial_flags is None:
415
        print('I cannot bisect flags with ' + options.path, file=sys.stderr)
416
        print('These are required for me to be able to cooperate:', file=sys.stderr)
417
        print('\t--list-fflags must print a list of flags separated by newlines, including FFlag prefix', file=sys.stderr)
418
        print('\t--fflags=... to accept a comma-separated pair of flag names and their value in the form FFlagFoo=true', file=sys.stderr)
419
        return 1
420

421
    # On Windows, there is an upper bound on the numbers of characters for a command line incantation.
422
    # If we don't handle this ourselves, the runtime error is going to look nothing like the actual problem.
423
    # It'd say "file name way too long" or something to that effect. We can teed up a better error message and
424
    # tell the user how to work around it by using --filter.
425
    if platform.system() == 'Windows':
426
        cmd_line = ' '.join(mk_command_line(options, mk_flags_argument(options, initial_flags, [])))
427
        if len(cmd_line) >= 8191:
428
            print(f'Never mind! The command line is too long because we have {len(initial_flags)} flags to test', file=sys.stderr)
429
            print('Consider using `--filter=<regex>` to narrow it down upfront, or use any version of WSL instead', file=sys.stderr)
430
            return 1
431

432
    hot_flags = await search(options, initial_flags)
433
    if hot_flags:
434
        print('I narrowed down to these flags:')
435
        print(textwrap.indent('\n'.join(hot_flags), prefix='\t'))
436

437
        # If we showed the command line in explicit mode, all flags would be listed here.
438
        # This would pollute the terminal with 3000 flags. We don't want that. Don't show it.
439
        # Ditto for when the number flags we bisected are equal.
440
        if not options.explicit and len(hot_flags) != len(initial_flags):
441
            print('$ ' + ' '.join(mk_command_line(options, mk_flags_argument(options, hot_flags, initial_flags))))
442

443
        return 0
444

445
    print('I found nothing, sorry', file=sys.stderr)
446
    return 1
447

448
def flag_bisect_main(args, other_args):
449
    return asyncio.run(do_work(args, other_args))
450

451
def main():
452
    parser = argparse.ArgumentParser(description=help(), epilog=epilog(), formatter_class=argparse.RawTextHelpFormatter)
453
    add_argument_parsers(parser)
454
    args, other_args = parser.parse_known_args()
455
    return flag_bisect_main(args, other_args)
456

457
if __name__ == '__main__':
458
    sys.exit(main())
459

460
Product

Resources

Company