Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
aws
GitHub Repository: aws/aws-cli
Path: blob/develop/scripts/ci/run-benchmark
1566 views
#!/usr/bin/env python
"""Script to benchmark several high level cli commands.

As of now this benchmarks `cp` and `rm` with test cases for multiple 4kb files
(default 10000 files) and a single large file (default 10gb, `cp` only).
"""
import os
import json
from subprocess import check_call, Popen, PIPE
from datetime import datetime
import random
import argparse
import inspect
import shutil
import platform

import awscli
import s3transfer


# Names of the per-case subdirectories (used locally and as S3 prefixes).
MANY_FILES_DIR = 'many'
LARGE_FILE_DIR = 'large'

# Bucket to benchmark against; None when PERF_TEST_BUCKET is not set.
TEST_BUCKET = os.environ.get('PERF_TEST_BUCKET')

# Two directory levels above the directory that contains this script.
REPO_ROOT = os.path.abspath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir))

# Scratch area for generated files and results; PERF_WORKDIR overrides it.
WORKDIR = os.environ.get('PERF_WORKDIR', os.path.join(REPO_ROOT, 'workdir'))


def run(command):
    """Execute ``command`` through the shell and return its exit status.

    ``check_call`` raises ``CalledProcessError`` when the command exits
    with a non-zero status, so a normal return always yields ``0``.
    """
    exit_status = check_call(command, shell=True)
    return exit_status


def generate_run_id():
    """Build an identifier from the current local time and a random number.

    The result looks like ``YYYY-MM-DD-HH-MM-SS-<n>`` where ``n`` is a
    random integer between 1 and 10000 inclusive.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    suffix = random.randint(1, 10000)
    return '%s-%d' % (timestamp, suffix)


def initialize_files(num_files, file_size):
    """Generate the local files used by the benchmarks, if not yet present.

    Two directories are populated under ``WORKDIR`` with ``caf gen``:
    ``MANY_FILES_DIR`` receives up to ``num_files`` files of 4kb each and
    ``LARGE_FILE_DIR`` receives a single file of ``file_size``.

    :param num_files: Passed to ``caf gen`` as ``--max-files`` for the
        many-files case.
    :param file_size: Passed to ``caf gen`` as ``--file-size`` for the
        large-file case (a string such as ``10gb``).
    """
    # TODO: We probably need to recreate these files each time.
    # Because you can specify --num-files and --large-file-size
    # those arguments can be potentially ignored if you've run
    # this previously with different values.
    _generate_files_once(
        os.path.join(WORKDIR, MANY_FILES_DIR), '4kb', num_files)
    _generate_files_once(
        os.path.join(WORKDIR, LARGE_FILE_DIR), file_size, 1)


def _generate_files_once(directory, file_size, max_files):
    """Create ``directory`` and fill it via ``caf gen`` unless it exists."""
    if os.path.exists(directory):
        return
    os.makedirs(directory)
    try:
        run('caf gen --file-size %s --max-files %s --directory %s' %
            (file_size, max_files, directory))
    except BaseException:
        # The existence of the directory is what marks the files as
        # already generated.  If generation fails or is interrupted, an
        # empty or partially filled directory would make every later run
        # skip generation, so remove it before propagating the error.
        shutil.rmtree(directory, ignore_errors=True)
        raise


def write_metadata_file(filename):
    """Write the collected run metadata to ``filename`` as indented JSON."""
    info = _collect_metadata()
    with open(filename, 'w') as out:
        serialized = json.dumps(info, indent=2)
        out.write(serialized)


def _collect_metadata():
    """Return a dict describing the environment the benchmark runs in.

    Recording the interpreter, the operating system and the exact
    aws-cli/s3transfer versions makes results from different runs
    easier to compare meaningfully.
    """
    metadata = {}
    metadata['python_version'] = platform.python_version()
    metadata['os'] = '%s/%s' % (platform.system(), platform.release())
    for package in (awscli, s3transfer):
        _inject_package_info(package, metadata)
    return metadata


def _inject_package_info(package, metadata):
    """Add ``<name>_version`` and ``<name>_git_version`` to ``metadata``.

    ``<name>`` is the package's ``__name__``; ``metadata`` is modified
    in place.
    """
    prefix = package.__name__
    metadata['%s_version' % prefix] = package.__version__
    metadata['%s_git_version' % prefix] = _get_git_version(package)


def _get_git_version(package):
    """Return ``'<sha> (<branch>)'`` for the checkout containing ``package``.

    Both values come from running ``git rev-parse`` in the directory that
    holds the package's source file.  The exit status of git is not
    checked, so whatever git wrote to stdout (possibly nothing) is used
    as-is.

    :param package: An imported module or package object.
    :returns: A text string of the form ``'<sha> (<branch>)'``.
    """
    dname = os.path.dirname(inspect.getfile(package))

    def git_output(command):
        # universal_newlines=True makes communicate() return text instead
        # of bytes on Python 3.  Without it the formatted result would be
        # "b'<sha>' (b'<branch>')" rather than "<sha> (<branch>)".
        process = Popen(
            command, cwd=dname, shell=True, stdout=PIPE,
            universal_newlines=True)
        return process.communicate()[0].strip()

    git_sha = git_output('git rev-parse HEAD')
    git_branch = git_output('git rev-parse --abbrev-ref HEAD')
    return '%s (%s)' % (git_sha, git_branch)


def main(args):
    """Prepare the local files, run every benchmark and report the run id.

    Results are written beneath ``WORKDIR/results/<run id>``.  When the
    benchmarks raise, that directory is deleted again before the error is
    re-raised.

    :param args: Parsed command line arguments providing ``num_files``,
        ``large_file_size``, ``bucket`` and ``num_iterations``.
    """
    initialize_files(args.num_files, args.large_file_size)

    run_id = generate_run_id()
    output_dir = os.path.join(WORKDIR, 'results', run_id)
    os.makedirs(output_dir)
    metadata_path = os.path.join(output_dir, 'metadata.json')
    write_metadata_file(metadata_path)

    try:
        benchmark(args.bucket, output_dir, args.num_iterations)
        print("RUN ID: " + run_id)
    except Exception:
        # Do not leave incomplete results behind for a failed run.
        shutil.rmtree(output_dir)
        raise


def benchmark(bucket, results_dir, num_iterations=1):
    """Run the cp/rm benchmark scripts for the many-files and large-file cases.

    Every benchmark writes to its own freshly created subdirectory of
    ``results_dir``.  After each case the remote prefix it used is removed
    with ``aws s3 rm``, whether or not the case succeeded.

    :param bucket: S3 location to which the per-case prefix is appended.
    :param results_dir: Directory that receives one subdirectory per
        benchmark.
    :param num_iterations: Forwarded to each benchmark script as
        ``--num-iterations``.
    """
    scripts_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    perf_dir = os.path.join(scripts_dir, 'performance')
    benchmark_cp = os.path.join(perf_dir, 'benchmark-cp')
    benchmark_rm = os.path.join(perf_dir, 'benchmark-rm')

    def new_result_dir(name):
        path = os.path.join(results_dir, name)
        os.makedirs(path)
        return path

    def timed_cp(name, source, dest, extra_flags=''):
        out_dir = new_result_dir(name)
        run('%s --recursive --num-iterations %s '
            '--source %s --dest %s --result-dir %s%s' % (
                benchmark_cp, num_iterations, source, dest, out_dir,
                extra_flags))

    def timed_rm(name, target):
        out_dir = new_result_dir(name)
        run('%s --recursive --num-iterations %s '
            '--target %s --result-dir %s' % (
                benchmark_rm, num_iterations, target, out_dir))

    def purge(remote):
        run('aws s3 rm --recursive ' + remote)

    # Many small files: upload, download, then delete.
    many_remote = bucket + '/' + MANY_FILES_DIR
    many_local = os.path.join(WORKDIR, MANY_FILES_DIR)
    try:
        timed_cp('upload-10k-small', many_local, many_remote,
                 ' --no-cleanup')
        timed_cp('download-10k-small', many_remote, many_local)
        timed_rm('delete-10k-small', many_remote)
    finally:
        # The delete benchmark puts back the files it removed once it is
        # done, so the uploaded files still have to be cleaned up
        # explicitly here.
        purge(many_remote)

    # Single large file: upload and download only.
    large_remote = bucket + '/' + LARGE_FILE_DIR
    large_local = os.path.join(WORKDIR, LARGE_FILE_DIR)
    try:
        timed_cp('upload-10gb', large_local, large_remote, ' --no-cleanup')
        timed_cp('download-10gb', large_remote, large_local)
    finally:
        # Removing one object is a single call, so rm is not benchmarked
        # for this case; this only cleans up.
        purge(large_remote)


def s3_uri(value):
    """Return ``value`` as an s3 uri, adding the ``s3://`` scheme if absent."""
    scheme = 's3://'
    return value if value.startswith(scheme) else scheme + value


if __name__ == "__main__":
    # Command line entry point: declare the options, then hand the parsed
    # arguments to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-n', '--num-iterations',
        type=int,
        default=10,
        help='The number of times to run each test.',
    )
    # The bucket only has to be given explicitly when the environment did
    # not already provide one.
    bucket_is_required = TEST_BUCKET is None
    cli.add_argument(
        '-b', '--bucket',
        type=s3_uri,
        default=TEST_BUCKET,
        required=bucket_is_required,
        help='The bucket to use for testing as an s3 uri. '
             'This can also be set by the environment variable '
             'PERF_TEST_BUCKET. If the environment variable is not set, '
             'then this argument is required.',
    )
    cli.add_argument(
        '--num-files',
        type=int,
        default=10000,
        help='The number of files to use for the multiple file case.',
    )
    cli.add_argument(
        '--large-file-size',
        default='10gb',
        help='The file size for the large file case. '
             'This can be in the form 10gb, 4kb, etc.',
    )
    main(cli.parse_args())