#!/usr/bin/env python
"""Script to benchmark several high-level CLI commands.

Currently this benchmarks `cp` and `rm`, with test cases for many small 4kb
files (default 10000 files) and a single large file (default 10gb, `cp` only).
"""
import os
import json
from subprocess import check_call, Popen, PIPE
from datetime import datetime
import random
import argparse
import inspect
import shutil
import platform

import awscli
import s3transfer


TEST_BUCKET = os.environ.get('PERF_TEST_BUCKET')
REPO_ROOT = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
WORKDIR = os.environ.get('PERF_WORKDIR', os.path.join(REPO_ROOT, 'workdir'))
MANY_FILES_DIR = 'many'
LARGE_FILE_DIR = 'large'


def run(command):
    return check_call(command, shell=True)


def generate_run_id():
    run_id = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-")
    run_id += str(random.randint(1, 10000))
    return run_id
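
# Run ids look like '2017-11-09-18-31-42-5913' (this value is illustrative):
# a timestamp plus a random suffix to distinguish runs started in the same
# second.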


def initialize_files(num_files, file_size):
    # TODO: We probably need to recreate these files each time,
    # because --num-files and --large-file-size are effectively
    # ignored if you've previously run this with different values
    # (the existing files are reused as-is).
    many_files_dir = os.path.join(WORKDIR, MANY_FILES_DIR)
    if not os.path.exists(many_files_dir):
        os.makedirs(many_files_dir)
    run('caf gen --file-size 4kb --max-files %s --directory %s' %
        (num_files, many_files_dir))
    large_file_dir = os.path.join(WORKDIR, LARGE_FILE_DIR)
    if not os.path.exists(large_file_dir):
        os.makedirs(large_file_dir)
    run('caf gen --file-size %s --max-files 1 --directory %s' %
        (file_size, large_file_dir))
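
# After initialize_files() runs, the working directory contains (the file
# names themselves are whatever `caf gen` produces):
#
#   <WORKDIR>/many/   num_files files of 4kb each
#   <WORKDIR>/large/  a single file of file_size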


def write_metadata_file(filename):
    metadata = _collect_metadata()
    with open(filename, 'w') as f:
        f.write(json.dumps(metadata, indent=2))


def _collect_metadata():
    # We want to track things like the machine where the benchmark was run
    # and which versions of the aws-cli/s3transfer code were used.
    # This helps us make more meaningful comparisons.
    metadata = {
        'python_version': platform.python_version(),
        'os': '%s/%s' % (platform.system(), platform.release()),
    }
    _inject_package_info(awscli, metadata)
    _inject_package_info(s3transfer, metadata)
    return metadata
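
# _collect_metadata() returns something like the following (the version and
# git values below are illustrative):
#
#   {
#       "python_version": "2.7.12",
#       "os": "Linux/4.4.0-96-generic",
#       "awscli_version": "1.11.100",
#       "awscli_git_version": "1a2b3c4d... (develop)",
#       "s3transfer_version": "0.1.10",
#       "s3transfer_git_version": "5e6f7a8b... (develop)"
#   }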


def _inject_package_info(package, metadata):
    name = package.__name__
    metadata[name + '_version'] = package.__version__
    metadata[name + '_git_version'] = _get_git_version(package)


def _get_git_version(package):
    dname = os.path.dirname(inspect.getfile(package))
    # universal_newlines=True makes communicate() return text rather than
    # bytes on Python 3, so the string formatting below works on both
    # Python 2 and 3.
    git_sha = Popen(
        'git rev-parse HEAD', cwd=dname, shell=True, stdout=PIPE,
        universal_newlines=True).communicate()[0].strip()
    git_branch = Popen(
        'git rev-parse --abbrev-ref HEAD', cwd=dname, shell=True,
        stdout=PIPE, universal_newlines=True).communicate()[0].strip()
    return '%s (%s)' % (git_sha, git_branch)
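
# _get_git_version() returns the commit SHA followed by the branch name in
# parentheses, e.g. '1a2b3c4d... (develop)' (the values are illustrative).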


def main(args):
    initialize_files(args.num_files, args.large_file_size)
    run_id = generate_run_id()
    results_dir = os.path.join(WORKDIR, 'results', run_id)
    os.makedirs(results_dir)
    write_metadata_file(os.path.join(results_dir, 'metadata.json'))
    try:
        benchmark(args.bucket, results_dir, args.num_iterations)
        print("RUN ID: " + run_id)
    except Exception:
        # Don't keep partial results around if the benchmark failed.
        shutil.rmtree(results_dir)
        raise


def benchmark(bucket, results_dir, num_iterations=1):
    perf_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    perf_dir = os.path.join(perf_dir, 'performance')
    benchmark_cp = os.path.join(perf_dir, 'benchmark-cp')
    benchmark_rm = os.path.join(perf_dir, 'benchmark-rm')
    s3_location = bucket + '/' + MANY_FILES_DIR
    local_dir = os.path.join(WORKDIR, MANY_FILES_DIR)
    try:
        # 10k upload
        results = os.path.join(results_dir, 'upload-10k-small')
        os.makedirs(results)
        run(benchmark_cp + ' --recursive --num-iterations %s '
            '--source %s --dest %s --result-dir %s --no-cleanup' % (
                num_iterations, local_dir, s3_location, results))
        # 10k download
        results = os.path.join(results_dir, 'download-10k-small')
        os.makedirs(results)
        run(benchmark_cp + ' --recursive --num-iterations %s '
            '--source %s --dest %s --result-dir %s' % (
                num_iterations, s3_location, local_dir, results))
        # 10k rm
        results = os.path.join(results_dir, 'delete-10k-small')
        os.makedirs(results)
        run(benchmark_rm + ' --recursive --num-iterations %s '
            '--target %s --result-dir %s' % (
                num_iterations, s3_location, results))
    finally:
        # Note that the delete-10k-small benchmark restores the files it
        # deleted once it finishes, so we need to explicitly clean up any
        # files we've uploaded.
        run('aws s3 rm --recursive ' + s3_location)
    s3_location = bucket + '/' + LARGE_FILE_DIR
    local_dir = os.path.join(WORKDIR, LARGE_FILE_DIR)
    try:
        # large file upload (10gb by default)
        results = os.path.join(results_dir, 'upload-10gb')
        os.makedirs(results)
        run(benchmark_cp + ' --recursive --num-iterations %s '
            '--source %s --dest %s --result-dir %s --no-cleanup' % (
                num_iterations, local_dir, s3_location, results))
        # large file download
        results = os.path.join(results_dir, 'download-10gb')
        os.makedirs(results)
        run(benchmark_cp + ' --recursive --num-iterations %s '
            '--source %s --dest %s --result-dir %s' % (
                num_iterations, s3_location, local_dir, results))
    finally:
        # We don't benchmark deleting the large file since that's just a
        # single `rm` call; we simply clean it up here instead.
        run('aws s3 rm --recursive ' + s3_location)
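
# A successful run leaves its results under <WORKDIR>/results/<run-id>/,
# containing metadata.json plus one subdirectory per benchmark:
# upload-10k-small, download-10k-small, delete-10k-small, upload-10gb,
# and download-10gb.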


def s3_uri(value):
    if not value.startswith('s3://'):
        return 's3://' + value
    return value
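
# For example, s3_uri('mybucket') returns 's3://mybucket', while a value
# that already has the scheme, like 's3://mybucket', is returned unchanged.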
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'-n', '--num-iterations', type=int, default=10,
help='The number of times to run each test.'
)
parser.add_argument(
'-b', '--bucket', default=TEST_BUCKET, type=s3_uri,
required=TEST_BUCKET is None,
help='The bucket to use for testing as an s3 uri. This can also be '
'set by the environment variable PERF_TEST_BUCKET. If the '
'environment variable is not set, then this argument is required.'
)
parser.add_argument(
'--num-files', default=10000, type=int,
help='The number of files to use for the multiple file case.'
)
parser.add_argument(
'--large-file-size', default='10gb',
help='The file size for the large file case. This can be in the form '
'10gb, 4kb, etc.'
)
main(parser.parse_args())