CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/scripts/multi-diffbundler
Views: 687
#!/usr/bin/env python

import json, os, shutil, sys

# Short module-level names for the filesystem helpers used throughout
# this script; each one is exactly the os / os.path function it is named after
# (path_exists is os.path.exists).
from os import listdir
from os.path import exists as path_exists
from os.path import getsize, isdir, islink, join, splitext

from subprocess import Popen, PIPE

# Absolute path of this script with symlinks resolved; test() uses it to
# invoke the script as a subprocess.
multi_diffbundler = os.path.abspath(os.path.realpath(__file__))

# The single-repository 'diffbundler' program is looked up in the same
# directory as this script.
diffbundler = join(os.path.dirname(multi_diffbundler), 'diffbundler')

# When True, cmd() echoes each command and its captured output.
# Switched on by the --verbose command line flag.
verbose = False

def cmd(args, dry_run=False, wait=True, ignore_errors=False):
    if isinstance(args, str):
        shell = True
        if verbose: print args
    else:
        if verbose: print ' '.join(args)
        shell = False
    if dry_run:
        return
    out = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=shell)
    if wait:
        e = out.wait()
        stdout = out.stdout.read()
        stderr =  out.stderr.read()
        if verbose: print stdout,
        if verbose: print stderr,
        if e and not ignore_errors:
            sys.exit(e)
        return {'stdout':stdout, 'stderr':stderr}
    else:
        return out

def test():
    """
    Smoke test of the 'create' subcommand.

    Wipes and rebuilds a scratch git repository at /tmp/test that contains
    nested repositories x and x/y, then runs this script's 'create'
    subcommand against it after each of several commits.  Nothing is
    asserted; any failing command terminates the script through cmd().
    """
    target = '/tmp/test'  # hard-coded scratch location; it is deleted first
    bundles = join(target, '.git/salvus/bundles')
    save = [multi_diffbundler, 'create', target, bundles]
    commit_big_file = "cd x/y; head -c 15000000 /dev/urandom > big_file; git add big_file; git commit -a -m 'big file added'"

    # Start from an empty directory.
    cmd(["rm","-rf", target])
    cmd(["mkdir", "-p", target])
    os.chdir(target)

    # Outer repository, plus repositories nested one and two levels down.
    cmd("git init; echo 'foo'>123; git add 123; git commit -a -m 'init'")
    cmd("mkdir x; cd x; git init; touch abc; git add abc; git commit -a -m 'init'; git branch test")
    cmd("mkdir x/y; cd x/y; git init; touch abc; git add abc; git commit -a -m 'init'")
    cmd("git add .; git commit -a -m 'add subdirs'")

    # First save, then a small change followed by another save.
    cmd("mkdir -p .git/salvus/bundles/")
    cmd(save)
    cmd("touch abc; git add abc; git commit -a -m 'abc'")
    cmd(save)

    # Now make a big random file in x/y/ and add to repo
    cmd(commit_big_file)
    cmd(save)

    # And make another, which must force creation of a new bundle 1.diffbundle.
    cmd(commit_big_file)
    cmd(save)

    # Now immediately do another save; now the 1.diffbundle should not be considered new.
    cmd(save)

def cleanup_path(path):
    """
    Ensure that path is a directory and return its canonical form.

    A missing path is created (including intermediate directories).
    A path that exists but is not a directory raises ValueError.
    The return value is os.path.realpath(path).
    """
    if os.path.exists(path):
        if not os.path.isdir(path):
            raise ValueError("'%s' is not a directory"%path)
    else:
        os.makedirs(path)
    return os.path.realpath(path)

def newest_bundle_filename(path):
    """
    Return the full path of the highest-numbered bundle in a directory.

    INPUT:

    - path -- directory expected to hold files named '<integer>.diffbundle'

    OUTPUT:

    - join(path, '<n>.diffbundle') for the largest integer n found, or None
      if path does not exist or contains no such file.

    Subdirectories are skipped.  Files that are not named
    '<integer>.diffbundle' are skipped as well, so a stray file in the
    directory neither raises ValueError nor yields the name of a bundle
    that does not exist.
    """
    if not os.path.exists(path):
        return None
    newest = None
    for filename in listdir(path):
        if isdir(join(path, filename)):
            continue
        stem, ext = splitext(filename)
        if ext != '.diffbundle':
            continue
        try:
            number = int(stem)
        except ValueError:
            # e.g. 'notes.diffbundle' -- not a numbered bundle
            continue
        if newest is None or number > newest:
            newest = number
    if newest is None:
        return None
    return join(path, "%s.diffbundle"%newest)

def create_multidiffbundle(working_dir, bundler_dir, thresh, new_bundles):
    """
    Bundle the git repository at working_dir (if there is one) and,
    recursively, every repository found in the directories below it.

    INPUT:

    - working_dir -- directory to examine
    - bundler_dir -- directory holding the bundles for working_dir; bundles
      for a subdirectory 'name' go to join(bundler_dir, 'name')
    - thresh -- size in megabytes (10^6 bytes); when positive, a previous
      newest bundle smaller than this is replaced instead of being kept
      next to the new one
    - new_bundles -- list that is appended to in place with the path of each
      bundle that was written

    Side effects: changes the process's current directory to each
    repository visited, commits any outstanding changes there and runs
    'git gc'.
    """
    if path_exists(join(working_dir, '.git')):
        # Must be looked up before the chdir below.
        previous = newest_bundle_filename(bundler_dir)

        # "git gc" before bundling is *very* important, since
        # otherwise the bundles mushroom for no good reason.
        os.chdir(working_dir)
        cmd(['git', 'commit', '-a', '-m', 'Salvus save'], ignore_errors=True)
        cmd(['git', 'gc'])

        # Try making a new bundle first
        create_args = [diffbundler, 'create', working_dir, bundler_dir]
        cmd(create_args, dry_run=False)
        latest = newest_bundle_filename(bundler_dir)
        if latest != previous:
            # There were changes to the repo that we had not already bundled up.
            # First, check if we should merge the last two bundles.
            merge = (thresh > 0 and previous is not None
                     and getsize(previous)/1000000.0 < thresh)
            if merge:
                # Drop both and bundle again; presumably diffbundler then
                # writes one bundle under the previous name -- confirm
                # against the diffbundler script.
                os.unlink(previous)
                os.unlink(latest)
                cmd(create_args, dry_run=False)
                new_bundles.append(previous)
            else:
                new_bundles.append(latest)

    # Descend into every real subdirectory; symbolic links are not followed.
    for entry in listdir(working_dir):
        child = join(working_dir, entry)
        if islink(child) or not isdir(child):
            continue
        create_multidiffbundle(child, join(bundler_dir, entry), thresh, new_bundles)

def extract_multidiffbundle(bundle_dir, working_dir):
    print bundle_dir, working_dir
    if os.path.exists(working_dir):
        cmd([diffbundler, 'extract', bundle_dir, working_dir], dry_run=False)
    for path in listdir(bundle_dir):
        if isdir(join(bundle_dir, path)):
            extract_multidiffbundle(join(bundle_dir, path), join(working_dir, path))

#########################################################################################################


if __name__ == '__main__':
    # Command line entry point with three subcommands:
    #   create  -- bundle a repository tree, printing the new bundle paths as JSON
    #   extract -- rebuild a repository tree from a bundle tree
    #   test    -- run the smoke test in test()
    import argparse

    parser = argparse.ArgumentParser(description=
            'Utility script for managing git repositories that may contain other git repositories, stored in diffbundles')

    parser.add_argument('--thresh', help='threshhold in megabytes for creating new bundles', default=5, type=int, dest='thresh')
    parser.add_argument('--verbose', help='be more verbose (default: False)', action="store_true")

    subparsers = parser.add_subparsers()

    test_parser = subparsers.add_parser('test', help='run a test (involves creating a directory /tmp/test/)')

    create_parser = subparsers.add_parser('create',
            help='update new multi-diffbundle path')
    create_parser.add_argument('repodir', type=str,
            help='repository to bundle')
    create_parser.add_argument('bundledir', type=str,
            help='directory to place new diffbundle')

    extract_parser = subparsers.add_parser('extract',
            help='extract a repository with nested git repos from a multi-diffbundle path')
    extract_parser.add_argument('bundledir', type=str,
            help='directory with diffbundles')
    extract_parser.add_argument('extractdir', type=str,
            help='output directory')

    args = parser.parse_args()

    if args.verbose:
        verbose = True

    # The subcommand is recognised by which positional arguments were parsed.
    if 'repodir' in args:
        bundledir = cleanup_path(args.bundledir)
        new_bundles = []
        create_multidiffbundle(cleanup_path(args.repodir), bundledir, args.thresh, new_bundles)
        # Report the new bundles relative to bundledir: strip that prefix
        # and the path separator that follows it.
        n = len(bundledir)
        new_bundles = [f[n+1:] for f in new_bundles]
        print(json.dumps(new_bundles, separators=(',',':')))
    elif 'extractdir' in args:
        bundledir = cleanup_path(args.bundledir)
        extract_multidiffbundle(bundledir, cleanup_path(args.extractdir))
    elif 'test' in sys.argv:
        # The 'test' subparser defines no arguments of its own, so it is
        # detected from the raw command line instead.
        test()
    else:
        raise RuntimeError('command line arguments not properly processed')