Path: blob/master/drivers/md/dm-log-userspace-base.c
15109 views
/*1* Copyright (C) 2006-2009 Red Hat, Inc.2*3* This file is released under the LGPL.4*/56#include <linux/bio.h>7#include <linux/slab.h>8#include <linux/dm-dirty-log.h>9#include <linux/device-mapper.h>10#include <linux/dm-log-userspace.h>1112#include "dm-log-userspace-transfer.h"1314#define DM_LOG_USERSPACE_VSN "1.1.0"1516struct flush_entry {17int type;18region_t region;19struct list_head list;20};2122/*23* This limit on the number of mark and clear request is, to a degree,24* arbitrary. However, there is some basis for the choice in the limits25* imposed on the size of data payload by dm-log-userspace-transfer.c:26* dm_consult_userspace().27*/28#define MAX_FLUSH_GROUP_COUNT 322930struct log_c {31struct dm_target *ti;32uint32_t region_size;33region_t region_count;34uint64_t luid;35char uuid[DM_UUID_LEN];3637char *usr_argv_str;38uint32_t usr_argc;3940/*41* in_sync_hint gets set when doing is_remote_recovering. It42* represents the first region that needs recovery. IOW, the43* first zero bit of sync_bits. This can be useful for to limit44* traffic for calls like is_remote_recovering and get_resync_work,45* but be take care in its use for anything else.46*/47uint64_t in_sync_hint;4849/*50* Mark and clear requests are held until a flush is issued51* so that we can group, and thereby limit, the amount of52* network traffic between kernel and userspace. The 'flush_lock'53* is used to protect these lists.54*/55spinlock_t flush_lock;56struct list_head mark_list;57struct list_head clear_list;58};5960static mempool_t *flush_entry_pool;6162static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)63{64return kmalloc(sizeof(struct flush_entry), gfp_mask);65}6667static void flush_entry_free(void *element, void *pool_data)68{69kfree(element);70}7172static int userspace_do_request(struct log_c *lc, const char *uuid,73int request_type, char *data, size_t data_size,74char *rdata, size_t *rdata_size)75{76int r;7778/*79* If the server isn't there, -ESRCH is returned,80* and we must keep trying until the server is81* restored.82*/83retry:84r = dm_consult_userspace(uuid, lc->luid, request_type, data,85data_size, rdata, rdata_size);8687if (r != -ESRCH)88return r;8990DMERR(" Userspace log server not found.");91while (1) {92set_current_state(TASK_INTERRUPTIBLE);93schedule_timeout(2*HZ);94DMWARN("Attempting to contact userspace log server...");95r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,96lc->usr_argv_str,97strlen(lc->usr_argv_str) + 1,98NULL, NULL);99if (!r)100break;101}102DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");103r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,1040, NULL, NULL);105if (!r)106goto retry;107108DMERR("Error trying to resume userspace log: %d", r);109110return -ESRCH;111}112113static int build_constructor_string(struct dm_target *ti,114unsigned argc, char **argv,115char **ctr_str)116{117int i, str_size;118char *str = NULL;119120*ctr_str = NULL;121122for (i = 0, str_size = 0; i < argc; i++)123str_size += strlen(argv[i]) + 1; /* +1 for space between args */124125str_size += 20; /* Max number of chars in a printed u64 number */126127str = kzalloc(str_size, GFP_KERNEL);128if (!str) {129DMWARN("Unable to allocate memory for constructor string");130return -ENOMEM;131}132133str_size = sprintf(str, "%llu", (unsigned long long)ti->len);134for (i = 0; i < argc; i++)135str_size += sprintf(str + str_size, " %s", argv[i]);136137*ctr_str = str;138return str_size;139}140141/*142* userspace_ctr143*144* argv contains:145* <UUID> <other args>146* Where 'other args' is the userspace implementation specific log147* arguments. An example might be:148* <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]149*150* So, this module will strip off the <UUID> for identification purposes151* when communicating with userspace about a log; but will pass on everything152* else.153*/154static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,155unsigned argc, char **argv)156{157int r = 0;158int str_size;159char *ctr_str = NULL;160struct log_c *lc = NULL;161uint64_t rdata;162size_t rdata_size = sizeof(rdata);163164if (argc < 3) {165DMWARN("Too few arguments to userspace dirty log");166return -EINVAL;167}168169lc = kmalloc(sizeof(*lc), GFP_KERNEL);170if (!lc) {171DMWARN("Unable to allocate userspace log context.");172return -ENOMEM;173}174175/* The ptr value is sufficient for local unique id */176lc->luid = (unsigned long)lc;177178lc->ti = ti;179180if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {181DMWARN("UUID argument too long.");182kfree(lc);183return -EINVAL;184}185186strncpy(lc->uuid, argv[0], DM_UUID_LEN);187spin_lock_init(&lc->flush_lock);188INIT_LIST_HEAD(&lc->mark_list);189INIT_LIST_HEAD(&lc->clear_list);190191str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);192if (str_size < 0) {193kfree(lc);194return str_size;195}196197/* Send table string */198r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,199ctr_str, str_size, NULL, NULL);200201if (r < 0) {202if (r == -ESRCH)203DMERR("Userspace log server not found");204else205DMERR("Userspace log server failed to create log");206goto out;207}208209/* Since the region size does not change, get it now */210rdata_size = sizeof(rdata);211r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,212NULL, 0, (char *)&rdata, &rdata_size);213214if (r) {215DMERR("Failed to get region size of dirty log");216goto out;217}218219lc->region_size = (uint32_t)rdata;220lc->region_count = dm_sector_div_up(ti->len, lc->region_size);221222out:223if (r) {224kfree(lc);225kfree(ctr_str);226} else {227lc->usr_argv_str = ctr_str;228lc->usr_argc = argc;229log->context = lc;230}231232return r;233}234235static void userspace_dtr(struct dm_dirty_log *log)236{237struct log_c *lc = log->context;238239(void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,240NULL, 0,241NULL, NULL);242243kfree(lc->usr_argv_str);244kfree(lc);245246return;247}248249static int userspace_presuspend(struct dm_dirty_log *log)250{251int r;252struct log_c *lc = log->context;253254r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,255NULL, 0,256NULL, NULL);257258return r;259}260261static int userspace_postsuspend(struct dm_dirty_log *log)262{263int r;264struct log_c *lc = log->context;265266r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,267NULL, 0,268NULL, NULL);269270return r;271}272273static int userspace_resume(struct dm_dirty_log *log)274{275int r;276struct log_c *lc = log->context;277278lc->in_sync_hint = 0;279r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,280NULL, 0,281NULL, NULL);282283return r;284}285286static uint32_t userspace_get_region_size(struct dm_dirty_log *log)287{288struct log_c *lc = log->context;289290return lc->region_size;291}292293/*294* userspace_is_clean295*296* Check whether a region is clean. If there is any sort of297* failure when consulting the server, we return not clean.298*299* Returns: 1 if clean, 0 otherwise300*/301static int userspace_is_clean(struct dm_dirty_log *log, region_t region)302{303int r;304uint64_t region64 = (uint64_t)region;305int64_t is_clean;306size_t rdata_size;307struct log_c *lc = log->context;308309rdata_size = sizeof(is_clean);310r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,311(char *)®ion64, sizeof(region64),312(char *)&is_clean, &rdata_size);313314return (r) ? 0 : (int)is_clean;315}316317/*318* userspace_in_sync319*320* Check if the region is in-sync. If there is any sort321* of failure when consulting the server, we assume that322* the region is not in sync.323*324* If 'can_block' is set, return immediately325*326* Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK327*/328static int userspace_in_sync(struct dm_dirty_log *log, region_t region,329int can_block)330{331int r;332uint64_t region64 = region;333int64_t in_sync;334size_t rdata_size;335struct log_c *lc = log->context;336337/*338* We can never respond directly - even if in_sync_hint is339* set. This is because another machine could see a device340* failure and mark the region out-of-sync. If we don't go341* to userspace to ask, we might think the region is in-sync342* and allow a read to pick up data that is stale. (This is343* very unlikely if a device actually fails; but it is very344* likely if a connection to one device from one machine fails.)345*346* There still might be a problem if the mirror caches the region347* state as in-sync... but then this call would not be made. So,348* that is a mirror problem.349*/350if (!can_block)351return -EWOULDBLOCK;352353rdata_size = sizeof(in_sync);354r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,355(char *)®ion64, sizeof(region64),356(char *)&in_sync, &rdata_size);357return (r) ? 0 : (int)in_sync;358}359360static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)361{362int r = 0;363struct flush_entry *fe;364365list_for_each_entry(fe, flush_list, list) {366r = userspace_do_request(lc, lc->uuid, fe->type,367(char *)&fe->region,368sizeof(fe->region),369NULL, NULL);370if (r)371break;372}373374return r;375}376377static int flush_by_group(struct log_c *lc, struct list_head *flush_list)378{379int r = 0;380int count;381uint32_t type = 0;382struct flush_entry *fe, *tmp_fe;383LIST_HEAD(tmp_list);384uint64_t group[MAX_FLUSH_GROUP_COUNT];385386/*387* Group process the requests388*/389while (!list_empty(flush_list)) {390count = 0;391392list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {393group[count] = fe->region;394count++;395396list_del(&fe->list);397list_add(&fe->list, &tmp_list);398399type = fe->type;400if (count >= MAX_FLUSH_GROUP_COUNT)401break;402}403404r = userspace_do_request(lc, lc->uuid, type,405(char *)(group),406count * sizeof(uint64_t),407NULL, NULL);408if (r) {409/* Group send failed. Attempt one-by-one. */410list_splice_init(&tmp_list, flush_list);411r = flush_one_by_one(lc, flush_list);412break;413}414}415416/*417* Must collect flush_entrys that were successfully processed418* as a group so that they will be free'd by the caller.419*/420list_splice_init(&tmp_list, flush_list);421422return r;423}424425/*426* userspace_flush427*428* This function is ok to block.429* The flush happens in two stages. First, it sends all430* clear/mark requests that are on the list. Then it431* tells the server to commit them. This gives the432* server a chance to optimise the commit, instead of433* doing it for every request.434*435* Additionally, we could implement another thread that436* sends the requests up to the server - reducing the437* load on flush. Then the flush would have less in438* the list and be responsible for the finishing commit.439*440* Returns: 0 on success, < 0 on failure441*/442static int userspace_flush(struct dm_dirty_log *log)443{444int r = 0;445unsigned long flags;446struct log_c *lc = log->context;447LIST_HEAD(mark_list);448LIST_HEAD(clear_list);449struct flush_entry *fe, *tmp_fe;450451spin_lock_irqsave(&lc->flush_lock, flags);452list_splice_init(&lc->mark_list, &mark_list);453list_splice_init(&lc->clear_list, &clear_list);454spin_unlock_irqrestore(&lc->flush_lock, flags);455456if (list_empty(&mark_list) && list_empty(&clear_list))457return 0;458459r = flush_by_group(lc, &mark_list);460if (r)461goto fail;462463r = flush_by_group(lc, &clear_list);464if (r)465goto fail;466467r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,468NULL, 0, NULL, NULL);469470fail:471/*472* We can safely remove these entries, even if failure.473* Calling code will receive an error and will know that474* the log facility has failed.475*/476list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {477list_del(&fe->list);478mempool_free(fe, flush_entry_pool);479}480list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {481list_del(&fe->list);482mempool_free(fe, flush_entry_pool);483}484485if (r)486dm_table_event(lc->ti->table);487488return r;489}490491/*492* userspace_mark_region493*494* This function should avoid blocking unless absolutely required.495* (Memory allocation is valid for blocking.)496*/497static void userspace_mark_region(struct dm_dirty_log *log, region_t region)498{499unsigned long flags;500struct log_c *lc = log->context;501struct flush_entry *fe;502503/* Wait for an allocation, but _never_ fail */504fe = mempool_alloc(flush_entry_pool, GFP_NOIO);505BUG_ON(!fe);506507spin_lock_irqsave(&lc->flush_lock, flags);508fe->type = DM_ULOG_MARK_REGION;509fe->region = region;510list_add(&fe->list, &lc->mark_list);511spin_unlock_irqrestore(&lc->flush_lock, flags);512513return;514}515516/*517* userspace_clear_region518*519* This function must not block.520* So, the alloc can't block. In the worst case, it is ok to521* fail. It would simply mean we can't clear the region.522* Does nothing to current sync context, but does mean523* the region will be re-sync'ed on a reload of the mirror524* even though it is in-sync.525*/526static void userspace_clear_region(struct dm_dirty_log *log, region_t region)527{528unsigned long flags;529struct log_c *lc = log->context;530struct flush_entry *fe;531532/*533* If we fail to allocate, we skip the clearing of534* the region. This doesn't hurt us in any way, except535* to cause the region to be resync'ed when the536* device is activated next time.537*/538fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);539if (!fe) {540DMERR("Failed to allocate memory to clear region.");541return;542}543544spin_lock_irqsave(&lc->flush_lock, flags);545fe->type = DM_ULOG_CLEAR_REGION;546fe->region = region;547list_add(&fe->list, &lc->clear_list);548spin_unlock_irqrestore(&lc->flush_lock, flags);549550return;551}552553/*554* userspace_get_resync_work555*556* Get a region that needs recovery. It is valid to return557* an error for this function.558*559* Returns: 1 if region filled, 0 if no work, <0 on error560*/561static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)562{563int r;564size_t rdata_size;565struct log_c *lc = log->context;566struct {567int64_t i; /* 64-bit for mix arch compatibility */568region_t r;569} pkg;570571if (lc->in_sync_hint >= lc->region_count)572return 0;573574rdata_size = sizeof(pkg);575r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,576NULL, 0,577(char *)&pkg, &rdata_size);578579*region = pkg.r;580return (r) ? r : (int)pkg.i;581}582583/*584* userspace_set_region_sync585*586* Set the sync status of a given region. This function587* must not fail.588*/589static void userspace_set_region_sync(struct dm_dirty_log *log,590region_t region, int in_sync)591{592int r;593struct log_c *lc = log->context;594struct {595region_t r;596int64_t i;597} pkg;598599pkg.r = region;600pkg.i = (int64_t)in_sync;601602r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,603(char *)&pkg, sizeof(pkg),604NULL, NULL);605606/*607* It would be nice to be able to report failures.608* However, it is easy emough to detect and resolve.609*/610return;611}612613/*614* userspace_get_sync_count615*616* If there is any sort of failure when consulting the server,617* we assume that the sync count is zero.618*619* Returns: sync count on success, 0 on failure620*/621static region_t userspace_get_sync_count(struct dm_dirty_log *log)622{623int r;624size_t rdata_size;625uint64_t sync_count;626struct log_c *lc = log->context;627628rdata_size = sizeof(sync_count);629r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,630NULL, 0,631(char *)&sync_count, &rdata_size);632633if (r)634return 0;635636if (sync_count >= lc->region_count)637lc->in_sync_hint = lc->region_count;638639return (region_t)sync_count;640}641642/*643* userspace_status644*645* Returns: amount of space consumed646*/647static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,648char *result, unsigned maxlen)649{650int r = 0;651char *table_args;652size_t sz = (size_t)maxlen;653struct log_c *lc = log->context;654655switch (status_type) {656case STATUSTYPE_INFO:657r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,658NULL, 0,659result, &sz);660661if (r) {662sz = 0;663DMEMIT("%s 1 COM_FAILURE", log->type->name);664}665break;666case STATUSTYPE_TABLE:667sz = 0;668table_args = strchr(lc->usr_argv_str, ' ');669BUG_ON(!table_args); /* There will always be a ' ' */670table_args++;671672DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,673lc->uuid, table_args);674break;675}676return (r) ? 0 : (int)sz;677}678679/*680* userspace_is_remote_recovering681*682* Returns: 1 if region recovering, 0 otherwise683*/684static int userspace_is_remote_recovering(struct dm_dirty_log *log,685region_t region)686{687int r;688uint64_t region64 = region;689struct log_c *lc = log->context;690static unsigned long long limit;691struct {692int64_t is_recovering;693uint64_t in_sync_hint;694} pkg;695size_t rdata_size = sizeof(pkg);696697/*698* Once the mirror has been reported to be in-sync,699* it will never again ask for recovery work. So,700* we can safely say there is not a remote machine701* recovering if the device is in-sync. (in_sync_hint702* must be reset at resume time.)703*/704if (region < lc->in_sync_hint)705return 0;706else if (jiffies < limit)707return 1;708709limit = jiffies + (HZ / 4);710r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,711(char *)®ion64, sizeof(region64),712(char *)&pkg, &rdata_size);713if (r)714return 1;715716lc->in_sync_hint = pkg.in_sync_hint;717718return (int)pkg.is_recovering;719}720721static struct dm_dirty_log_type _userspace_type = {722.name = "userspace",723.module = THIS_MODULE,724.ctr = userspace_ctr,725.dtr = userspace_dtr,726.presuspend = userspace_presuspend,727.postsuspend = userspace_postsuspend,728.resume = userspace_resume,729.get_region_size = userspace_get_region_size,730.is_clean = userspace_is_clean,731.in_sync = userspace_in_sync,732.flush = userspace_flush,733.mark_region = userspace_mark_region,734.clear_region = userspace_clear_region,735.get_resync_work = userspace_get_resync_work,736.set_region_sync = userspace_set_region_sync,737.get_sync_count = userspace_get_sync_count,738.status = userspace_status,739.is_remote_recovering = userspace_is_remote_recovering,740};741742static int __init userspace_dirty_log_init(void)743{744int r = 0;745746flush_entry_pool = mempool_create(100, flush_entry_alloc,747flush_entry_free, NULL);748749if (!flush_entry_pool) {750DMWARN("Unable to create flush_entry_pool: No memory.");751return -ENOMEM;752}753754r = dm_ulog_tfr_init();755if (r) {756DMWARN("Unable to initialize userspace log communications");757mempool_destroy(flush_entry_pool);758return r;759}760761r = dm_dirty_log_type_register(&_userspace_type);762if (r) {763DMWARN("Couldn't register userspace dirty log type");764dm_ulog_tfr_exit();765mempool_destroy(flush_entry_pool);766return r;767}768769DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");770return 0;771}772773static void __exit userspace_dirty_log_exit(void)774{775dm_dirty_log_type_unregister(&_userspace_type);776dm_ulog_tfr_exit();777mempool_destroy(flush_entry_pool);778779DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");780return;781}782783module_init(userspace_dirty_log_init);784module_exit(userspace_dirty_log_exit);785786MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");787MODULE_AUTHOR("Jonathan Brassow <[email protected]>");788MODULE_LICENSE("GPL");789790791