// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2014 Intel Mobile Communications GmbH
 * Copyright(c) 2015 Intel Deutschland GmbH
 *
 * Author: Johannes Berg <[email protected]>
 */
#include <linux/module.h>
#include <linux/device.h>
#include <linux/devcoredump.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/workqueue.h>

static struct class devcd_class;

/* global disable flag, for security purposes */
static bool devcd_disabled;

struct devcd_entry {
	struct device devcd_dev;
	void *data;
	size_t datalen;
	/*
	 * There are 2 races for which the mutex is required.
	 *
	 * The first race is between device creation and userspace writing to
	 * the data attribute to schedule immediate destruction.
	 *
	 * This race is handled by arming the timer before device creation,
	 * but when device creation fails the timer still exists.
	 *
	 * To solve this, hold the mutex during device_add(), and set
	 * init_completed on success before releasing the mutex.
	 *
	 * That way the timer will never fire until device_add() is called;
	 * it will do nothing if init_completed is not set. The timer is also
	 * cancelled in that case.
	 *
	 * The second race involves multiple parallel invocations of
	 * devcd_free(); add a deleted flag so only one of them can call the
	 * destructor.
	 */
	struct mutex mutex;
	bool init_completed, deleted;
	struct module *owner;
	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
			void *data, size_t datalen);
	void (*free)(void *data);
	/*
	 * If nothing interferes and device_add() returns success,
	 * del_wk will destroy the device after the timer fires.
	 *
	 * Multiple userspace processes can interfere in the working of the
	 * timer:
	 * - Writing to the coredump will reschedule the timer to run
	 *   immediately, if still armed.
	 *
	 *   This is handled by using "if (cancel_delayed_work()) {
	 *   schedule_delayed_work() }", to prevent re-arming after having
	 *   been previously fired.
	 * - Writing to /sys/class/devcoredump/disabled will destroy the
	 *   coredump synchronously.
	 *   This is handled by using disable_delayed_work_sync(), and then
	 *   checking if the deleted flag is set with &devcd->mutex held.
	 */
	struct delayed_work del_wk;
	struct device *failing_dev;
};

/* Map a devcd_dev member pointer back to its containing devcd_entry. */
static struct devcd_entry *dev_to_devcd(struct device *dev)
{
	return container_of(dev, struct devcd_entry, devcd_dev);
}

/*
 * Final release of the coredump device: frees the dump data via the
 * provider's callback, drops the module and failing-device references,
 * and frees the entry itself.
 */
static void devcd_dev_release(struct device *dev)
{
	struct devcd_entry *devcd = dev_to_devcd(dev);

	devcd->free(devcd->data);
	module_put(devcd->owner);

	/*
	 * this seems racy, but I don't see a notifier or such on
	 * a struct device to know when it goes away?
	 */
	if (devcd->failing_dev->kobj.sd)
		sysfs_delete_link(&devcd->failing_dev->kobj, &dev->kobj,
				  "devcoredump");

	put_device(devcd->failing_dev);
	kfree(devcd);
}

/*
 * Tear down the coredump device. Callers must guarantee single invocation;
 * the deleted flag (set under devcd->mutex) is how devcd_free() ensures
 * it doesn't race with this running from the timer.
 */
static void __devcd_del(struct devcd_entry *devcd)
{
	devcd->deleted = true;
	device_del(&devcd->devcd_dev);
	put_device(&devcd->devcd_dev);
}

/* Delayed-work handler: destroys the coredump once its timeout expires. */
static void devcd_del(struct work_struct *wk)
{
	struct devcd_entry *devcd;
	bool init_completed;

	devcd = container_of(wk, struct devcd_entry, del_wk.work);

	/* devcd->mutex serializes against dev_coredumpm_timeout */
	mutex_lock(&devcd->mutex);
	init_completed = devcd->init_completed;
	mutex_unlock(&devcd->mutex);

	/* do nothing if device_add() hasn't succeeded yet; see struct docs */
	if (init_completed)
		__devcd_del(devcd);
}

/* sysfs "data" read: forwards to the dump provider's read callback. */
static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj,
			       const struct bin_attribute *bin_attr,
			       char *buffer, loff_t offset, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct devcd_entry *devcd = dev_to_devcd(dev);

	return devcd->read(buffer, offset, count, devcd->data, devcd->datalen);
}

/*
 * sysfs "data" write: any write marks the dump as consumed and schedules
 * its immediate destruction (if the timeout hasn't already fired).
 */
static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
				const struct bin_attribute *bin_attr,
				char *buffer, loff_t offset, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct devcd_entry *devcd = dev_to_devcd(dev);

	/*
	 * Although it's tempting to use mod_delayed work here,
	 * that will cause a reschedule if the timer already fired.
	 */
	if (cancel_delayed_work(&devcd->del_wk))
		schedule_delayed_work(&devcd->del_wk, 0);

	return count;
}

static const struct bin_attribute devcd_attr_data =
	__BIN_ATTR(data, 0600, devcd_data_read, devcd_data_write, 0);

static const struct bin_attribute *const devcd_dev_bin_attrs[] = {
	&devcd_attr_data, NULL,
};

static const struct attribute_group devcd_dev_group = {
	.bin_attrs = devcd_dev_bin_attrs,
};

static const struct attribute_group *devcd_dev_groups[] = {
	&devcd_dev_group, NULL,
};

/*
 * Synchronously destroy one coredump device. Used as a
 * class_for_each_device() callback and by dev_coredump_put().
 */
static int devcd_free(struct device *dev, void *data)
{
	struct devcd_entry *devcd = dev_to_devcd(dev);

	/*
	 * To prevent a race with devcd_data_write(), disable the work and
	 * complete the teardown manually instead.
	 *
	 * We cannot rely on the return value of
	 * disable_delayed_work_sync() here, because it might be in the
	 * middle of a cancel_delayed_work + schedule_delayed_work pair.
	 *
	 * devcd->mutex here guards against multiple parallel invocations
	 * of devcd_free().
	 */
	disable_delayed_work_sync(&devcd->del_wk);
	mutex_lock(&devcd->mutex);
	if (!devcd->deleted)
		__devcd_del(devcd);
	mutex_unlock(&devcd->mutex);
	return 0;
}

/* Show the global disable flag (/sys/class/devcoredump/disabled). */
static ssize_t disabled_show(const struct class *class, const struct class_attribute *attr,
			     char *buf)
{
	return sysfs_emit(buf, "%d\n", devcd_disabled);
}

/*
 *
 *         disabled_store()                               worker()
 *         class_for_each_device(&devcd_class,
 *                               NULL, NULL, devcd_free)
 *         ...
 *         ...
 *         while ((dev = class_dev_iter_next(&iter))
 *                                                        devcd_del()
 *                                                        device_del()
 *                                                        put_device() <- last reference
 *             error = fn(dev, data)                      devcd_dev_release()
 *             devcd_free(dev, data)                      kfree(devcd)
 *
 *
 * In the above diagram, it looks like disabled_store() would be racing with a
 * concurrently running devcd_del() and result in a memory abort after dropping
 * its last reference with put_device(). However, this will not happen, as
 * fn(dev, data) runs with its own reference to the device via klist_node, so
 * that is not the last reference; the above situation therefore cannot occur.
 */

/*
 * Store handler for the "disabled" class attribute: writing 1 permanently
 * disables devcoredump and synchronously destroys all existing dumps.
 */
static ssize_t disabled_store(const struct class *class, const struct class_attribute *attr,
			      const char *buf, size_t count)
{
	long tmp = simple_strtol(buf, NULL, 10);

	/*
	 * This essentially makes the attribute write-once, since you can't
	 * go back to not having it disabled. This is intentional, it serves
	 * as a system lockdown feature.
	 */
	if (tmp != 1)
		return -EINVAL;

	devcd_disabled = true;

	class_for_each_device(&devcd_class, NULL, NULL, devcd_free);

	return count;
}
static CLASS_ATTR_RW(disabled);

static struct attribute *devcd_class_attrs[] = {
	&class_attr_disabled.attr,
	NULL,
};
ATTRIBUTE_GROUPS(devcd_class);

static struct class devcd_class = {
	.name		= "devcoredump",
	.dev_release	= devcd_dev_release,
	.dev_groups	= devcd_dev_groups,
	.class_groups	= devcd_class_groups,
};

/* Default read callback for vmalloc'ed dumps (used by dev_coredumpv()). */
static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
			   void *data, size_t datalen)
{
	return memory_read_from_buffer(buffer, count, &offset, data, datalen);
}

/* Default free callback for vmalloc'ed dumps (used by dev_coredumpv()). */
static void devcd_freev(void *data)
{
	vfree(data);
}

/**
 * dev_coredumpv - create device coredump with vmalloc data
 * @dev: the struct device for the crashed device
 * @data: vmalloc data containing the device coredump
 * @datalen: length of the data
 * @gfp: allocation flags
 *
 * This function takes ownership of the vmalloc'ed data and will free
 * it when it is no longer used. See dev_coredumpm() for more information.
 */
void dev_coredumpv(struct device *dev, void *data, size_t datalen,
		   gfp_t gfp)
{
	dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, devcd_freev);
}
EXPORT_SYMBOL_GPL(dev_coredumpv);

/*
 * class_find_device() match callback: true if this coredump belongs to
 * the given failing device.
 */
static int devcd_match_failing(struct device *dev, const void *failing)
{
	struct devcd_entry *devcd = dev_to_devcd(dev);

	return devcd->failing_dev == failing;
}

/**
 * devcd_free_sgtable - free all the memory of the given scatterlist table
 * (i.e. both pages and scatterlist instances)
 * NOTE: if two tables allocated with devcd_alloc_sgtable and then chained
 * using the sg_chain function then that function should be called only once
 * on the chained table
 * @data: pointer to sg_table to free
 */
static void devcd_free_sgtable(void *data)
{
	_devcd_free_sgtable(data);
}

/**
 * devcd_read_from_sgtable - copy data from sg_table to a given buffer
 * and return the number of bytes read
 * @buffer: the buffer to copy the data to it
 * @buf_len: the length of the buffer
 * @data: the scatterlist table to copy from
 * @offset: start copy from @offset@ bytes from the head of the data
 *	in the given scatterlist
 * @data_len: the length of the data in the sg_table
 *
 * Returns: the number of bytes copied
 */
static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
				       size_t buf_len, void *data,
				       size_t data_len)
{
	struct scatterlist *table = data;

	if (offset > data_len)
		return -EINVAL;

	if (offset + buf_len > data_len)
		buf_len = data_len - offset;
	return sg_pcopy_to_buffer(table, sg_nents(table), buffer, buf_len,
				  offset);
}

/**
 * dev_coredump_put - remove device coredump
 * @dev: the struct device for the crashed device
 *
 * dev_coredump_put() removes the coredump, if one exists, for a given
 * device from the file system and frees its associated data; otherwise,
 * it does nothing.
 *
 * It is useful for modules that do not want to keep the coredump
 * available after their unload.
 */
void dev_coredump_put(struct device *dev)
{
	struct device *existing;

	existing = class_find_device(&devcd_class, NULL, dev,
				     devcd_match_failing);
	if (existing) {
		devcd_free(existing, NULL);
		put_device(existing);
	}
}
EXPORT_SYMBOL_GPL(dev_coredump_put);

/**
 * dev_coredumpm_timeout - create device coredump with read/free methods with a
 * custom timeout.
 * @dev: the struct device for the crashed device
 * @owner: the module that contains the read/free functions, use %THIS_MODULE
 * @data: data cookie for the @read/@free functions
 * @datalen: length of the data
 * @gfp: allocation flags
 * @read: function to read from the given buffer
 * @free: function to free the given buffer
 * @timeout: time in jiffies to remove coredump
 *
 * Creates a new device coredump for the given device. If a previous one hasn't
 * been read yet, the new coredump is discarded. The data lifetime is determined
 * by the device coredump framework and when it is no longer needed the @free
 * function will be called to free the data.
 */
void dev_coredumpm_timeout(struct device *dev, struct module *owner,
			   void *data, size_t datalen, gfp_t gfp,
			   ssize_t (*read)(char *buffer, loff_t offset,
					   size_t count, void *data,
					   size_t datalen),
			   void (*free)(void *data),
			   unsigned long timeout)
{
	static atomic_t devcd_count = ATOMIC_INIT(0);
	struct devcd_entry *devcd;
	struct device *existing;

	if (devcd_disabled)
		goto free;

	/* only one coredump per failing device; discard new ones */
	existing = class_find_device(&devcd_class, NULL, dev,
				     devcd_match_failing);
	if (existing) {
		put_device(existing);
		goto free;
	}

	if (!try_module_get(owner))
		goto free;

	devcd = kzalloc(sizeof(*devcd), gfp);
	if (!devcd)
		goto put_module;

	devcd->owner = owner;
	devcd->data = data;
	devcd->datalen = datalen;
	devcd->read = read;
	devcd->free = free;
	devcd->failing_dev = get_device(dev);
	devcd->deleted = false;

	mutex_init(&devcd->mutex);
	device_initialize(&devcd->devcd_dev);

	dev_set_name(&devcd->devcd_dev, "devcd%d",
		     atomic_inc_return(&devcd_count));
	devcd->devcd_dev.class = &devcd_class;

	/* suppress uevent until the sysfs links below are in place */
	dev_set_uevent_suppress(&devcd->devcd_dev, true);

	/* devcd->mutex prevents devcd_del() completing until init finishes */
	mutex_lock(&devcd->mutex);
	devcd->init_completed = false;
	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
	schedule_delayed_work(&devcd->del_wk, timeout);

	if (device_add(&devcd->devcd_dev))
		goto put_device;

	/*
	 * These should normally not fail, but there is no problem
	 * continuing without the links, so just warn instead of
	 * failing.
	 */
	if (sysfs_create_link(&devcd->devcd_dev.kobj, &dev->kobj,
			      "failing_device") ||
	    sysfs_create_link(&dev->kobj, &devcd->devcd_dev.kobj,
			      "devcoredump"))
		dev_warn(dev, "devcoredump create_link failed\n");

	dev_set_uevent_suppress(&devcd->devcd_dev, false);
	kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);

	/*
	 * Safe to run devcd_del() now that we are done with devcd_dev.
	 * Alternatively we could have taken a ref on devcd_dev before
	 * dropping the lock.
	 */
	devcd->init_completed = true;
	mutex_unlock(&devcd->mutex);
	return;
put_device:
	mutex_unlock(&devcd->mutex);
	/* timer was armed before device_add(); tear it down on failure */
	cancel_delayed_work_sync(&devcd->del_wk);
	put_device(&devcd->devcd_dev);

put_module:
	module_put(owner);
free:
	free(data);
}
EXPORT_SYMBOL_GPL(dev_coredumpm_timeout);

/**
 * dev_coredumpsg - create device coredump that uses scatterlist as data
 * parameter
 * @dev: the struct device for the crashed device
 * @table: the dump data
 * @datalen: length of the data
 * @gfp: allocation flags
 *
 * Creates a new device coredump for the given device. If a previous one hasn't
 * been read yet, the new coredump is discarded. The data lifetime is determined
 * by the device coredump framework and when it is no longer needed
 * it will free the data.
 */
void dev_coredumpsg(struct device *dev, struct scatterlist *table,
		    size_t datalen, gfp_t gfp)
{
	dev_coredumpm(dev, NULL, table, datalen, gfp, devcd_read_from_sgtable,
		      devcd_free_sgtable);
}
EXPORT_SYMBOL_GPL(dev_coredumpsg);

static int __init devcoredump_init(void)
{
	return class_register(&devcd_class);
}
__initcall(devcoredump_init);

static void __exit devcoredump_exit(void)
{
	/* destroy any remaining coredumps before unregistering the class */
	class_for_each_device(&devcd_class, NULL, NULL, devcd_free);
	class_unregister(&devcd_class);
}
__exitcall(devcoredump_exit);