Path: blob/master/drivers/gpu/drm/i915/i915_gem_execbuffer.c
/*
 * Copyright © 2008,2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <[email protected]>
 *    Chris Wilson <[email protected]>
 *
 */

#include "drmP.h"
#include "drm.h"
#include "i915_drm.h"
#include "i915_drv.h"
#include "i915_trace.h"
#include "intel_drv.h"

struct change_domains {
	uint32_t invalidate_domains;
	uint32_t flush_domains;
	uint32_t flush_rings;
	uint32_t flips;
};

/*
 * Set the next domain for the specified object. This
 * may not actually perform the necessary flushing/invalidating though,
 * as that may want to be batched with other set_domain operations
 *
 * This is (we hope) the only really tricky part of gem. The goal
 * is fairly simple -- track which caches hold bits of the object
 * and make sure they remain coherent. A few concrete examples may
 * help to explain how it works. For shorthand, we use the notation
 * (read_domains, write_domain), e.g. (CPU, CPU) to indicate a
 * pair of read and write domain masks.
 *
 * Case 1: the batch buffer
 *
 *	1. Allocated
 *	2. Written by CPU
 *	3. Mapped to GTT
 *	4. Read by GPU
 *	5. Unmapped from GTT
 *	6. Freed
 *
 *	Let's take these a step at a time
 *
 *	1. Allocated
 *		Pages allocated from the kernel may still have
 *		cache contents, so we set them to (CPU, CPU) always.
 *	2. Written by CPU (using pwrite)
 *		The pwrite function calls set_domain (CPU, CPU) and
 *		this function does nothing (as nothing changes)
 *	3. Mapped to GTT
 *		This function asserts that the object is not
 *		currently in any GPU-based read or write domains
 *	4. Read by GPU
 *		i915_gem_execbuffer calls set_domain (COMMAND, 0).
 *		As write_domain is zero, this function adds in the
 *		current read domains (CPU+COMMAND, 0).
 *		flush_domains is set to CPU.
 *		invalidate_domains is set to COMMAND
 *		clflush is run to get data out of the CPU caches
 *		then i915_dev_set_domain calls i915_gem_flush to
 *		emit an MI_FLUSH and drm_agp_chipset_flush
 *	5. Unmapped from GTT
 *		i915_gem_object_unbind calls set_domain (CPU, CPU)
 *		flush_domains and invalidate_domains end up both zero
 *		so no flushing/invalidating happens
 *	6. Freed
 *		yay, done
 *
 * Case 2: The shared render buffer
 *
 *	1. Allocated
 *	2. Mapped to GTT
 *	3. Read/written by GPU
 *	4. set_domain to (CPU,CPU)
 *	5. Read/written by CPU
 *	6. Read/written by GPU
 *
 *	1. Allocated
 *		Same as last example, (CPU, CPU)
 *	2. Mapped to GTT
 *		Nothing changes (assertions find that it is not in the GPU)
 *	3. Read/written by GPU
 *		execbuffer calls set_domain (RENDER, RENDER)
 *		flush_domains gets CPU
 *		invalidate_domains gets GPU
 *		clflush (obj)
 *		MI_FLUSH and drm_agp_chipset_flush
 *	4. set_domain (CPU, CPU)
 *		flush_domains gets GPU
 *		invalidate_domains gets CPU
 *		wait_rendering (obj) to make sure all drawing is complete.
 *		This will include an MI_FLUSH to get the data from GPU
 *		to memory
 *		clflush (obj) to invalidate the CPU cache
 *		Another MI_FLUSH in i915_gem_flush (eliminate this somehow?)
 *	5. Read/written by CPU
 *		cache lines are loaded and dirtied
 *	6. Read/written by GPU
 *		Same as last GPU access
 *
 * Case 3: The constant buffer
 *
 *	1. Allocated
 *	2. Written by CPU
 *	3. Read by GPU
 *	4. Updated (written) by CPU again
 *	5. Read by GPU
 *
 *	1. Allocated
 *		(CPU, CPU)
 *	2. Written by CPU
 *		(CPU, CPU)
 *	3. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 *	4. Updated (written) by CPU again
 *		(CPU, CPU)
 *		flush_domains = 0 (no previous write domain)
 *		invalidate_domains = 0 (no new read domains)
 *	5. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 */
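/*
 * A rough illustration of how the cases above land in struct change_domains,
 * assuming the Case 3 constant buffer (step 3) is read by the render ring:
 * flush_domains accumulates I915_GEM_DOMAIN_CPU (the stale write domain),
 * invalidate_domains accumulates I915_GEM_DOMAIN_RENDER (the new read
 * domain), and the render ring's bit ends up in flush_rings. The clflush of
 * the object happens as the domains are computed below, and
 * i915_gem_execbuffer_flush() later issues the chipset flush and the ring
 * flush before the batch runs.
 */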
static void
i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
				  struct intel_ring_buffer *ring,
				  struct change_domains *cd)
{
	uint32_t invalidate_domains = 0, flush_domains = 0;

	/*
	 * If the object isn't moving to a new write domain,
	 * let the object stay in multiple read domains
	 */
	if (obj->base.pending_write_domain == 0)
		obj->base.pending_read_domains |= obj->base.read_domains;

	/*
	 * Flush the current write domain if
	 * the new read domains don't match. Invalidate
	 * any read domains which differ from the old
	 * write domain
	 */
	if (obj->base.write_domain &&
	    (((obj->base.write_domain != obj->base.pending_read_domains ||
	       obj->ring != ring)) ||
	     (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
		flush_domains |= obj->base.write_domain;
		invalidate_domains |=
			obj->base.pending_read_domains & ~obj->base.write_domain;
	}
	/*
	 * Invalidate any read caches which may have
	 * stale data. That is, any new read domains.
	 */
	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
		i915_gem_clflush_object(obj);

	if (obj->base.pending_write_domain)
		cd->flips |= atomic_read(&obj->pending_flip);

	/* The actual obj->write_domain will be updated with
	 * pending_write_domain after we emit the accumulated flush for all
	 * of our domain changes in execbuffers (which clears objects'
	 * write_domains). So if we have a current write domain that we
	 * aren't changing, set pending_write_domain to that.
	 */
	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
		obj->base.pending_write_domain = obj->base.write_domain;

	cd->invalidate_domains |= invalidate_domains;
	cd->flush_domains |= flush_domains;
	if (flush_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= obj->ring->id;
	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= ring->id;
}

struct eb_objects {
	int and;
	struct hlist_head buckets[0];
};

static struct eb_objects *
eb_create(int size)
{
	struct eb_objects *eb;
	int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
	while (count > size)
		count >>= 1;
	eb = kzalloc(count*sizeof(struct hlist_head) +
		     sizeof(struct eb_objects),
		     GFP_KERNEL);
	if (eb == NULL)
		return eb;

	eb->and = count - 1;
	return eb;
}

static void
eb_reset(struct eb_objects *eb)
{
	memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
}

static void
eb_add_object(struct eb_objects *eb, struct drm_i915_gem_object *obj)
{
	hlist_add_head(&obj->exec_node,
		       &eb->buckets[obj->exec_handle & eb->and]);
}

static struct drm_i915_gem_object *
eb_get_object(struct eb_objects *eb, unsigned long handle)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct drm_i915_gem_object *obj;

	head = &eb->buckets[handle & eb->and];
	hlist_for_each(node, head) {
		obj = hlist_entry(node, struct drm_i915_gem_object, exec_node);
		if (obj->exec_handle == handle)
			return obj;
	}

	return NULL;
}

static void
eb_destroy(struct eb_objects *eb)
{
	kfree(eb);
}
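/*
 * Sizing note, as an illustration (assuming a 4096-byte page and an 8-byte
 * struct hlist_head): eb_create() starts from 256 buckets and halves the
 * count until it is no larger than the number of objects in the execbuf,
 * so a 10-object submission ends up with 8 buckets and eb->and == 7.
 * eb_get_object() then resolves a handle with one mask and a short hlist
 * walk instead of a linear search over the whole object list.
 */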
static int
i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
				   struct eb_objects *eb,
				   struct drm_i915_gem_relocation_entry *reloc)
{
	struct drm_device *dev = obj->base.dev;
	struct drm_gem_object *target_obj;
	uint32_t target_offset;
	int ret = -EINVAL;

	/* we already hold a reference to all valid objects */
	target_obj = &eb_get_object(eb, reloc->target_handle)->base;
	if (unlikely(target_obj == NULL))
		return -ENOENT;

	target_offset = to_intel_bo(target_obj)->gtt_offset;

	/* The target buffer should have appeared before us in the
	 * exec_object list, so it should have a GTT space bound by now.
	 */
	if (unlikely(target_offset == 0)) {
		DRM_ERROR("No GTT space found for object %d\n",
			  reloc->target_handle);
		return ret;
	}

	/* Validate that the target is in a valid r/w GPU domain */
	if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) {
		DRM_ERROR("reloc with multiple write domains: "
			  "obj %p target %d offset %d "
			  "read %08x write %08x",
			  obj, reloc->target_handle,
			  (int) reloc->offset,
			  reloc->read_domains,
			  reloc->write_domain);
		return ret;
	}
	if (unlikely((reloc->write_domain | reloc->read_domains) & I915_GEM_DOMAIN_CPU)) {
		DRM_ERROR("reloc with read/write CPU domains: "
			  "obj %p target %d offset %d "
			  "read %08x write %08x",
			  obj, reloc->target_handle,
			  (int) reloc->offset,
			  reloc->read_domains,
			  reloc->write_domain);
		return ret;
	}
	if (unlikely(reloc->write_domain && target_obj->pending_write_domain &&
		     reloc->write_domain != target_obj->pending_write_domain)) {
		DRM_ERROR("Write domain conflict: "
			  "obj %p target %d offset %d "
			  "new %08x old %08x\n",
			  obj, reloc->target_handle,
			  (int) reloc->offset,
			  reloc->write_domain,
			  target_obj->pending_write_domain);
		return ret;
	}

	target_obj->pending_read_domains |= reloc->read_domains;
	target_obj->pending_write_domain |= reloc->write_domain;

	/* If the relocation already has the right value in it, no
	 * more work needs to be done.
	 */
	if (target_offset == reloc->presumed_offset)
		return 0;

	/* Check that the relocation address is valid... */
	if (unlikely(reloc->offset > obj->base.size - 4)) {
		DRM_ERROR("Relocation beyond object bounds: "
			  "obj %p target %d offset %d size %d.\n",
			  obj, reloc->target_handle,
			  (int) reloc->offset,
			  (int) obj->base.size);
		return ret;
	}
	if (unlikely(reloc->offset & 3)) {
		DRM_ERROR("Relocation not 4-byte aligned: "
			  "obj %p target %d offset %d.\n",
			  obj, reloc->target_handle,
			  (int) reloc->offset);
		return ret;
	}

	reloc->delta += target_offset;
	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
		char *vaddr;

		vaddr = kmap_atomic(obj->pages[reloc->offset >> PAGE_SHIFT]);
		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
		kunmap_atomic(vaddr);
	} else {
		struct drm_i915_private *dev_priv = dev->dev_private;
		uint32_t __iomem *reloc_entry;
		void __iomem *reloc_page;

		/* We can't wait for rendering with pagefaults disabled */
		if (obj->active && in_atomic())
			return -EFAULT;

		ret = i915_gem_object_set_to_gtt_domain(obj, 1);
		if (ret)
			return ret;

		/* Map the page containing the relocation we're going to perform. */
		reloc->offset += obj->gtt_offset;
		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
						      reloc->offset & PAGE_MASK);
		reloc_entry = (uint32_t __iomem *)
			(reloc_page + (reloc->offset & ~PAGE_MASK));
		iowrite32(reloc->delta, reloc_entry);
		io_mapping_unmap_atomic(reloc_page);
	}

	/* and update the user's relocation entry */
	reloc->presumed_offset = target_offset;

	return 0;
}

static int
i915_gem_execbuffer_relocate_object(struct drm_i915_gem_object *obj,
				    struct eb_objects *eb)
{
	struct drm_i915_gem_relocation_entry __user *user_relocs;
	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
	int i, ret;

	user_relocs = (void __user *)(uintptr_t)entry->relocs_ptr;
	for (i = 0; i < entry->relocation_count; i++) {
		struct drm_i915_gem_relocation_entry reloc;

		if (__copy_from_user_inatomic(&reloc,
					      user_relocs+i,
					      sizeof(reloc)))
			return -EFAULT;

		ret = i915_gem_execbuffer_relocate_entry(obj, eb, &reloc);
		if (ret)
			return ret;

		if (__copy_to_user_inatomic(&user_relocs[i].presumed_offset,
					    &reloc.presumed_offset,
					    sizeof(reloc.presumed_offset)))
			return -EFAULT;
	}

	return 0;
}

static int
i915_gem_execbuffer_relocate_object_slow(struct drm_i915_gem_object *obj,
					 struct eb_objects *eb,
					 struct drm_i915_gem_relocation_entry *relocs)
{
	const struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
	int i, ret;

	for (i = 0; i < entry->relocation_count; i++) {
		ret = i915_gem_execbuffer_relocate_entry(obj, eb, &relocs[i]);
		if (ret)
			return ret;
	}

	return 0;
}

static int
i915_gem_execbuffer_relocate(struct drm_device *dev,
			     struct eb_objects *eb,
			     struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	int ret = 0;

	/* This is the fast path and we cannot handle a pagefault whilst
	 * holding the struct mutex lest the user pass in the relocations
	 * contained within a mmaped bo. In such a case the page
	 * fault handler would call i915_gem_fault() and we would try to
	 * acquire the struct mutex again. Obviously this is bad and so
	 * lockdep complains vehemently.
	 */
	pagefault_disable();
	list_for_each_entry(obj, objects, exec_list) {
		ret = i915_gem_execbuffer_relocate_object(obj, eb);
		if (ret)
			break;
	}
	pagefault_enable();

	return ret;
}
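/*
 * The two relocation paths above are intentionally asymmetric: the fast
 * path copies each entry with __copy_from_user_inatomic() while pagefaults
 * are disabled and the struct mutex is held, so any fault surfaces as
 * -EFAULT. i915_gem_do_execbuffer() then falls back to
 * i915_gem_execbuffer_relocate_slow(), which drops the mutex, copies all
 * relocations with plain copy_from_user(), re-reserves the objects and
 * applies the relocations from the kernel copy.
 */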
static int
i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
			    struct drm_file *file,
			    struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	int ret, retry;
	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
	struct list_head ordered_objects;

	INIT_LIST_HEAD(&ordered_objects);
	while (!list_empty(objects)) {
		struct drm_i915_gem_exec_object2 *entry;
		bool need_fence, need_mappable;

		obj = list_first_entry(objects,
				       struct drm_i915_gem_object,
				       exec_list);
		entry = obj->exec_entry;

		need_fence =
			has_fenced_gpu_access &&
			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
			obj->tiling_mode != I915_TILING_NONE;
		need_mappable =
			entry->relocation_count ? true : need_fence;

		if (need_mappable)
			list_move(&obj->exec_list, &ordered_objects);
		else
			list_move_tail(&obj->exec_list, &ordered_objects);

		obj->base.pending_read_domains = 0;
		obj->base.pending_write_domain = 0;
	}
	list_splice(&ordered_objects, objects);

	/* Attempt to pin all of the buffers into the GTT.
	 * This is done in 3 phases:
	 *
	 * 1a. Unbind all objects that do not match the GTT constraints for
	 *     the execbuffer (fenceable, mappable, alignment etc).
	 * 1b. Increment pin count for already bound objects.
	 * 2.  Bind new objects.
	 * 3.  Decrement pin count.
	 *
	 * This avoids unnecessary unbinding of later objects in order to make
	 * room for the earlier objects *unless* we need to defragment.
	 */
	retry = 0;
	do {
		ret = 0;

		/* Unbind any ill-fitting objects or pin. */
		list_for_each_entry(obj, objects, exec_list) {
			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
			bool need_fence, need_mappable;
			if (!obj->gtt_space)
				continue;

			need_fence =
				has_fenced_gpu_access &&
				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
				obj->tiling_mode != I915_TILING_NONE;
			need_mappable =
				entry->relocation_count ? true : need_fence;

			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
			    (need_mappable && !obj->map_and_fenceable))
				ret = i915_gem_object_unbind(obj);
			else
				ret = i915_gem_object_pin(obj,
							  entry->alignment,
							  need_mappable);
			if (ret)
				goto err;

			entry++;
		}

		/* Bind fresh objects */
		list_for_each_entry(obj, objects, exec_list) {
			struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
			bool need_fence;

			need_fence =
				has_fenced_gpu_access &&
				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
				obj->tiling_mode != I915_TILING_NONE;

			if (!obj->gtt_space) {
				bool need_mappable =
					entry->relocation_count ? true : need_fence;

				ret = i915_gem_object_pin(obj,
							  entry->alignment,
							  need_mappable);
				if (ret)
					break;
			}

			if (has_fenced_gpu_access) {
				if (need_fence) {
					ret = i915_gem_object_get_fence(obj, ring);
					if (ret)
						break;
				} else if (entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
					   obj->tiling_mode == I915_TILING_NONE) {
					/* XXX pipelined! */
					ret = i915_gem_object_put_fence(obj);
					if (ret)
						break;
				}
				obj->pending_fenced_gpu_access = need_fence;
			}

			entry->offset = obj->gtt_offset;
		}

		/* Decrement pin count for bound objects */
		list_for_each_entry(obj, objects, exec_list) {
			if (obj->gtt_space)
				i915_gem_object_unpin(obj);
		}

		if (ret != -ENOSPC || retry > 1)
			return ret;

		/* First attempt, just clear anything that is purgeable.
		 * Second attempt, clear the entire GTT.
		 */
		ret = i915_gem_evict_everything(ring->dev, retry == 0);
		if (ret)
			return ret;

		retry++;
	} while (1);

err:
	obj = list_entry(obj->exec_list.prev,
			 struct drm_i915_gem_object,
			 exec_list);
	while (objects != &obj->exec_list) {
		if (obj->gtt_space)
			i915_gem_object_unpin(obj);

		obj = list_entry(obj->exec_list.prev,
				 struct drm_i915_gem_object,
				 exec_list);
	}

	return ret;
}
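/*
 * Placement note (a reading of the heuristics above, not original text): on
 * hardware with has_fenced_gpu_access (gen < 4) a tiled object submitted
 * with EXEC_OBJECT_NEEDS_FENCE must own a fence register, which in turn
 * requires a mappable GTT placement. Objects carrying relocations are also
 * forced mappable so the GTT write path in the relocation code can patch
 * them. That is why mappable objects are sorted to the front of the list
 * before the pin/evict retry loop runs.
 */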
static int
i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
				  struct drm_file *file,
				  struct intel_ring_buffer *ring,
				  struct list_head *objects,
				  struct eb_objects *eb,
				  struct drm_i915_gem_exec_object2 *exec,
				  int count)
{
	struct drm_i915_gem_relocation_entry *reloc;
	struct drm_i915_gem_object *obj;
	int *reloc_offset;
	int i, total, ret;

	/* We may process another execbuffer during the unlock... */
	while (!list_empty(objects)) {
		obj = list_first_entry(objects,
				       struct drm_i915_gem_object,
				       exec_list);
		list_del_init(&obj->exec_list);
		drm_gem_object_unreference(&obj->base);
	}

	mutex_unlock(&dev->struct_mutex);

	total = 0;
	for (i = 0; i < count; i++)
		total += exec[i].relocation_count;

	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
	reloc = drm_malloc_ab(total, sizeof(*reloc));
	if (reloc == NULL || reloc_offset == NULL) {
		drm_free_large(reloc);
		drm_free_large(reloc_offset);
		mutex_lock(&dev->struct_mutex);
		return -ENOMEM;
	}

	total = 0;
	for (i = 0; i < count; i++) {
		struct drm_i915_gem_relocation_entry __user *user_relocs;

		user_relocs = (void __user *)(uintptr_t)exec[i].relocs_ptr;

		if (copy_from_user(reloc+total, user_relocs,
				   exec[i].relocation_count * sizeof(*reloc))) {
			ret = -EFAULT;
			mutex_lock(&dev->struct_mutex);
			goto err;
		}

		reloc_offset[i] = total;
		total += exec[i].relocation_count;
	}

	ret = i915_mutex_lock_interruptible(dev);
	if (ret) {
		mutex_lock(&dev->struct_mutex);
		goto err;
	}

	/* reacquire the objects */
	eb_reset(eb);
	for (i = 0; i < count; i++) {
		obj = to_intel_bo(drm_gem_object_lookup(dev, file,
							exec[i].handle));
		if (&obj->base == NULL) {
			DRM_ERROR("Invalid object handle %d at index %d\n",
				  exec[i].handle, i);
			ret = -ENOENT;
			goto err;
		}

		list_add_tail(&obj->exec_list, objects);
		obj->exec_handle = exec[i].handle;
		obj->exec_entry = &exec[i];
		eb_add_object(eb, obj);
	}

	ret = i915_gem_execbuffer_reserve(ring, file, objects);
	if (ret)
		goto err;

	list_for_each_entry(obj, objects, exec_list) {
		int offset = obj->exec_entry - exec;
		ret = i915_gem_execbuffer_relocate_object_slow(obj, eb,
							       reloc + reloc_offset[offset]);
		if (ret)
			goto err;
	}

	/* Leave the user relocations as they are, this is the painfully slow
	 * path, and we want to avoid the complication of dropping the lock
	 * whilst having buffers reserved in the aperture and so causing
	 * spurious ENOSPC for random operations.
	 */

err:
	drm_free_large(reloc);
	drm_free_large(reloc_offset);
	return ret;
}
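/*
 * The flush below maps the accumulated change_domains onto concrete
 * actions: a CPU write domain needs intel_gtt_chipset_flush(), a GTT
 * write domain only needs a write memory barrier, and any GPU domain is
 * flushed/invalidated on each ring named in flush_rings.
 */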
static int
i915_gem_execbuffer_flush(struct drm_device *dev,
			  uint32_t invalidate_domains,
			  uint32_t flush_domains,
			  uint32_t flush_rings)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	int i, ret;

	if (flush_domains & I915_GEM_DOMAIN_CPU)
		intel_gtt_chipset_flush();

	if (flush_domains & I915_GEM_DOMAIN_GTT)
		wmb();

	if ((flush_domains | invalidate_domains) & I915_GEM_GPU_DOMAINS) {
		for (i = 0; i < I915_NUM_RINGS; i++)
			if (flush_rings & (1 << i)) {
				ret = i915_gem_flush_ring(&dev_priv->ring[i],
							  invalidate_domains,
							  flush_domains);
				if (ret)
					return ret;
			}
	}

	return 0;
}

static int
i915_gem_execbuffer_sync_rings(struct drm_i915_gem_object *obj,
			       struct intel_ring_buffer *to)
{
	struct intel_ring_buffer *from = obj->ring;
	u32 seqno;
	int ret, idx;

	if (from == NULL || to == from)
		return 0;

	/* XXX gpu semaphores are implicated in various hard hangs on SNB */
	if (INTEL_INFO(obj->base.dev)->gen < 6 || !i915_semaphores)
		return i915_gem_object_wait_rendering(obj);

	idx = intel_ring_sync_index(from, to);

	seqno = obj->last_rendering_seqno;
	if (seqno <= from->sync_seqno[idx])
		return 0;

	if (seqno == from->outstanding_lazy_request) {
		struct drm_i915_gem_request *request;

		request = kzalloc(sizeof(*request), GFP_KERNEL);
		if (request == NULL)
			return -ENOMEM;

		ret = i915_add_request(from, NULL, request);
		if (ret) {
			kfree(request);
			return ret;
		}

		seqno = request->seqno;
	}

	from->sync_seqno[idx] = seqno;
	return intel_ring_sync(to, from, seqno - 1);
}

static int
i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *ring, u32 flips)
{
	u32 plane, flip_mask;
	int ret;

	/* Check for any pending flips. As we only maintain a flip queue depth
	 * of 1, we can simply insert a WAIT for the next display flip prior
	 * to executing the batch and avoid stalling the CPU.
	 */

	for (plane = 0; flips >> plane; plane++) {
		if (((flips >> plane) & 1) == 0)
			continue;

		if (plane)
			flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
		else
			flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;

		ret = intel_ring_begin(ring, 2);
		if (ret)
			return ret;

		intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
		intel_ring_emit(ring, MI_NOOP);
		intel_ring_advance(ring);
	}

	return 0;
}

static int
i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
				struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	struct change_domains cd;
	int ret;

	memset(&cd, 0, sizeof(cd));
	list_for_each_entry(obj, objects, exec_list)
		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);

	if (cd.invalidate_domains | cd.flush_domains) {
		ret = i915_gem_execbuffer_flush(ring->dev,
						cd.invalidate_domains,
						cd.flush_domains,
						cd.flush_rings);
		if (ret)
			return ret;
	}

	if (cd.flips) {
		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
		if (ret)
			return ret;
	}

	list_for_each_entry(obj, objects, exec_list) {
		ret = i915_gem_execbuffer_sync_rings(obj, ring);
		if (ret)
			return ret;
	}

	return 0;
}

static bool
i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
{
	return ((exec->batch_start_offset | exec->batch_len) & 0x7) == 0;
}

static int
validate_exec_list(struct drm_i915_gem_exec_object2 *exec,
		   int count)
{
	int i;

	for (i = 0; i < count; i++) {
		char __user *ptr = (char __user *)(uintptr_t)exec[i].relocs_ptr;
		int length; /* limited by fault_in_pages_readable() */

		/* First check for malicious input causing overflow */
		if (exec[i].relocation_count >
		    INT_MAX / sizeof(struct drm_i915_gem_relocation_entry))
			return -EINVAL;

		length = exec[i].relocation_count *
			sizeof(struct drm_i915_gem_relocation_entry);
		if (!access_ok(VERIFY_READ, ptr, length))
			return -EFAULT;

		/* we may also need to update the presumed offsets */
		if (!access_ok(VERIFY_WRITE, ptr, length))
			return -EFAULT;

		if (fault_in_pages_readable(ptr, length))
			return -EFAULT;
	}

	return 0;
}
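/*
 * Overflow check, worked through (assuming the 32-byte
 * struct drm_i915_gem_relocation_entry of this uAPI): relocation_count is
 * a user-controlled value and length is computed as count * 32 into a
 * signed int. Capping the count at INT_MAX / 32 (roughly 67 million
 * entries) in validate_exec_list() above guarantees that multiplication
 * cannot wrap, so the access_ok() and fault_in_pages_readable() checks
 * see the real buffer length.
 */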
static void
i915_gem_execbuffer_move_to_active(struct list_head *objects,
				   struct intel_ring_buffer *ring,
				   u32 seqno)
{
	struct drm_i915_gem_object *obj;

	list_for_each_entry(obj, objects, exec_list) {
		u32 old_read = obj->base.read_domains;
		u32 old_write = obj->base.write_domain;

		obj->base.read_domains = obj->base.pending_read_domains;
		obj->base.write_domain = obj->base.pending_write_domain;
		obj->fenced_gpu_access = obj->pending_fenced_gpu_access;

		i915_gem_object_move_to_active(obj, ring, seqno);
		if (obj->base.write_domain) {
			obj->dirty = 1;
			obj->pending_gpu_write = true;
			list_move_tail(&obj->gpu_write_list,
				       &ring->gpu_write_list);
			intel_mark_busy(ring->dev, obj);
		}

		trace_i915_gem_object_change_domain(obj, old_read, old_write);
	}
}

static void
i915_gem_execbuffer_retire_commands(struct drm_device *dev,
				    struct drm_file *file,
				    struct intel_ring_buffer *ring)
{
	struct drm_i915_gem_request *request;
	u32 invalidate;

	/*
	 * Ensure that the commands in the batch buffer are
	 * finished before the interrupt fires.
	 *
	 * The sampler always gets flushed on i965 (sigh).
	 */
	invalidate = I915_GEM_DOMAIN_COMMAND;
	if (INTEL_INFO(dev)->gen >= 4)
		invalidate |= I915_GEM_DOMAIN_SAMPLER;
	if (ring->flush(ring, invalidate, 0)) {
		i915_gem_next_request_seqno(ring);
		return;
	}

	/* Add a breadcrumb for the completion of the batch buffer */
	request = kzalloc(sizeof(*request), GFP_KERNEL);
	if (request == NULL || i915_add_request(ring, file, request)) {
		i915_gem_next_request_seqno(ring);
		kfree(request);
	}
}
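/*
 * For orientation, a minimal userspace submission (a sketch, not taken from
 * this file) drives the path below roughly as:
 *
 *	struct drm_i915_gem_exec_object2 objs[2] = {
 *		{ .handle = target_bo },
 *		{ .handle = batch_bo,
 *		  .relocs_ptr = (uintptr_t)relocs,
 *		  .relocation_count = nreloc },	-- the batch goes last
 *	};
 *	struct drm_i915_gem_execbuffer2 eb = {
 *		.buffers_ptr = (uintptr_t)objs,
 *		.buffer_count = 2,
 *		.batch_len = batch_bytes,	-- 8-byte aligned
 *		.flags = I915_EXEC_RENDER,
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &eb);
 *
 * i915_gem_do_execbuffer() treats the last exec object as the batch buffer
 * and requires batch_start_offset and batch_len to be 8-byte aligned, per
 * i915_gem_check_execbuffer() above.
 */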
static int
i915_gem_do_execbuffer(struct drm_device *dev, void *data,
		       struct drm_file *file,
		       struct drm_i915_gem_execbuffer2 *args,
		       struct drm_i915_gem_exec_object2 *exec)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	struct list_head objects;
	struct eb_objects *eb;
	struct drm_i915_gem_object *batch_obj;
	struct drm_clip_rect *cliprects = NULL;
	struct intel_ring_buffer *ring;
	u32 exec_start, exec_len;
	u32 seqno;
	int ret, mode, i;

	if (!i915_gem_check_execbuffer(args)) {
		DRM_ERROR("execbuf with invalid offset/length\n");
		return -EINVAL;
	}

	ret = validate_exec_list(exec, args->buffer_count);
	if (ret)
		return ret;

	switch (args->flags & I915_EXEC_RING_MASK) {
	case I915_EXEC_DEFAULT:
	case I915_EXEC_RENDER:
		ring = &dev_priv->ring[RCS];
		break;
	case I915_EXEC_BSD:
		if (!HAS_BSD(dev)) {
			DRM_ERROR("execbuf with invalid ring (BSD)\n");
			return -EINVAL;
		}
		ring = &dev_priv->ring[VCS];
		break;
	case I915_EXEC_BLT:
		if (!HAS_BLT(dev)) {
			DRM_ERROR("execbuf with invalid ring (BLT)\n");
			return -EINVAL;
		}
		ring = &dev_priv->ring[BCS];
		break;
	default:
		DRM_ERROR("execbuf with unknown ring: %d\n",
			  (int)(args->flags & I915_EXEC_RING_MASK));
		return -EINVAL;
	}

	mode = args->flags & I915_EXEC_CONSTANTS_MASK;
	switch (mode) {
	case I915_EXEC_CONSTANTS_REL_GENERAL:
	case I915_EXEC_CONSTANTS_ABSOLUTE:
	case I915_EXEC_CONSTANTS_REL_SURFACE:
		if (ring == &dev_priv->ring[RCS] &&
		    mode != dev_priv->relative_constants_mode) {
			if (INTEL_INFO(dev)->gen < 4)
				return -EINVAL;

			if (INTEL_INFO(dev)->gen > 5 &&
			    mode == I915_EXEC_CONSTANTS_REL_SURFACE)
				return -EINVAL;

			ret = intel_ring_begin(ring, 4);
			if (ret)
				return ret;

			intel_ring_emit(ring, MI_NOOP);
			intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
			intel_ring_emit(ring, INSTPM);
			intel_ring_emit(ring,
					I915_EXEC_CONSTANTS_MASK << 16 | mode);
			intel_ring_advance(ring);

			dev_priv->relative_constants_mode = mode;
		}
		break;
	default:
		DRM_ERROR("execbuf with unknown constants: %d\n", mode);
		return -EINVAL;
	}

	if (args->buffer_count < 1) {
		DRM_ERROR("execbuf with %d buffers\n", args->buffer_count);
		return -EINVAL;
	}

	if (args->num_cliprects != 0) {
		if (ring != &dev_priv->ring[RCS]) {
			DRM_ERROR("clip rectangles are only valid with the render ring\n");
			return -EINVAL;
		}

		cliprects = kmalloc(args->num_cliprects * sizeof(*cliprects),
				    GFP_KERNEL);
		if (cliprects == NULL) {
			ret = -ENOMEM;
			goto pre_mutex_err;
		}

		if (copy_from_user(cliprects,
				   (struct drm_clip_rect __user *)(uintptr_t)
				   args->cliprects_ptr,
				   sizeof(*cliprects)*args->num_cliprects)) {
			ret = -EFAULT;
			goto pre_mutex_err;
		}
	}

	ret = i915_mutex_lock_interruptible(dev);
	if (ret)
		goto pre_mutex_err;

	if (dev_priv->mm.suspended) {
		mutex_unlock(&dev->struct_mutex);
		ret = -EBUSY;
		goto pre_mutex_err;
	}

	eb = eb_create(args->buffer_count);
	if (eb == NULL) {
		mutex_unlock(&dev->struct_mutex);
		ret = -ENOMEM;
		goto pre_mutex_err;
	}

	/* Look up object handles */
	INIT_LIST_HEAD(&objects);
	for (i = 0; i < args->buffer_count; i++) {
		struct drm_i915_gem_object *obj;

		obj = to_intel_bo(drm_gem_object_lookup(dev, file,
							exec[i].handle));
		if (&obj->base == NULL) {
			DRM_ERROR("Invalid object handle %d at index %d\n",
				  exec[i].handle, i);
			/* prevent error path from reading uninitialized data */
			ret = -ENOENT;
			goto err;
		}

		if (!list_empty(&obj->exec_list)) {
			DRM_ERROR("Object %p [handle %d, index %d] appears more than once in object list\n",
				  obj, exec[i].handle, i);
			ret = -EINVAL;
			goto err;
		}

		list_add_tail(&obj->exec_list, &objects);
		obj->exec_handle = exec[i].handle;
		obj->exec_entry = &exec[i];
		eb_add_object(eb, obj);
	}
	/* take note of the batch buffer before we might reorder the lists */
	batch_obj = list_entry(objects.prev,
			       struct drm_i915_gem_object,
			       exec_list);

	/* Move the objects en-masse into the GTT, evicting if necessary. */
	ret = i915_gem_execbuffer_reserve(ring, file, &objects);
	if (ret)
		goto err;

	/* The objects are in their final locations, apply the relocations. */
	ret = i915_gem_execbuffer_relocate(dev, eb, &objects);
	if (ret) {
		if (ret == -EFAULT) {
			ret = i915_gem_execbuffer_relocate_slow(dev, file, ring,
								&objects, eb,
								exec,
								args->buffer_count);
			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
		}
		if (ret)
			goto err;
	}

	/* Set the pending read domains for the batch buffer to COMMAND */
	if (batch_obj->base.pending_write_domain) {
		DRM_ERROR("Attempting to use self-modifying batch buffer\n");
		ret = -EINVAL;
		goto err;
	}
	batch_obj->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;

	ret = i915_gem_execbuffer_move_to_gpu(ring, &objects);
	if (ret)
		goto err;

	seqno = i915_gem_next_request_seqno(ring);
	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++) {
		if (seqno < ring->sync_seqno[i]) {
			/* The GPU can not handle its semaphore value wrapping,
			 * so every billion or so execbuffers, we need to stall
			 * the GPU in order to reset the counters.
			 */
			ret = i915_gpu_idle(dev);
			if (ret)
				goto err;

			BUG_ON(ring->sync_seqno[i]);
		}
	}

	trace_i915_gem_ring_dispatch(ring, seqno);

	exec_start = batch_obj->gtt_offset + args->batch_start_offset;
	exec_len = args->batch_len;
	if (cliprects) {
		for (i = 0; i < args->num_cliprects; i++) {
			ret = i915_emit_box(dev, &cliprects[i],
					    args->DR1, args->DR4);
			if (ret)
				goto err;

			ret = ring->dispatch_execbuffer(ring,
							exec_start, exec_len);
			if (ret)
				goto err;
		}
	} else {
		ret = ring->dispatch_execbuffer(ring, exec_start, exec_len);
		if (ret)
			goto err;
	}

	i915_gem_execbuffer_move_to_active(&objects, ring, seqno);
	i915_gem_execbuffer_retire_commands(dev, file, ring);

err:
	eb_destroy(eb);
	while (!list_empty(&objects)) {
		struct drm_i915_gem_object *obj;

		obj = list_first_entry(&objects,
				       struct drm_i915_gem_object,
				       exec_list);
		list_del_init(&obj->exec_list);
		drm_gem_object_unreference(&obj->base);
	}

	mutex_unlock(&dev->struct_mutex);

pre_mutex_err:
	kfree(cliprects);
	return ret;
}
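/*
 * The legacy wrapper below up-converts each drm_i915_gem_exec_object into a
 * drm_i915_gem_exec_object2 before handing off to i915_gem_do_execbuffer().
 * The one judgment call in the conversion is the flags field: on pre-gen4
 * hardware tiled GPU access depends on a fence register, so the wrapper
 * conservatively sets EXEC_OBJECT_NEEDS_FENCE for every object there, while
 * newer hardware gets no flags at all.
 */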
/*
 * Legacy execbuffer just creates an exec2 list from the original exec object
 * list array and passes it to the real function.
 */
int
i915_gem_execbuffer(struct drm_device *dev, void *data,
		    struct drm_file *file)
{
	struct drm_i915_gem_execbuffer *args = data;
	struct drm_i915_gem_execbuffer2 exec2;
	struct drm_i915_gem_exec_object *exec_list = NULL;
	struct drm_i915_gem_exec_object2 *exec2_list = NULL;
	int ret, i;

	if (args->buffer_count < 1) {
		DRM_ERROR("execbuf with %d buffers\n", args->buffer_count);
		return -EINVAL;
	}

	/* Copy in the exec list from userland */
	exec_list = drm_malloc_ab(sizeof(*exec_list), args->buffer_count);
	exec2_list = drm_malloc_ab(sizeof(*exec2_list), args->buffer_count);
	if (exec_list == NULL || exec2_list == NULL) {
		DRM_ERROR("Failed to allocate exec list for %d buffers\n",
			  args->buffer_count);
		drm_free_large(exec_list);
		drm_free_large(exec2_list);
		return -ENOMEM;
	}
	ret = copy_from_user(exec_list,
			     (struct drm_i915_relocation_entry __user *)
			     (uintptr_t) args->buffers_ptr,
			     sizeof(*exec_list) * args->buffer_count);
	if (ret != 0) {
		DRM_ERROR("copy %d exec entries failed %d\n",
			  args->buffer_count, ret);
		drm_free_large(exec_list);
		drm_free_large(exec2_list);
		return -EFAULT;
	}

	for (i = 0; i < args->buffer_count; i++) {
		exec2_list[i].handle = exec_list[i].handle;
		exec2_list[i].relocation_count = exec_list[i].relocation_count;
		exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr;
		exec2_list[i].alignment = exec_list[i].alignment;
		exec2_list[i].offset = exec_list[i].offset;
		if (INTEL_INFO(dev)->gen < 4)
			exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE;
		else
			exec2_list[i].flags = 0;
	}

	exec2.buffers_ptr = args->buffers_ptr;
	exec2.buffer_count = args->buffer_count;
	exec2.batch_start_offset = args->batch_start_offset;
	exec2.batch_len = args->batch_len;
	exec2.DR1 = args->DR1;
	exec2.DR4 = args->DR4;
	exec2.num_cliprects = args->num_cliprects;
	exec2.cliprects_ptr = args->cliprects_ptr;
	exec2.flags = I915_EXEC_RENDER;

	ret = i915_gem_do_execbuffer(dev, data, file, &exec2, exec2_list);
	if (!ret) {
		/* Copy the new buffer offsets back to the user's exec list. */
		for (i = 0; i < args->buffer_count; i++)
			exec_list[i].offset = exec2_list[i].offset;
		/* ... and back out to userspace */
		ret = copy_to_user((struct drm_i915_relocation_entry __user *)
				   (uintptr_t) args->buffers_ptr,
				   exec_list,
				   sizeof(*exec_list) * args->buffer_count);
		if (ret) {
			ret = -EFAULT;
			DRM_ERROR("failed to copy %d exec entries "
				  "back to user (%d)\n",
				  args->buffer_count, ret);
		}
	}

	drm_free_large(exec_list);
	drm_free_large(exec2_list);
	return ret;
}

int
i915_gem_execbuffer2(struct drm_device *dev, void *data,
		     struct drm_file *file)
{
	struct drm_i915_gem_execbuffer2 *args = data;
	struct drm_i915_gem_exec_object2 *exec2_list = NULL;
	int ret;

	if (args->buffer_count < 1) {
		DRM_ERROR("execbuf2 with %d buffers\n", args->buffer_count);
		return -EINVAL;
	}

	exec2_list = kmalloc(sizeof(*exec2_list)*args->buffer_count,
			     GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (exec2_list == NULL)
		exec2_list = drm_malloc_ab(sizeof(*exec2_list),
					   args->buffer_count);
	if (exec2_list == NULL) {
		DRM_ERROR("Failed to allocate exec list for %d buffers\n",
			  args->buffer_count);
		return -ENOMEM;
	}
	ret = copy_from_user(exec2_list,
			     (struct drm_i915_relocation_entry __user *)
			     (uintptr_t) args->buffers_ptr,
			     sizeof(*exec2_list) * args->buffer_count);
	if (ret != 0) {
		DRM_ERROR("copy %d exec entries failed %d\n",
			  args->buffer_count, ret);
		drm_free_large(exec2_list);
		return -EFAULT;
	}

	ret = i915_gem_do_execbuffer(dev, data, file, args, exec2_list);
	if (!ret) {
		/* Copy the new buffer offsets back to the user's exec list. */
		ret = copy_to_user((struct drm_i915_relocation_entry __user *)
				   (uintptr_t) args->buffers_ptr,
				   exec2_list,
				   sizeof(*exec2_list) * args->buffer_count);
		if (ret) {
			ret = -EFAULT;
			DRM_ERROR("failed to copy %d exec entries "
				  "back to user (%d)\n",
				  args->buffer_count, ret);
		}
	}

	drm_free_large(exec2_list);
	return ret;
}