CoCalc -- v3dv_meta

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/broadcom/vulkan/v3dv_meta_copy.c
⁴⁵⁶⁰ views
1
/*
2
 * Copyright © 2019 Raspberry Pi
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 */
23

24
#include "v3dv_private.h"
25
#include "v3dv_meta_copy.h"
26

27
#include "compiler/nir/nir_builder.h"
28
#include "vk_format_info.h"
29
#include "util/u_pack_color.h"
30
#include "vulkan/util/vk_common_entrypoints.h"
31

32
static uint32_t
33
meta_blit_key_hash(const void *key)
34
{
35
   return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
36
}
37

38
static bool
39
meta_blit_key_compare(const void *key1, const void *key2)
40
{
41
   return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
42
}
43

44
static bool
45
create_blit_pipeline_layout(struct v3dv_device *device,
46
                            VkDescriptorSetLayout *descriptor_set_layout,
47
                            VkPipelineLayout *pipeline_layout)
48
{
49
   VkResult result;
50

51
   if (*descriptor_set_layout == 0) {
52
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
53
         .binding = 0,
54
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
55
         .descriptorCount = 1,
56
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
57
      };
58
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
59
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
60
         .bindingCount = 1,
61
         .pBindings = &descriptor_set_layout_binding,
62
      };
63
      result =
64
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
65
                                        &descriptor_set_layout_info,
66
                                        &device->vk.alloc,
67
                                        descriptor_set_layout);
68
      if (result != VK_SUCCESS)
69
         return false;
70
   }
71

72
   assert(*pipeline_layout == 0);
73
   VkPipelineLayoutCreateInfo pipeline_layout_info = {
74
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
75
      .setLayoutCount = 1,
76
      .pSetLayouts = descriptor_set_layout,
77
      .pushConstantRangeCount = 1,
78
      .pPushConstantRanges =
79
         &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
80
   };
81

82
   result =
83
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
84
                                &pipeline_layout_info,
85
                                &device->vk.alloc,
86
                                pipeline_layout);
87
   return result == VK_SUCCESS;
88
}
89

90
void
91
v3dv_meta_blit_init(struct v3dv_device *device)
92
{
93
   for (uint32_t i = 0; i < 3; i++) {
94
      device->meta.blit.cache[i] =
95
         _mesa_hash_table_create(NULL,
96
                                 meta_blit_key_hash,
97
                                 meta_blit_key_compare);
98
   }
99

100
   create_blit_pipeline_layout(device,
101
                               &device->meta.blit.ds_layout,
102
                               &device->meta.blit.p_layout);
103
}
104

105
void
106
v3dv_meta_blit_finish(struct v3dv_device *device)
107
{
108
   VkDevice _device = v3dv_device_to_handle(device);
109

110
   for (uint32_t i = 0; i < 3; i++) {
111
      hash_table_foreach(device->meta.blit.cache[i], entry) {
112
         struct v3dv_meta_blit_pipeline *item = entry->data;
113
         v3dv_DestroyPipeline(_device, item->pipeline, &device->vk.alloc);
114
         v3dv_DestroyRenderPass(_device, item->pass, &device->vk.alloc);
115
         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->vk.alloc);
116
         vk_free(&device->vk.alloc, item);
117
      }
118
      _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
119
   }
120

121
   if (device->meta.blit.p_layout) {
122
      v3dv_DestroyPipelineLayout(_device, device->meta.blit.p_layout,
123
                                 &device->vk.alloc);
124
   }
125

126
   if (device->meta.blit.ds_layout) {
127
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.ds_layout,
128
                                      &device->vk.alloc);
129
   }
130
}
131

132
static uint32_t
133
meta_texel_buffer_copy_key_hash(const void *key)
134
{
135
   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
136
}
137

138
static bool
139
meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
140
{
141
   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
142
}
143

144
static bool
145
create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
146
                                         VkDescriptorSetLayout *ds_layout,
147
                                         VkPipelineLayout *p_layout)
148
{
149
   VkResult result;
150

151
   if (*ds_layout == 0) {
152
      VkDescriptorSetLayoutBinding ds_layout_binding = {
153
         .binding = 0,
154
         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
155
         .descriptorCount = 1,
156
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
157
      };
158
      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
159
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
160
         .bindingCount = 1,
161
         .pBindings = &ds_layout_binding,
162
      };
163
      result =
164
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
165
                                        &ds_layout_info,
166
                                        &device->vk.alloc,
167
                                        ds_layout);
168
      if (result != VK_SUCCESS)
169
         return false;
170
   }
171

172
   assert(*p_layout == 0);
173
   /* FIXME: this is abusing a bit the API, since not all of our copy
174
    * pipelines have a geometry shader. We could create 2 different pipeline
175
    * layouts, but this works for us for now.
176
    */
177
#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET      0
178
#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET  16
179
#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET  20
180
#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET   24
181
   VkPushConstantRange ranges[2] = {
182
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
183
      { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 },
184
   };
185

186
   VkPipelineLayoutCreateInfo p_layout_info = {
187
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
188
      .setLayoutCount = 1,
189
      .pSetLayouts = ds_layout,
190
      .pushConstantRangeCount = 2,
191
      .pPushConstantRanges = ranges,
192
   };
193

194
   result =
195
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
196
                                &p_layout_info,
197
                                &device->vk.alloc,
198
                                p_layout);
199
   return result == VK_SUCCESS;
200
}
201

202
void
203
v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
204
{
205
   for (uint32_t i = 0; i < 3; i++) {
206
      device->meta.texel_buffer_copy.cache[i] =
207
         _mesa_hash_table_create(NULL,
208
                                 meta_texel_buffer_copy_key_hash,
209
                                 meta_texel_buffer_copy_key_compare);
210
   }
211

212
   create_texel_buffer_copy_pipeline_layout(
213
      device,
214
      &device->meta.texel_buffer_copy.ds_layout,
215
      &device->meta.texel_buffer_copy.p_layout);
216
}
217

218
void
219
v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
220
{
221
   VkDevice _device = v3dv_device_to_handle(device);
222

223
   for (uint32_t i = 0; i < 3; i++) {
224
      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
225
         struct v3dv_meta_texel_buffer_copy_pipeline *item = entry->data;
226
         v3dv_DestroyPipeline(_device, item->pipeline, &device->vk.alloc);
227
         v3dv_DestroyRenderPass(_device, item->pass, &device->vk.alloc);
228
         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->vk.alloc);
229
         vk_free(&device->vk.alloc, item);
230
      }
231
      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
232
   }
233

234
   if (device->meta.texel_buffer_copy.p_layout) {
235
      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.p_layout,
236
                                 &device->vk.alloc);
237
   }
238

239
   if (device->meta.texel_buffer_copy.ds_layout) {
240
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.ds_layout,
241
                                      &device->vk.alloc);
242
   }
243
}
244

245
static inline bool
246
can_use_tlb(struct v3dv_image *image,
247
            const VkOffset3D *offset,
248
            VkFormat *compat_format);
249

250
/* Implements a copy using the TLB.
251
 *
252
 * This only works if we are copying from offset (0,0), since a TLB store for
253
 * tile (x,y) will be written at the same tile offset into the destination.
254
 * When this requirement is not met, we need to use a blit instead.
255
 *
256
 * Returns true if the implementation supports the requested operation (even if
257
 * it failed to process it, for example, due to an out-of-memory error).
258
 *
259
 */
260
static bool
261
copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
262
                         struct v3dv_buffer *buffer,
263
                         struct v3dv_image *image,
264
                         const VkBufferImageCopy2KHR *region)
265
{
266
   VkFormat fb_format;
267
   if (!can_use_tlb(image, &region->imageOffset, &fb_format))
268
      return false;
269

270
   uint32_t internal_type, internal_bpp;
271
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
272
      (fb_format, region->imageSubresource.aspectMask,
273
       &internal_type, &internal_bpp);
274

275
   uint32_t num_layers;
276
   if (image->type != VK_IMAGE_TYPE_3D)
277
      num_layers = region->imageSubresource.layerCount;
278
   else
279
      num_layers = region->imageExtent.depth;
280
   assert(num_layers > 0);
281

282
   struct v3dv_job *job =
283
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
284
   if (!job)
285
      return true;
286

287
   /* Handle copy from compressed format using a compatible format */
288
   const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
289
   const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
290
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
291
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
292

293
   v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, false);
294

295
   struct framebuffer_data framebuffer;
296
   v3dv_X(job->device, setup_framebuffer_data)(&framebuffer, fb_format, internal_type,
297
                                               &job->frame_tiling);
298

299
   v3dv_X(job->device, job_emit_binning_flush)(job);
300
   v3dv_X(job->device, job_emit_copy_image_to_buffer_rcl)
301
      (job, buffer, image, &framebuffer, region);
302

303
   v3dv_cmd_buffer_finish_job(cmd_buffer);
304

305
   return true;
306
}
307

308
static bool
309
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
310
            struct v3dv_image *dst,
311
            VkFormat dst_format,
312
            struct v3dv_image *src,
313
            VkFormat src_format,
314
            VkColorComponentFlags cmask,
315
            VkComponentMapping *cswizzle,
316
            const VkImageBlit2KHR *region,
317
            VkFilter filter,
318
            bool dst_is_padded_image);
319

320
/**
321
 * Returns true if the implementation supports the requested operation (even if
322
 * it failed to process it, for example, due to an out-of-memory error).
323
 */
324
static bool
325
copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
326
                          struct v3dv_buffer *buffer,
327
                          struct v3dv_image *image,
328
                          const VkBufferImageCopy2KHR *region)
329
{
330
   bool handled = false;
331

332
   /* Generally, the bpp of the data in the buffer matches that of the
333
    * source image. The exception is the case where we are copying
334
    * stencil (8bpp) to a combined d24s8 image (32bpp).
335
    */
336
   uint32_t buffer_bpp = image->cpp;
337

338
   VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask;
339

340
   /* Because we are going to implement the copy as a blit, we need to create
341
    * a linear image from the destination buffer and we also want our blit
342
    * source and destination formats to be the same (to avoid any format
343
    * conversions), so we choose a canonical format that matches the
344
    * source image bpp.
345
    *
346
    * The exception to the above is copying from combined depth/stencil images
347
    * because we are copying only one aspect of the image, so we need to setup
348
    * our formats, color write mask and source swizzle mask to match that.
349
    */
350
   VkFormat dst_format;
351
   VkFormat src_format;
352
   VkColorComponentFlags cmask = 0; /* All components */
353
   VkComponentMapping cswizzle = {
354
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
355
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
356
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
357
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
358
   };
359
   switch (buffer_bpp) {
360
   case 16:
361
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
362
      dst_format = VK_FORMAT_R32G32B32A32_UINT;
363
      src_format = dst_format;
364
      break;
365
   case 8:
366
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
367
      dst_format = VK_FORMAT_R16G16B16A16_UINT;
368
      src_format = dst_format;
369
      break;
370
   case 4:
371
      switch (copy_aspect) {
372
      case VK_IMAGE_ASPECT_COLOR_BIT:
373
         src_format = VK_FORMAT_R8G8B8A8_UINT;
374
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
375
         break;
376
      case VK_IMAGE_ASPECT_DEPTH_BIT:
377
         assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
378
                image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
379
                image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
380
         if (image->vk_format == VK_FORMAT_D32_SFLOAT) {
381
            src_format = VK_FORMAT_R32_UINT;
382
            dst_format = VK_FORMAT_R32_UINT;
383
         } else {
384
            /* We want to write depth in the buffer in the first 24-bits,
385
             * however, the hardware has depth in bits 8-31, so swizzle the
386
             * the source components to match what we want. Also, we don't
387
             * want to write bits 24-31 in the destination.
388
             */
389
            src_format = VK_FORMAT_R8G8B8A8_UINT;
390
            dst_format = VK_FORMAT_R8G8B8A8_UINT;
391
            cmask = VK_COLOR_COMPONENT_R_BIT |
392
                    VK_COLOR_COMPONENT_G_BIT |
393
                    VK_COLOR_COMPONENT_B_BIT;
394
            cswizzle.r = VK_COMPONENT_SWIZZLE_G;
395
            cswizzle.g = VK_COMPONENT_SWIZZLE_B;
396
            cswizzle.b = VK_COMPONENT_SWIZZLE_A;
397
            cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
398
         }
399
         break;
400
      case VK_IMAGE_ASPECT_STENCIL_BIT:
401
         assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
402
         assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
403
         /* Copying from S8D24. We want to write 8-bit stencil values only,
404
          * so adjust the buffer bpp for that. Since the hardware stores stencil
405
          * in the LSB, we can just do a RGBA8UI to R8UI blit.
406
          */
407
         src_format = VK_FORMAT_R8G8B8A8_UINT;
408
         dst_format = VK_FORMAT_R8_UINT;
409
         buffer_bpp = 1;
410
         break;
411
      default:
412
         unreachable("unsupported aspect");
413
         return handled;
414
      };
415
      break;
416
   case 2:
417
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
418
             copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
419
      dst_format = VK_FORMAT_R16_UINT;
420
      src_format = dst_format;
421
      break;
422
   case 1:
423
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
424
      dst_format = VK_FORMAT_R8_UINT;
425
      src_format = dst_format;
426
      break;
427
   default:
428
      unreachable("unsupported bit-size");
429
      return handled;
430
   };
431

432
   /* The hardware doesn't support linear depth/stencil stores, so we
433
    * implement copies of depth/stencil aspect as color copies using a
434
    * compatible color format.
435
    */
436
   assert(vk_format_is_color(src_format));
437
   assert(vk_format_is_color(dst_format));
438
   copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;
439

440
   /* We should be able to handle the blit if we got this far */
441
   handled = true;
442

443
   /* Obtain the 2D buffer region spec */
444
   uint32_t buf_width, buf_height;
445
   if (region->bufferRowLength == 0)
446
      buf_width = region->imageExtent.width;
447
   else
448
      buf_width = region->bufferRowLength;
449

450
   if (region->bufferImageHeight == 0)
451
      buf_height = region->imageExtent.height;
452
   else
453
      buf_height = region->bufferImageHeight;
454

455
   /* If the image is compressed, the bpp refers to blocks, not pixels */
456
   uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
457
   uint32_t block_height = vk_format_get_blockheight(image->vk_format);
458
   buf_width = buf_width / block_width;
459
   buf_height = buf_height / block_height;
460

461
   /* Compute layers to copy */
462
   uint32_t num_layers;
463
   if (image->type != VK_IMAGE_TYPE_3D)
464
      num_layers = region->imageSubresource.layerCount;
465
   else
466
      num_layers = region->imageExtent.depth;
467
   assert(num_layers > 0);
468

469
   /* Our blit interface can see the real format of the images to detect
470
    * copies between compressed and uncompressed images and adapt the
471
    * blit region accordingly. Here we are just doing a raw copy of
472
    * compressed data, but we are passing an uncompressed view of the
473
    * buffer for the blit destination image (since compressed formats are
474
    * not renderable), so we also want to provide an uncompressed view of
475
    * the source image.
476
    */
477
   VkResult result;
478
   struct v3dv_device *device = cmd_buffer->device;
479
   VkDevice _device = v3dv_device_to_handle(device);
480
   if (vk_format_is_compressed(image->vk_format)) {
481
      VkImage uiview;
482
      VkImageCreateInfo uiview_info = {
483
         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
484
         .imageType = VK_IMAGE_TYPE_3D,
485
         .format = dst_format,
486
         .extent = { buf_width, buf_height, image->extent.depth },
487
         .mipLevels = image->levels,
488
         .arrayLayers = image->array_size,
489
         .samples = image->samples,
490
         .tiling = image->tiling,
491
         .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
492
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
493
         .queueFamilyIndexCount = 0,
494
         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
495
      };
496
      result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview);
497
      if (result != VK_SUCCESS)
498
         return handled;
499

500
      v3dv_cmd_buffer_add_private_obj(
501
         cmd_buffer, (uintptr_t)uiview,
502
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
503

504
      result =
505
         vk_common_BindImageMemory(_device, uiview,
506
                                   v3dv_device_memory_to_handle(image->mem),
507
                                   image->mem_offset);
508
      if (result != VK_SUCCESS)
509
         return handled;
510

511
      image = v3dv_image_from_handle(uiview);
512
   }
513

514
   /* Copy requested layers */
515
   for (uint32_t i = 0; i < num_layers; i++) {
516
      /* Create the destination blit image from the destination buffer */
517
      VkImageCreateInfo image_info = {
518
         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
519
         .imageType = VK_IMAGE_TYPE_2D,
520
         .format = dst_format,
521
         .extent = { buf_width, buf_height, 1 },
522
         .mipLevels = 1,
523
         .arrayLayers = 1,
524
         .samples = VK_SAMPLE_COUNT_1_BIT,
525
         .tiling = VK_IMAGE_TILING_LINEAR,
526
         .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
527
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
528
         .queueFamilyIndexCount = 0,
529
         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
530
      };
531

532
      VkImage buffer_image;
533
      result =
534
         v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
535
      if (result != VK_SUCCESS)
536
         return handled;
537

538
      v3dv_cmd_buffer_add_private_obj(
539
         cmd_buffer, (uintptr_t)buffer_image,
540
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
541

542
      /* Bind the buffer memory to the image */
543
      VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
544
         i * buf_width * buf_height * buffer_bpp;
545
      result =
546
         vk_common_BindImageMemory(_device, buffer_image,
547
                                   v3dv_device_memory_to_handle(buffer->mem),
548
                                   buffer_offset);
549
      if (result != VK_SUCCESS)
550
         return handled;
551

552
      /* Blit-copy the requested image extent.
553
       *
554
       * Since we are copying, the blit must use the same format on the
555
       * destination and source images to avoid format conversions. The
556
       * only exception is copying stencil, which we upload to a R8UI source
557
       * image, but that we need to blit to a S8D24 destination (the only
558
       * stencil format we support).
559
       */
560
      const VkImageBlit2KHR blit_region = {
561
         .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
562
         .srcSubresource = {
563
            .aspectMask = copy_aspect,
564
            .mipLevel = region->imageSubresource.mipLevel,
565
            .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
566
            .layerCount = 1,
567
         },
568
         .srcOffsets = {
569
            {
570
               DIV_ROUND_UP(region->imageOffset.x, block_width),
571
               DIV_ROUND_UP(region->imageOffset.y, block_height),
572
               region->imageOffset.z + i,
573
            },
574
            {
575
               DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
576
                            block_width),
577
               DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
578
                            block_height),
579
               region->imageOffset.z + i + 1,
580
            },
581
         },
582
         .dstSubresource = {
583
            .aspectMask = copy_aspect,
584
            .mipLevel = 0,
585
            .baseArrayLayer = 0,
586
            .layerCount = 1,
587
         },
588
         .dstOffsets = {
589
            { 0, 0, 0 },
590
            {
591
               DIV_ROUND_UP(region->imageExtent.width, block_width),
592
               DIV_ROUND_UP(region->imageExtent.height, block_height),
593
               1
594
            },
595
         },
596
      };
597

598
      handled = blit_shader(cmd_buffer,
599
                            v3dv_image_from_handle(buffer_image), dst_format,
600
                            image, src_format,
601
                            cmask, &cswizzle,
602
                            &blit_region, VK_FILTER_NEAREST, false);
603
      if (!handled) {
604
         /* This is unexpected, we should have a supported blit spec */
605
         unreachable("Unable to blit buffer to destination image");
606
         return false;
607
      }
608
   }
609

610
   assert(handled);
611
   return true;
612
}
613

614
static VkFormat
615
get_compatible_tlb_format(VkFormat format)
616
{
617
   switch (format) {
618
   case VK_FORMAT_R8G8B8A8_SNORM:
619
      return VK_FORMAT_R8G8B8A8_UINT;
620

621
   case VK_FORMAT_R8G8_SNORM:
622
      return VK_FORMAT_R8G8_UINT;
623

624
   case VK_FORMAT_R8_SNORM:
625
      return VK_FORMAT_R8_UINT;
626

627
   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
628
      return VK_FORMAT_A8B8G8R8_UINT_PACK32;
629

630
   case VK_FORMAT_R16_UNORM:
631
   case VK_FORMAT_R16_SNORM:
632
      return VK_FORMAT_R16_UINT;
633

634
   case VK_FORMAT_R16G16_UNORM:
635
   case VK_FORMAT_R16G16_SNORM:
636
      return VK_FORMAT_R16G16_UINT;
637

638
   case VK_FORMAT_R16G16B16A16_UNORM:
639
   case VK_FORMAT_R16G16B16A16_SNORM:
640
      return VK_FORMAT_R16G16B16A16_UINT;
641

642
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
643
      return VK_FORMAT_R32_SFLOAT;
644

645
   /* We can't render to compressed formats using the TLB so instead we use
646
    * a compatible format with the same bpp as the compressed format. Because
647
    * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
648
    * case of ETC), when we implement copies with the compatible format we
649
    * will have to divide offsets and dimensions on the compressed image by
650
    * the compressed block size.
651
    */
652
   case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
653
   case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
654
   case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
655
   case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
656
   case VK_FORMAT_BC2_UNORM_BLOCK:
657
   case VK_FORMAT_BC2_SRGB_BLOCK:
658
   case VK_FORMAT_BC3_SRGB_BLOCK:
659
   case VK_FORMAT_BC3_UNORM_BLOCK:
660
   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
661
   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
662
   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
663
   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
664
   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
665
   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
666
   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
667
   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
668
   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
669
   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
670
   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
671
   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
672
   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
673
   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
674
   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
675
   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
676
   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
677
   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
678
   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
679
   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
680
   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
681
   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
682
   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
683
   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
684
   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
685
   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
686
   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
687
   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
688
      return VK_FORMAT_R32G32B32A32_UINT;
689

690
   case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
691
   case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
692
   case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
693
   case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
694
   case VK_FORMAT_EAC_R11_UNORM_BLOCK:
695
   case VK_FORMAT_EAC_R11_SNORM_BLOCK:
696
   case VK_FORMAT_BC1_RGB_UNORM_BLOCK:
697
   case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
698
   case VK_FORMAT_BC1_RGBA_UNORM_BLOCK:
699
   case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
700
      return VK_FORMAT_R16G16B16A16_UINT;
701

702
   default:
703
      return VK_FORMAT_UNDEFINED;
704
   }
705
}
706

707
static inline bool
708
can_use_tlb(struct v3dv_image *image,
709
            const VkOffset3D *offset,
710
            VkFormat *compat_format)
711
{
712
   if (offset->x != 0 || offset->y != 0)
713
      return false;
714

715
   if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
716
      if (compat_format)
717
         *compat_format = image->vk_format;
718
      return true;
719
   }
720

721
   /* If the image format is not TLB-supported, then check if we can use
722
    * a compatible format instead.
723
    */
724
   if (compat_format) {
725
      *compat_format = get_compatible_tlb_format(image->vk_format);
726
      if (*compat_format != VK_FORMAT_UNDEFINED)
727
         return true;
728
   }
729

730
   return false;
731
}
732

733
VKAPI_ATTR void VKAPI_CALL
734
v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
735
                              const VkCopyImageToBufferInfo2KHR *info)
736

737
{
738
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
739
   V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage);
740
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer);
741

742
   assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
743

744
   for (uint32_t i = 0; i < info->regionCount; i++) {
745
      if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i]))
746
         continue;
747
      if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i]))
748
         continue;
749
      unreachable("Unsupported image to buffer copy.");
750
   }
751
}
752

753
/**
754
 * Returns true if the implementation supports the requested operation (even if
755
 * it failed to process it, for example, due to an out-of-memory error).
756
 */
757
static bool
758
copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
759
               struct v3dv_image *dst,
760
               struct v3dv_image *src,
761
               const VkImageCopy2KHR *region)
762
{
763
   /* Destination can't be raster format */
764
   if (dst->tiling == VK_IMAGE_TILING_LINEAR)
765
      return false;
766

767
   /* We can only do full copies, so if the format is D24S8 both aspects need
768
    * to be copied. We only need to check the dst format because the spec
769
    * states that depth/stencil formats must match exactly.
770
    */
771
   if (dst->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
772
       const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
773
                                             VK_IMAGE_ASPECT_STENCIL_BIT;
774
       if (region->dstSubresource.aspectMask != ds_aspects)
775
         return false;
776
   }
777

778
   /* Don't handle copies between uncompressed and compressed formats for now.
779
    *
780
    * FIXME: we should be able to handle these easily but there is no coverage
781
    * in CTS at the moment that make such copies with full images (which we
782
    * require here), only partial copies. Also, in that case the code below that
783
    * checks for "dst image complete" requires some changes, since it is
784
    * checking against the region dimensions, which are in units of the source
785
    * image format.
786
    */
787
   if (vk_format_is_compressed(dst->vk_format) !=
788
       vk_format_is_compressed(src->vk_format)) {
789
      return false;
790
   }
791

792
   /* Source region must start at (0,0) */
793
   if (region->srcOffset.x != 0 || region->srcOffset.y != 0)
794
      return false;
795

796
   /* Destination image must be complete */
797
   if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
798
      return false;
799

800
   const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
801
   uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level);
802
   uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level);
803
   if (region->extent.width != dst_width || region->extent.height != dst_height)
804
      return false;
805

806
   /* From vkCmdCopyImage:
807
    *
808
    *   "When copying between compressed and uncompressed formats the extent
809
    *    members represent the texel dimensions of the source image and not
810
    *    the destination."
811
    */
812
   const uint32_t block_w = vk_format_get_blockwidth(src->vk_format);
813
   const uint32_t block_h = vk_format_get_blockheight(src->vk_format);
814
   uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
815
   uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
816

817
   /* Account for sample count */
818
   assert(dst->samples == src->samples);
819
   if (dst->samples > VK_SAMPLE_COUNT_1_BIT) {
820
      assert(dst->samples == VK_SAMPLE_COUNT_4_BIT);
821
      width *= 2;
822
      height *= 2;
823
   }
824

825
   /* The TFU unit doesn't handle format conversions so we need the formats to
826
    * match. On the other hand, vkCmdCopyImage allows different color formats
827
    * on the source and destination images, but only if they are texel
828
    * compatible. For us, this means that we can effectively ignore different
829
    * formats and just make the copy using either of them, since we are just
830
    * moving raw data and not making any conversions.
831
    *
832
    * Also, the formats supported by the TFU unit are limited, but again, since
833
    * we are only doing raw copies here without interpreting or converting
834
    * the underlying pixel data according to its format, we can always choose
835
    * to use compatible formats that are supported with the TFU unit.
836
    */
837
   assert(dst->cpp == src->cpp);
838
   const struct v3dv_format *format =
839
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
840
                                     dst->cpp, NULL);
841

842
   /* Emit a TFU job for each layer to blit */
843
   const uint32_t layer_count = dst->type != VK_IMAGE_TYPE_3D ?
844
      region->dstSubresource.layerCount :
845
      region->extent.depth;
846
   const uint32_t src_mip_level = region->srcSubresource.mipLevel;
847

848
   const uint32_t base_src_layer = src->type != VK_IMAGE_TYPE_3D ?
849
      region->srcSubresource.baseArrayLayer : region->srcOffset.z;
850
   const uint32_t base_dst_layer = dst->type != VK_IMAGE_TYPE_3D ?
851
      region->dstSubresource.baseArrayLayer : region->dstOffset.z;
852
   for (uint32_t i = 0; i < layer_count; i++) {
853
      v3dv_X(cmd_buffer->device, cmd_buffer_emit_tfu_job)
854
         (cmd_buffer, dst, dst_mip_level, base_dst_layer + i,
855
          src, src_mip_level, base_src_layer + i,
856
          width, height, format);
857
   }
858

859
   return true;
860
}
861

862
/**
863
 * Returns true if the implementation supports the requested operation (even if
864
 * it failed to process it, for example, due to an out-of-memory error).
865
 */
866
static bool
867
copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
868
               struct v3dv_image *dst,
869
               struct v3dv_image *src,
870
               const VkImageCopy2KHR *region)
871
{
872
   VkFormat fb_format;
873
   if (!can_use_tlb(src, &region->srcOffset, &fb_format) ||
874
       !can_use_tlb(dst, &region->dstOffset, &fb_format)) {
875
      return false;
876
   }
877

878
   /* From the Vulkan spec, VkImageCopy valid usage:
879
    *
880
    *    "If neither the calling command’s srcImage nor the calling command’s
881
    *     dstImage has a multi-planar image format then the aspectMask member
882
    *     of srcSubresource and dstSubresource must match."
883
    */
884
   assert(region->dstSubresource.aspectMask ==
885
          region->srcSubresource.aspectMask);
886
   uint32_t internal_type, internal_bpp;
887
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
888
      (fb_format, region->dstSubresource.aspectMask,
889
       &internal_type, &internal_bpp);
890

891
   /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
892
    *
893
    * "The number of slices of the extent (for 3D) or layers of the
894
    *  srcSubresource (for non-3D) must match the number of slices of the
895
    *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
896
    */
897
   assert((src->type != VK_IMAGE_TYPE_3D ?
898
           region->srcSubresource.layerCount : region->extent.depth) ==
899
          (dst->type != VK_IMAGE_TYPE_3D ?
900
           region->dstSubresource.layerCount : region->extent.depth));
901
   uint32_t num_layers;
902
   if (dst->type != VK_IMAGE_TYPE_3D)
903
      num_layers = region->dstSubresource.layerCount;
904
   else
905
      num_layers = region->extent.depth;
906
   assert(num_layers > 0);
907

908
   struct v3dv_job *job =
909
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
910
   if (!job)
911
      return true;
912

913
   /* Handle copy to compressed image using compatible format */
914
   const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format);
915
   const uint32_t block_h = vk_format_get_blockheight(dst->vk_format);
916
   const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
917
   const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
918

919
   v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp,
920
                        src->samples > VK_SAMPLE_COUNT_1_BIT);
921

922
   struct framebuffer_data framebuffer;
923
   v3dv_X(job->device, setup_framebuffer_data)(&framebuffer, fb_format, internal_type,
924
                                               &job->frame_tiling);
925

926
   v3dv_X(job->device, job_emit_binning_flush)(job);
927
   v3dv_X(job->device, job_emit_copy_image_rcl)(job, dst, src, &framebuffer, region);
928

929
   v3dv_cmd_buffer_finish_job(cmd_buffer);
930

931
   return true;
932
}
933

934
/**
935
 * Takes the image provided as argument and creates a new image that has
936
 * the same specification and aliases the same memory storage, except that:
937
 *
938
 *   - It has the uncompressed format passed in.
939
 *   - Its original width/height are scaled by the factors passed in.
940
 *
941
 * This is useful to implement copies from compressed images using the blit
942
 * path. The idea is that we create uncompressed "image views" of both the
943
 * source and destination images using the uncompressed format and then we
944
 * define the copy blit in terms of that format.
945
 */
946
static struct v3dv_image *
947
create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
948
                   struct v3dv_image *src,
949
                   float width_scale,
950
                   float height_scale,
951
                   VkFormat format)
952
{
953
   assert(!vk_format_is_compressed(format));
954

955
   VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
956

957
   VkImageCreateInfo info = {
958
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
959
      .imageType = src->type,
960
      .format = format,
961
      .extent = {
962
         .width = src->extent.width * width_scale,
963
         .height = src->extent.height * height_scale,
964
         .depth = src->extent.depth,
965
      },
966
      .mipLevels = src->levels,
967
      .arrayLayers = src->array_size,
968
      .samples = src->samples,
969
      .tiling = src->tiling,
970
      .usage = src->usage,
971
   };
972

973
    VkImage _image;
974
    VkResult result =
975
      v3dv_CreateImage(_device, &info, &cmd_buffer->device->vk.alloc, &_image);
976
    if (result != VK_SUCCESS) {
977
       v3dv_flag_oom(cmd_buffer, NULL);
978
       return NULL;
979
    }
980

981
    struct v3dv_image *image = v3dv_image_from_handle(_image);
982
    image->mem = src->mem;
983
    image->mem_offset = src->mem_offset;
984
    return image;
985
}
986

987
/**
988
 * Returns true if the implementation supports the requested operation (even if
989
 * it failed to process it, for example, due to an out-of-memory error).
990
 */
991
static bool
992
copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
993
                struct v3dv_image *dst,
994
                struct v3dv_image *src,
995
                const VkImageCopy2KHR *region)
996
{
997
   const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format);
998
   const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format);
999
   const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format);
1000
   const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format);
1001
   const float block_scale_w = (float)src_block_w / (float)dst_block_w;
1002
   const float block_scale_h = (float)src_block_h / (float)dst_block_h;
1003

1004
   /* We need to choose a single format for the blit to ensure that this is
1005
    * really a copy and there are not format conversions going on. Since we
1006
    * going to blit, we need to make sure that the selected format can be
1007
    * both rendered to and textured from.
1008
    */
1009
   VkFormat format;
1010
   float src_scale_w = 1.0f;
1011
   float src_scale_h = 1.0f;
1012
   float dst_scale_w = block_scale_w;
1013
   float dst_scale_h = block_scale_h;
1014
   if (vk_format_is_compressed(src->vk_format)) {
1015
      /* If we are copying from a compressed format we should be aware that we
1016
       * are going to texture from the source image, and the texture setup
1017
       * knows the actual size of the image, so we need to choose a format
1018
       * that has a per-texel (not per-block) bpp that is compatible for that
1019
       * image size. For example, for a source image with size Bw*WxBh*H
1020
       * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
1021
       * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
1022
       * (which translates to a 128-bit 4x4 RGBA32 block when uncompressed),
1023
       * so we could specify a blit with size Bw*WxBh*H and a format with
1024
       * a bpp of 8-bit per texel (R8_UINT).
1025
       *
1026
       * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
1027
       * which is 64-bit per texel, then we would need a 4-bit format, which
1028
       * we don't have, so instead we still choose an 8-bit format, but we
1029
       * apply a divisor to the row dimensions of the blit, since we are
1030
       * copying two texels per item.
1031
       *
1032
       * Generally, we can choose any format so long as we compute appropriate
1033
       * divisors for the width and height depending on the source image's
1034
       * bpp.
1035
       */
1036
      assert(src->cpp == dst->cpp);
1037

1038
      format = VK_FORMAT_R32G32_UINT;
1039
      switch (src->cpp) {
1040
      case 16:
1041
         format = VK_FORMAT_R32G32B32A32_UINT;
1042
         break;
1043
      case 8:
1044
         format = VK_FORMAT_R16G16B16A16_UINT;
1045
         break;
1046
      default:
1047
         unreachable("Unsupported compressed format");
1048
      }
1049

1050
      /* Create image views of the src/dst images that we can interpret in
1051
       * terms of the canonical format.
1052
       */
1053
      src_scale_w /= src_block_w;
1054
      src_scale_h /= src_block_h;
1055
      dst_scale_w /= src_block_w;
1056
      dst_scale_h /= src_block_h;
1057

1058
      src = create_image_alias(cmd_buffer, src,
1059
                               src_scale_w, src_scale_h, format);
1060

1061
      dst = create_image_alias(cmd_buffer, dst,
1062
                               dst_scale_w, dst_scale_h, format);
1063
   } else {
1064
      format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
1065
         src->vk_format : get_compatible_tlb_format(src->vk_format);
1066
      if (format == VK_FORMAT_UNDEFINED)
1067
         return false;
1068

1069
      const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format);
1070
      if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO)
1071
         return false;
1072
   }
1073

1074
   /* Given an uncompressed image with size WxH, if we copy it to a compressed
1075
    * image, it will result in an image with size W*bWxH*bH, where bW and bH
1076
    * are the compressed format's block width and height. This means that
1077
    * copies between compressed and uncompressed images involve different
1078
    * image sizes, and therefore, we need to take that into account when
1079
    * setting up the source and destination blit regions below, so they are
1080
    * consistent from the point of view of the single compatible format
1081
    * selected for the copy.
1082
    *
1083
    * We should take into account that the dimensions of the region provided
1084
    * to the copy command are specified in terms of the source image. With that
1085
    * in mind, below we adjust the blit destination region to be consistent with
1086
    * the source region for the compatible format, so basically, we apply
1087
    * the block scale factor to the destination offset provided by the copy
1088
    * command (because it is specified in terms of the destination image, not
1089
    * the source), and then we just add the region copy dimensions to that
1090
    * (since the region dimensions are already specified in terms of the source
1091
    * image).
1092
    */
1093
   const VkOffset3D src_start = {
1094
      region->srcOffset.x * src_scale_w,
1095
      region->srcOffset.y * src_scale_h,
1096
      region->srcOffset.z,
1097
   };
1098
   const VkOffset3D src_end = {
1099
      src_start.x + region->extent.width * src_scale_w,
1100
      src_start.y + region->extent.height * src_scale_h,
1101
      src_start.z + region->extent.depth,
1102
   };
1103

1104
   const VkOffset3D dst_start = {
1105
      region->dstOffset.x * dst_scale_w,
1106
      region->dstOffset.y * dst_scale_h,
1107
      region->dstOffset.z,
1108
   };
1109
   const VkOffset3D dst_end = {
1110
      dst_start.x + region->extent.width * src_scale_w,
1111
      dst_start.y + region->extent.height * src_scale_h,
1112
      dst_start.z + region->extent.depth,
1113
   };
1114

1115
   const VkImageBlit2KHR blit_region = {
1116
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
1117
      .srcSubresource = region->srcSubresource,
1118
      .srcOffsets = { src_start, src_end },
1119
      .dstSubresource = region->dstSubresource,
1120
      .dstOffsets = { dst_start, dst_end },
1121
   };
1122
   bool handled = blit_shader(cmd_buffer,
1123
                              dst, format,
1124
                              src, format,
1125
                              0, NULL,
1126
                              &blit_region, VK_FILTER_NEAREST, true);
1127

1128
   /* We should have selected formats that we can blit */
1129
   assert(handled);
1130
   return handled;
1131
}
1132

1133
VKAPI_ATTR void VKAPI_CALL
1134
v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
1135
                      const VkCopyImageInfo2KHR *info)
1136

1137
{
1138
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1139
   V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
1140
   V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
1141

1142
   assert(src->samples == dst->samples);
1143

1144
   for (uint32_t i = 0; i < info->regionCount; i++) {
1145
      if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i]))
1146
         continue;
1147
      if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
1148
         continue;
1149
      if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
1150
         continue;
1151
      unreachable("Image copy not supported");
1152
   }
1153
}
1154

1155
static void
1156
get_hw_clear_color(struct v3dv_device *device,
1157
                   const VkClearColorValue *color,
1158
                   VkFormat fb_format,
1159
                   VkFormat image_format,
1160
                   uint32_t internal_type,
1161
                   uint32_t internal_bpp,
1162
                   uint32_t *hw_color)
1163
{
1164
   const uint32_t internal_size = 4 << internal_bpp;
1165

1166
   /* If the image format doesn't match the framebuffer format, then we are
1167
    * trying to clear an unsupported tlb format using a compatible
1168
    * format for the framebuffer. In this case, we want to make sure that
1169
    * we pack the clear value according to the original format semantics,
1170
    * not the compatible format.
1171
    */
1172
   if (fb_format == image_format) {
1173
      v3dv_X(device, get_hw_clear_color)(color, internal_type, internal_size, hw_color);
1174
   } else {
1175
      union util_color uc;
1176
      enum pipe_format pipe_image_format =
1177
         vk_format_to_pipe_format(image_format);
1178
      util_pack_color(color->float32, pipe_image_format, &uc);
1179
      memcpy(hw_color, uc.ui, internal_size);
1180
   }
1181
}
1182

1183
/* Returns true if the implementation is able to handle the case, false
1184
 * otherwise.
1185
*/
1186
static bool
1187
clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1188
                struct v3dv_image *image,
1189
                const VkClearValue *clear_value,
1190
                const VkImageSubresourceRange *range)
1191
{
1192
   const VkOffset3D origin = { 0, 0, 0 };
1193
   VkFormat fb_format;
1194
   if (!can_use_tlb(image, &origin, &fb_format))
1195
      return false;
1196

1197
   uint32_t internal_type, internal_bpp;
1198
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
1199
      (fb_format, range->aspectMask,
1200
       &internal_type, &internal_bpp);
1201

1202
   union v3dv_clear_value hw_clear_value = { 0 };
1203
   if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1204
      get_hw_clear_color(cmd_buffer->device, &clear_value->color, fb_format,
1205
                         image->vk_format, internal_type, internal_bpp,
1206
                         &hw_clear_value.color[0]);
1207
   } else {
1208
      assert((range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) ||
1209
             (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT));
1210
      hw_clear_value.z = clear_value->depthStencil.depth;
1211
      hw_clear_value.s = clear_value->depthStencil.stencil;
1212
   }
1213

1214
   uint32_t level_count = range->levelCount == VK_REMAINING_MIP_LEVELS ?
1215
                          image->levels - range->baseMipLevel :
1216
                          range->levelCount;
1217
   uint32_t min_level = range->baseMipLevel;
1218
   uint32_t max_level = range->baseMipLevel + level_count;
1219

1220
   /* For 3D images baseArrayLayer and layerCount must be 0 and 1 respectively.
1221
    * Instead, we need to consider the full depth dimension of the image, which
1222
    * goes from 0 up to the level's depth extent.
1223
    */
1224
   uint32_t min_layer;
1225
   uint32_t max_layer;
1226
   if (image->type != VK_IMAGE_TYPE_3D) {
1227
      uint32_t layer_count = range->layerCount == VK_REMAINING_ARRAY_LAYERS ?
1228
                             image->array_size - range->baseArrayLayer :
1229
                             range->layerCount;
1230
      min_layer = range->baseArrayLayer;
1231
      max_layer = range->baseArrayLayer + layer_count;
1232
   } else {
1233
      min_layer = 0;
1234
      max_layer = 0;
1235
   }
1236

1237
   for (uint32_t level = min_level; level < max_level; level++) {
1238
      if (image->type == VK_IMAGE_TYPE_3D)
1239
         max_layer = u_minify(image->extent.depth, level);
1240
      for (uint32_t layer = min_layer; layer < max_layer; layer++) {
1241
         uint32_t width = u_minify(image->extent.width, level);
1242
         uint32_t height = u_minify(image->extent.height, level);
1243

1244
         struct v3dv_job *job =
1245
            v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1246

1247
         if (!job)
1248
            return true;
1249

1250
         /* We start a a new job for each layer so the frame "depth" is 1 */
1251
         v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp,
1252
                              image->samples > VK_SAMPLE_COUNT_1_BIT);
1253

1254
         struct framebuffer_data framebuffer;
1255
         v3dv_X(job->device, setup_framebuffer_data)(&framebuffer, fb_format, internal_type,
1256
                                                     &job->frame_tiling);
1257

1258
         v3dv_X(job->device, job_emit_binning_flush)(job);
1259

1260
         /* If this triggers it is an application bug: the spec requires
1261
          * that any aspects to clear are present in the image.
1262
          */
1263
         assert(range->aspectMask & image->aspects);
1264

1265
         v3dv_X(job->device, job_emit_clear_image_rcl)
1266
            (job, image, &framebuffer, &hw_clear_value,
1267
             range->aspectMask, layer, level);
1268

1269
         v3dv_cmd_buffer_finish_job(cmd_buffer);
1270
      }
1271
   }
1272

1273
   return true;
1274
}
1275

1276
VKAPI_ATTR void VKAPI_CALL
1277
v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer,
1278
                        VkImage _image,
1279
                        VkImageLayout imageLayout,
1280
                        const VkClearColorValue *pColor,
1281
                        uint32_t rangeCount,
1282
                        const VkImageSubresourceRange *pRanges)
1283
{
1284
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1285
   V3DV_FROM_HANDLE(v3dv_image, image, _image);
1286

1287
   const VkClearValue clear_value = {
1288
      .color = *pColor,
1289
   };
1290

1291
   for (uint32_t i = 0; i < rangeCount; i++) {
1292
      if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
1293
         continue;
1294
      unreachable("Unsupported color clear.");
1295
   }
1296
}
1297

1298
VKAPI_ATTR void VKAPI_CALL
1299
v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1300
                               VkImage _image,
1301
                               VkImageLayout imageLayout,
1302
                               const VkClearDepthStencilValue *pDepthStencil,
1303
                               uint32_t rangeCount,
1304
                               const VkImageSubresourceRange *pRanges)
1305
{
1306
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1307
   V3DV_FROM_HANDLE(v3dv_image, image, _image);
1308

1309
   const VkClearValue clear_value = {
1310
      .depthStencil = *pDepthStencil,
1311
   };
1312

1313
   for (uint32_t i = 0; i < rangeCount; i++) {
1314
      if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
1315
         continue;
1316
      unreachable("Unsupported depth/stencil clear.");
1317
   }
1318
}
1319

1320
VKAPI_ATTR void VKAPI_CALL
1321
v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
1322
                       const VkCopyBufferInfo2KHR *pCopyBufferInfo)
1323
{
1324
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1325
   V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
1326
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
1327

1328
   for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
1329
      v3dv_X(cmd_buffer->device, cmd_buffer_copy_buffer)
1330
         (cmd_buffer,
1331
          dst_buffer->mem->bo, dst_buffer->mem_offset,
1332
          src_buffer->mem->bo, src_buffer->mem_offset,
1333
          &pCopyBufferInfo->pRegions[i]);
1334
   }
1335
}
1336

1337
static void
1338
destroy_update_buffer_cb(VkDevice _device,
1339
                         uint64_t pobj,
1340
                         VkAllocationCallbacks *alloc)
1341
{
1342
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
1343
   struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
1344
   v3dv_bo_free(device, bo);
1345
}
1346

1347
VKAPI_ATTR void VKAPI_CALL
1348
v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1349
                     VkBuffer dstBuffer,
1350
                     VkDeviceSize dstOffset,
1351
                     VkDeviceSize dataSize,
1352
                     const void *pData)
1353
{
1354
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1355
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1356

1357
   struct v3dv_bo *src_bo =
1358
      v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
1359
   if (!src_bo) {
1360
      fprintf(stderr, "Failed to allocate BO for vkCmdUpdateBuffer.\n");
1361
      return;
1362
   }
1363

1364
   bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
1365
   if (!ok) {
1366
      fprintf(stderr, "Failed to map BO for vkCmdUpdateBuffer.\n");
1367
      return;
1368
   }
1369

1370
   memcpy(src_bo->map, pData, dataSize);
1371

1372
   v3dv_bo_unmap(cmd_buffer->device, src_bo);
1373

1374
   VkBufferCopy2KHR region = {
1375
      .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR,
1376
      .srcOffset = 0,
1377
      .dstOffset = dstOffset,
1378
      .size = dataSize,
1379
   };
1380
   struct v3dv_job *copy_job =
1381
      v3dv_X(cmd_buffer->device, cmd_buffer_copy_buffer)
1382
      (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
1383
       src_bo, 0, &region);
1384

1385
   if (!copy_job)
1386
      return;
1387

1388
   v3dv_cmd_buffer_add_private_obj(
1389
      cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
1390
}
1391

1392
VKAPI_ATTR void VKAPI_CALL
1393
v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
1394
                   VkBuffer dstBuffer,
1395
                   VkDeviceSize dstOffset,
1396
                   VkDeviceSize size,
1397
                   uint32_t data)
1398
{
1399
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1400
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1401

1402
   struct v3dv_bo *bo = dst_buffer->mem->bo;
1403

1404
   /* From the Vulkan spec:
1405
    *
1406
    *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
1407
    *    a multiple of 4, then the nearest smaller multiple is used."
1408
    */
1409
   if (size == VK_WHOLE_SIZE) {
1410
      size = dst_buffer->size - dstOffset;
1411
      size -= size % 4;
1412
   }
1413

1414
   v3dv_X(cmd_buffer->device, cmd_buffer_fill_buffer)
1415
      (cmd_buffer, bo, dstOffset, size, data);
1416
}
1417

1418
/**
1419
 * Returns true if the implementation supports the requested operation (even if
1420
 * it failed to process it, for example, due to an out-of-memory error).
1421
 */
1422
static bool
1423
copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
1424
                         struct v3dv_image *image,
1425
                         struct v3dv_buffer *buffer,
1426
                         const VkBufferImageCopy2KHR *region)
1427
{
1428
   assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
1429

1430
   /* Destination can't be raster format */
1431
   if (image->tiling == VK_IMAGE_TILING_LINEAR)
1432
      return false;
1433

1434
   /* We can't copy D24S8 because buffer to image copies only copy one aspect
1435
    * at a time, and the TFU copies full images. Also, V3D depth bits for
1436
    * both D24S8 and D24X8 stored in the 24-bit MSB of each 32-bit word, but
1437
    * the Vulkan spec has the buffer data specified the other way around, so it
1438
    * is not a straight copy, we would havew to swizzle the channels, which the
1439
    * TFU can't do.
1440
    */
1441
   if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
1442
       image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1443
         return false;
1444
   }
1445

1446
   /* Region must include full slice */
1447
   const uint32_t offset_x = region->imageOffset.x;
1448
   const uint32_t offset_y = region->imageOffset.y;
1449
   if (offset_x != 0 || offset_y != 0)
1450
      return false;
1451

1452
   uint32_t width, height;
1453
   if (region->bufferRowLength == 0)
1454
      width = region->imageExtent.width;
1455
   else
1456
      width = region->bufferRowLength;
1457

1458
   if (region->bufferImageHeight == 0)
1459
      height = region->imageExtent.height;
1460
   else
1461
      height = region->bufferImageHeight;
1462

1463
   if (width != image->extent.width || height != image->extent.height)
1464
      return false;
1465

1466
   /* Handle region semantics for compressed images */
1467
   const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
1468
   const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
1469
   width = DIV_ROUND_UP(width, block_w);
1470
   height = DIV_ROUND_UP(height, block_h);
1471

1472
   /* Format must be supported for texturing via the TFU. Since we are just
1473
    * copying raw data and not converting between pixel formats, we can ignore
1474
    * the image's format and choose a compatible TFU format for the image
1475
    * texel size instead, which expands the list of formats we can handle here.
1476
    */
1477
   const struct v3dv_format *format =
1478
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
1479
                                     image->cpp, NULL);
1480

1481
   const uint32_t mip_level = region->imageSubresource.mipLevel;
1482
   const struct v3d_resource_slice *slice = &image->slices[mip_level];
1483

1484
   uint32_t num_layers;
1485
   if (image->type != VK_IMAGE_TYPE_3D)
1486
      num_layers = region->imageSubresource.layerCount;
1487
   else
1488
      num_layers = region->imageExtent.depth;
1489
   assert(num_layers > 0);
1490

1491
   assert(image->mem && image->mem->bo);
1492
   const struct v3dv_bo *dst_bo = image->mem->bo;
1493

1494
   assert(buffer->mem && buffer->mem->bo);
1495
   const struct v3dv_bo *src_bo = buffer->mem->bo;
1496

1497
   /* Emit a TFU job per layer to copy */
1498
   const uint32_t buffer_stride = width * image->cpp;
1499
   for (int i = 0; i < num_layers; i++) {
1500
      uint32_t layer;
1501
      if (image->type != VK_IMAGE_TYPE_3D)
1502
         layer = region->imageSubresource.baseArrayLayer + i;
1503
      else
1504
         layer = region->imageOffset.z + i;
1505

1506
      struct drm_v3d_submit_tfu tfu = {
1507
         .ios = (height << 16) | width,
1508
         .bo_handles = {
1509
            dst_bo->handle,
1510
            src_bo->handle != dst_bo->handle ? src_bo->handle : 0
1511
         },
1512
      };
1513

1514
      const uint32_t buffer_offset =
1515
         buffer->mem_offset + region->bufferOffset +
1516
         height * buffer_stride * i;
1517

1518
      const uint32_t src_offset = src_bo->offset + buffer_offset;
1519
      tfu.iia |= src_offset;
1520
      tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT;
1521
      tfu.iis |= width;
1522

1523
      const uint32_t dst_offset =
1524
         dst_bo->offset + v3dv_layer_offset(image, mip_level, layer);
1525
      tfu.ioa |= dst_offset;
1526

1527
      tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
1528
                  (slice->tiling - V3D_TILING_LINEARTILE)) <<
1529
                   V3D_TFU_IOA_FORMAT_SHIFT;
1530
      tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
1531

1532
      /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
1533
       * OPAD field for the destination (how many extra UIF blocks beyond
1534
       * those necessary to cover the height).
1535
       */
1536
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
1537
          slice->tiling == V3D_TILING_UIF_XOR) {
1538
         uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp);
1539
         uint32_t implicit_padded_height = align(height, uif_block_h);
1540
         uint32_t icfg =
1541
            (slice->padded_height - implicit_padded_height) / uif_block_h;
1542
         tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
1543
      }
1544

1545
      v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
1546
   }
1547

1548
   return true;
1549
}
1550

1551
/**
1552
 * Returns true if the implementation supports the requested operation (even if
1553
 * it failed to process it, for example, due to an out-of-memory error).
1554
 */
1555
static bool
1556
copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1557
                         struct v3dv_image *image,
1558
                         struct v3dv_buffer *buffer,
1559
                         const VkBufferImageCopy2KHR *region)
1560
{
1561
   VkFormat fb_format;
1562
   if (!can_use_tlb(image, &region->imageOffset, &fb_format))
1563
      return false;
1564

1565
   uint32_t internal_type, internal_bpp;
1566
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
1567
      (fb_format, region->imageSubresource.aspectMask,
1568
       &internal_type, &internal_bpp);
1569

1570
   uint32_t num_layers;
1571
   if (image->type != VK_IMAGE_TYPE_3D)
1572
      num_layers = region->imageSubresource.layerCount;
1573
   else
1574
      num_layers = region->imageExtent.depth;
1575
   assert(num_layers > 0);
1576

1577
   struct v3dv_job *job =
1578
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1579
   if (!job)
1580
      return true;
1581

1582
   /* Handle copy to compressed format using a compatible format */
1583
   const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
1584
   const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
1585
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
1586
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
1587

1588
   v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, false);
1589

1590
   struct framebuffer_data framebuffer;
1591
   v3dv_X(job->device, setup_framebuffer_data)(&framebuffer, fb_format, internal_type,
1592
                                               &job->frame_tiling);
1593

1594
   v3dv_X(job->device, job_emit_binning_flush)(job);
1595
   v3dv_X(job->device, job_emit_copy_buffer_to_image_rcl)
1596
      (job, image, buffer, &framebuffer, region);
1597

1598
   v3dv_cmd_buffer_finish_job(cmd_buffer);
1599

1600
   return true;
1601
}
1602

1603
static bool
1604
create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
1605
                               struct v3dv_image *image,
1606
                               struct v3dv_buffer *buffer,
1607
                               const VkBufferImageCopy2KHR *region)
1608
{
1609
   if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
1610
      return true;
1611
   if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
1612
      return true;
1613
   return false;
1614
}
1615

1616
static VkResult
1617
create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
1618
{
1619
   /* If this is not the first pool we create for this command buffer
1620
    * size it based on the size of the currently exhausted pool.
1621
    */
1622
   uint32_t descriptor_count = 64;
1623
   if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
1624
      struct v3dv_descriptor_pool *exhausted_pool =
1625
         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
1626
      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
1627
   }
1628

1629
   /* Create the descriptor pool */
1630
   cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
1631
   VkDescriptorPoolSize pool_size = {
1632
      .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
1633
      .descriptorCount = descriptor_count,
1634
   };
1635
   VkDescriptorPoolCreateInfo info = {
1636
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
1637
      .maxSets = descriptor_count,
1638
      .poolSizeCount = 1,
1639
      .pPoolSizes = &pool_size,
1640
      .flags = 0,
1641
   };
1642
   VkResult result =
1643
      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1644
                                &info,
1645
                                &cmd_buffer->device->vk.alloc,
1646
                                &cmd_buffer->meta.texel_buffer_copy.dspool);
1647

1648
   if (result == VK_SUCCESS) {
1649
      assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
1650
      const VkDescriptorPool _pool = cmd_buffer->meta.texel_buffer_copy.dspool;
1651

1652
      v3dv_cmd_buffer_add_private_obj(
1653
         cmd_buffer, (uintptr_t) _pool,
1654
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1655

1656
      struct v3dv_descriptor_pool *pool =
1657
         v3dv_descriptor_pool_from_handle(_pool);
1658
      pool->is_driver_internal = true;
1659
   }
1660

1661
   return result;
1662
}
1663

1664
static VkResult
1665
allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1666
                                          VkDescriptorSet *set)
1667
{
1668
   /* Make sure we have a descriptor pool */
1669
   VkResult result;
1670
   if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
1671
      result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
1672
      if (result != VK_SUCCESS)
1673
         return result;
1674
   }
1675
   assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
1676

1677
   /* Allocate descriptor set */
1678
   struct v3dv_device *device = cmd_buffer->device;
1679
   VkDevice _device = v3dv_device_to_handle(device);
1680
   VkDescriptorSetAllocateInfo info = {
1681
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1682
      .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
1683
      .descriptorSetCount = 1,
1684
      .pSetLayouts = &device->meta.texel_buffer_copy.ds_layout,
1685
   };
1686
   result = v3dv_AllocateDescriptorSets(_device, &info, set);
1687

1688
   /* If we ran out of pool space, grow the pool and try again */
1689
   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1690
      result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
1691
      if (result == VK_SUCCESS) {
1692
         info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
1693
         result = v3dv_AllocateDescriptorSets(_device, &info, set);
1694
      }
1695
   }
1696

1697
   return result;
1698
}
1699

1700
static void
1701
get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
1702
                                         VkColorComponentFlags cmask,
1703
                                         VkComponentMapping *cswizzle,
1704
                                         bool is_layered,
1705
                                         uint8_t *key)
1706
{
1707
   memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1708

1709
   uint32_t *p = (uint32_t *) key;
1710

1711
   *p = format;
1712
   p++;
1713

1714
   *p = cmask;
1715
   p++;
1716

1717
   /* Note that that we are using a single byte for this, so we could pack
1718
    * more data into this 32-bit slot in the future.
1719
    */
1720
   *p = is_layered ? 1 : 0;
1721
   p++;
1722

1723
   memcpy(p, cswizzle, sizeof(VkComponentMapping));
1724
   p += sizeof(VkComponentMapping) / sizeof(uint32_t);
1725

1726
   assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1727
}
1728

1729
static bool
1730
create_blit_render_pass(struct v3dv_device *device,
1731
                        VkFormat dst_format,
1732
                        VkFormat src_format,
1733
                        VkRenderPass *pass_load,
1734
                        VkRenderPass *pass_no_load);
1735

1736
static nir_ssa_def *gen_rect_vertices(nir_builder *b);
1737

1738
static bool
1739
create_pipeline(struct v3dv_device *device,
1740
                struct v3dv_render_pass *pass,
1741
                struct nir_shader *vs_nir,
1742
                struct nir_shader *gs_nir,
1743
                struct nir_shader *fs_nir,
1744
                const VkPipelineVertexInputStateCreateInfo *vi_state,
1745
                const VkPipelineDepthStencilStateCreateInfo *ds_state,
1746
                const VkPipelineColorBlendStateCreateInfo *cb_state,
1747
                const VkPipelineMultisampleStateCreateInfo *ms_state,
1748
                const VkPipelineLayout layout,
1749
                VkPipeline *pipeline);
1750

1751
static nir_shader *
1752
get_texel_buffer_copy_vs()
1753
{
1754
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1755
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
1756
                                                  "meta texel buffer copy vs");
1757
   nir_variable *vs_out_pos =
1758
      nir_variable_create(b.shader, nir_var_shader_out,
1759
                          glsl_vec4_type(), "gl_Position");
1760
   vs_out_pos->data.location = VARYING_SLOT_POS;
1761

1762
   nir_ssa_def *pos = gen_rect_vertices(&b);
1763
   nir_store_var(&b, vs_out_pos, pos, 0xf);
1764

1765
   return b.shader;
1766
}
1767

1768
static nir_shader *
1769
get_texel_buffer_copy_gs()
1770
{
1771
   /* FIXME: this creates a geometry shader that takes the index of a single
1772
    * layer to clear from push constants, so we need to emit a draw call for
1773
    * each layer that we want to clear. We could actually do better and have it
1774
    * take a range of layers however, if we were to do this, we would need to
1775
    * be careful not to exceed the maximum number of output vertices allowed in
1776
    * a geometry shader.
1777
    */
1778
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1779
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
1780
                                                  "meta texel buffer copy gs");
1781
   nir_shader *nir = b.shader;
1782
   nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
1783
   nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
1784
                               (1ull << VARYING_SLOT_LAYER);
1785
   nir->info.gs.input_primitive = GL_TRIANGLES;
1786
   nir->info.gs.output_primitive = GL_TRIANGLE_STRIP;
1787
   nir->info.gs.vertices_in = 3;
1788
   nir->info.gs.vertices_out = 3;
1789
   nir->info.gs.invocations = 1;
1790
   nir->info.gs.active_stream_mask = 0x1;
1791

1792
   /* in vec4 gl_Position[3] */
1793
   nir_variable *gs_in_pos =
1794
      nir_variable_create(b.shader, nir_var_shader_in,
1795
                          glsl_array_type(glsl_vec4_type(), 3, 0),
1796
                          "in_gl_Position");
1797
   gs_in_pos->data.location = VARYING_SLOT_POS;
1798

1799
   /* out vec4 gl_Position */
1800
   nir_variable *gs_out_pos =
1801
      nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
1802
                          "out_gl_Position");
1803
   gs_out_pos->data.location = VARYING_SLOT_POS;
1804

1805
   /* out float gl_Layer */
1806
   nir_variable *gs_out_layer =
1807
      nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
1808
                          "out_gl_Layer");
1809
   gs_out_layer->data.location = VARYING_SLOT_LAYER;
1810

1811
   /* Emit output triangle */
1812
   for (uint32_t i = 0; i < 3; i++) {
1813
      /* gl_Position from shader input */
1814
      nir_deref_instr *in_pos_i =
1815
         nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
1816
      nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
1817

1818
      /* gl_Layer from push constants */
1819
      nir_ssa_def *layer =
1820
         nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1821
                                .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
1822
                                .range = 4);
1823
      nir_store_var(&b, gs_out_layer, layer, 0x1);
1824

1825
      nir_emit_vertex(&b, 0);
1826
   }
1827

1828
   nir_end_primitive(&b, 0);
1829

1830
   return nir;
1831
}
1832

1833
static nir_ssa_def *
1834
load_frag_coord(nir_builder *b)
1835
{
1836
   nir_foreach_shader_in_variable(var, b->shader) {
1837
      if (var->data.location == VARYING_SLOT_POS)
1838
         return nir_load_var(b, var);
1839
   }
1840
   nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
1841
                                           glsl_vec4_type(), NULL);
1842
   pos->data.location = VARYING_SLOT_POS;
1843
   return nir_load_var(b, pos);
1844
}
1845

1846
static uint32_t
1847
component_swizzle_to_nir_swizzle(VkComponentSwizzle comp, VkComponentSwizzle swz)
1848
{
1849
   if (swz == VK_COMPONENT_SWIZZLE_IDENTITY)
1850
      swz = comp;
1851

1852
   switch (swz) {
1853
   case VK_COMPONENT_SWIZZLE_R:
1854
      return 0;
1855
   case VK_COMPONENT_SWIZZLE_G:
1856
      return 1;
1857
   case VK_COMPONENT_SWIZZLE_B:
1858
      return 2;
1859
   case VK_COMPONENT_SWIZZLE_A:
1860
      return 3;
1861
   default:
1862
      unreachable("Invalid swizzle");
1863
   };
1864
}
1865

1866
static nir_shader *
1867
get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format,
1868
                         VkComponentMapping *cswizzle)
1869
{
1870
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1871
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
1872
                                                  "meta texel buffer copy fs");
1873

1874
   /* We only use the copy from texel buffer shader to implement
1875
    * copy_buffer_to_image_shader, which always selects a compatible integer
1876
    * format for the copy.
1877
    */
1878
   assert(vk_format_is_int(format));
1879

1880
   /* Fragment shader output color */
1881
   nir_variable *fs_out_color =
1882
      nir_variable_create(b.shader, nir_var_shader_out,
1883
                          glsl_uvec4_type(), "out_color");
1884
   fs_out_color->data.location = FRAG_RESULT_DATA0;
1885

1886
   /* Texel buffer input */
1887
   const struct glsl_type *sampler_type =
1888
      glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
1889
   nir_variable *sampler =
1890
      nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
1891
   sampler->data.descriptor_set = 0;
1892
   sampler->data.binding = 0;
1893

1894
   /* Load the box describing the pixel region we want to copy from the
1895
    * texel buffer.
1896
    */
1897
   nir_ssa_def *box =
1898
      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
1899
                             .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
1900
                             .range = 16);
1901

1902
   /* Load the buffer stride (this comes in texel units) */
1903
   nir_ssa_def *stride =
1904
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1905
                             .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
1906
                             .range = 4);
1907

1908
   /* Load the buffer offset (this comes in texel units) */
1909
   nir_ssa_def *offset =
1910
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1911
                             .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
1912
                             .range = 4);
1913

1914
   nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b));
1915

1916
   /* Load pixel data from texel buffer based on the x,y offset of the pixel
1917
    * within the box. Texel buffers are 1D arrays of texels.
1918
    *
1919
    * Notice that we already make sure that we only generate fragments that are
1920
    * inside the box through the scissor/viewport state, so our offset into the
1921
    * texel buffer should always be within its bounds and we we don't need
1922
    * to add a check for that here.
1923
    */
1924
   nir_ssa_def *x_offset =
1925
      nir_isub(&b, nir_channel(&b, coord, 0),
1926
                   nir_channel(&b, box, 0));
1927
   nir_ssa_def *y_offset =
1928
      nir_isub(&b, nir_channel(&b, coord, 1),
1929
                   nir_channel(&b, box, 1));
1930
   nir_ssa_def *texel_offset =
1931
      nir_iadd(&b, nir_iadd(&b, offset, x_offset),
1932
                   nir_imul(&b, y_offset, stride));
1933

1934
   nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
1935
   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
1936
   tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
1937
   tex->op = nir_texop_txf;
1938
   tex->src[0].src_type = nir_tex_src_coord;
1939
   tex->src[0].src = nir_src_for_ssa(texel_offset);
1940
   tex->src[1].src_type = nir_tex_src_texture_deref;
1941
   tex->src[1].src = nir_src_for_ssa(tex_deref);
1942
   tex->dest_type = nir_type_uint32;
1943
   tex->is_array = false;
1944
   tex->coord_components = 1;
1945
   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result");
1946
   nir_builder_instr_insert(&b, &tex->instr);
1947

1948
   uint32_t swiz[4];
1949
   swiz[0] =
1950
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_R, cswizzle->r);
1951
   swiz[1] =
1952
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_G, cswizzle->g);
1953
   swiz[2] =
1954
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
1955
   swiz[3] =
1956
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
1957
   nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4);
1958
   nir_store_var(&b, fs_out_color, s, 0xf);
1959

1960
   return b.shader;
1961
}
1962

1963
static bool
1964
create_texel_buffer_copy_pipeline(struct v3dv_device *device,
1965
                                  VkFormat format,
1966
                                  VkColorComponentFlags cmask,
1967
                                  VkComponentMapping *cswizzle,
1968
                                  bool is_layered,
1969
                                  VkRenderPass _pass,
1970
                                  VkPipelineLayout pipeline_layout,
1971
                                  VkPipeline *pipeline)
1972
{
1973
   struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
1974

1975
   assert(vk_format_is_color(format));
1976

1977
   nir_shader *vs_nir = get_texel_buffer_copy_vs();
1978
   nir_shader *fs_nir = get_texel_buffer_copy_fs(device, format, cswizzle);
1979
   nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs() : NULL;
1980

1981
   const VkPipelineVertexInputStateCreateInfo vi_state = {
1982
      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
1983
      .vertexBindingDescriptionCount = 0,
1984
      .vertexAttributeDescriptionCount = 0,
1985
   };
1986

1987
   VkPipelineDepthStencilStateCreateInfo ds_state = {
1988
      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
1989
   };
1990

1991
   VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
1992
   blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
1993
      .blendEnable = false,
1994
      .colorWriteMask = cmask,
1995
   };
1996

1997
   const VkPipelineColorBlendStateCreateInfo cb_state = {
1998
      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
1999
      .logicOpEnable = false,
2000
      .attachmentCount = 1,
2001
      .pAttachments = blend_att_state
2002
   };
2003

2004
   const VkPipelineMultisampleStateCreateInfo ms_state = {
2005
      .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
2006
      .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
2007
      .sampleShadingEnable = false,
2008
      .pSampleMask = NULL,
2009
      .alphaToCoverageEnable = false,
2010
      .alphaToOneEnable = false,
2011
   };
2012

2013
   return create_pipeline(device,
2014
                          pass,
2015
                          vs_nir, gs_nir, fs_nir,
2016
                          &vi_state,
2017
                          &ds_state,
2018
                          &cb_state,
2019
                          &ms_state,
2020
                          pipeline_layout,
2021
                          pipeline);
2022
}
2023

2024
static bool
2025
get_copy_texel_buffer_pipeline(
2026
   struct v3dv_device *device,
2027
   VkFormat format,
2028
   VkColorComponentFlags cmask,
2029
   VkComponentMapping *cswizzle,
2030
   VkImageType image_type,
2031
   bool is_layered,
2032
   struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
2033
{
2034
   bool ok = true;
2035

2036
   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
2037
   get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered,
2038
                                            key);
2039

2040
   mtx_lock(&device->meta.mtx);
2041
   struct hash_entry *entry =
2042
      _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
2043
                              &key);
2044
   if (entry) {
2045
      mtx_unlock(&device->meta.mtx);
2046
      *pipeline = entry->data;
2047
      return true;
2048
   }
2049

2050
   *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
2051
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
2052

2053
   if (*pipeline == NULL)
2054
      goto fail;
2055

2056
   /* The blit render pass is compatible */
2057
   ok = create_blit_render_pass(device, format, format,
2058
                                &(*pipeline)->pass,
2059
                                &(*pipeline)->pass_no_load);
2060
   if (!ok)
2061
      goto fail;
2062

2063
   ok =
2064
      create_texel_buffer_copy_pipeline(device,
2065
                                        format, cmask, cswizzle, is_layered,
2066
                                        (*pipeline)->pass,
2067
                                        device->meta.texel_buffer_copy.p_layout,
2068
                                        &(*pipeline)->pipeline);
2069
   if (!ok)
2070
      goto fail;
2071

2072
   _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
2073
                           &key, *pipeline);
2074

2075
   mtx_unlock(&device->meta.mtx);
2076
   return true;
2077

2078
fail:
2079
   mtx_unlock(&device->meta.mtx);
2080

2081
   VkDevice _device = v3dv_device_to_handle(device);
2082
   if (*pipeline) {
2083
      if ((*pipeline)->pass)
2084
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
2085
      if ((*pipeline)->pipeline)
2086
         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
2087
      vk_free(&device->vk.alloc, *pipeline);
2088
      *pipeline = NULL;
2089
   }
2090

2091
   return false;
2092
}
2093

2094
static bool
2095
texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
2096
                         VkImageAspectFlags aspect,
2097
                         struct v3dv_image *image,
2098
                         VkFormat dst_format,
2099
                         VkFormat src_format,
2100
                         struct v3dv_buffer *buffer,
2101
                         uint32_t buffer_bpp,
2102
                         VkColorComponentFlags cmask,
2103
                         VkComponentMapping *cswizzle,
2104
                         uint32_t region_count,
2105
                         const VkBufferImageCopy2KHR *regions)
2106
{
2107
   VkResult result;
2108
   bool handled = false;
2109

2110
   assert(cswizzle);
2111

2112
   /* This is a copy path, so we don't handle format conversions. The only
2113
    * exception are stencil to D24S8 copies, which are handled as a color
2114
    * masked R8->RGBA8 copy.
2115
    */
2116
   assert(src_format == dst_format ||
2117
          (dst_format == VK_FORMAT_R8G8B8A8_UINT &&
2118
           src_format == VK_FORMAT_R8_UINT &&
2119
           cmask == VK_COLOR_COMPONENT_R_BIT));
2120

2121
   /* We only handle color copies. Callers can copy D/S aspects by using
2122
    * a compatible color format and maybe a cmask/cswizzle for D24 formats.
2123
    */
2124
   if (aspect != VK_IMAGE_ASPECT_COLOR_BIT)
2125
      return handled;
2126

2127
   /* FIXME: we only handle uncompressed images for now. */
2128
   if (vk_format_is_compressed(image->vk_format))
2129
      return handled;
2130

2131
   const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
2132
                                            VK_COLOR_COMPONENT_G_BIT |
2133
                                            VK_COLOR_COMPONENT_B_BIT |
2134
                                            VK_COLOR_COMPONENT_A_BIT;
2135
   if (cmask == 0)
2136
      cmask = full_cmask;
2137

2138
   /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
2139
    * so we can bind it as a texel buffer. Otherwise, the buffer view
2140
    * we create below won't setup the texture state that we need for this.
2141
    */
2142
   if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
2143
      if (v3dv_buffer_format_supports_features(
2144
             cmd_buffer->device, src_format,
2145
             VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) {
2146
         buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
2147
      } else {
2148
         return handled;
2149
      }
2150
   }
2151

2152
   /* At this point we should be able to handle the copy unless an unexpected
2153
    * error occurs, such as an OOM.
2154
    */
2155
   handled = true;
2156

2157

2158
   /* Compute the number of layers to copy.
2159
    *
2160
    * If we are batching (region_count > 1) all our regions have the same
2161
    * image subresource so we can take this from the first region. For 3D
2162
    * images we require the same depth extent.
2163
    */
2164
   const VkImageSubresourceLayers *resource = &regions[0].imageSubresource;
2165
   uint32_t num_layers;
2166
   if (image->type != VK_IMAGE_TYPE_3D) {
2167
      num_layers = resource->layerCount;
2168
   } else {
2169
      assert(region_count == 1);
2170
      num_layers = regions[0].imageExtent.depth;
2171
   }
2172
   assert(num_layers > 0);
2173

2174
   /* Get the texel buffer copy pipeline */
2175
   struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
2176
   bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device,
2177
                                            dst_format, cmask, cswizzle,
2178
                                            image->type, num_layers > 1,
2179
                                            &pipeline);
2180
   if (!ok)
2181
      return handled;
2182
   assert(pipeline && pipeline->pipeline && pipeline->pass);
2183

2184
   /* Setup descriptor set for the source texel buffer. We don't have to
2185
    * register the descriptor as a private command buffer object since
2186
    * all descriptors will be freed automatically with the descriptor
2187
    * pool.
2188
    */
2189
   VkDescriptorSet set;
2190
   result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
2191
   if (result != VK_SUCCESS)
2192
      return handled;
2193

2194
   /* FIXME: for some reason passing region->bufferOffset here for the
2195
    * offset field doesn't work, making the following CTS tests fail:
2196
    *
2197
    * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset*
2198
    *
2199
    * So instead we pass 0 here and we pass the offset in texels as a push
2200
    * constant to the shader, which seems to work correctly.
2201
    */
2202
   VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
2203
   VkBufferViewCreateInfo buffer_view_info = {
2204
      .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
2205
      .buffer = v3dv_buffer_to_handle(buffer),
2206
      .format = src_format,
2207
      .offset = 0,
2208
      .range = VK_WHOLE_SIZE,
2209
   };
2210

2211
   VkBufferView texel_buffer_view;
2212
   result = v3dv_CreateBufferView(_device, &buffer_view_info,
2213
                                  &cmd_buffer->device->vk.alloc,
2214
                                  &texel_buffer_view);
2215
   if (result != VK_SUCCESS)
2216
      return handled;
2217

2218
   v3dv_cmd_buffer_add_private_obj(
2219
      cmd_buffer, (uintptr_t)texel_buffer_view,
2220
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
2221

2222
   VkWriteDescriptorSet write = {
2223
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
2224
      .dstSet = set,
2225
      .dstBinding = 0,
2226
      .dstArrayElement = 0,
2227
      .descriptorCount = 1,
2228
      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2229
      .pTexelBufferView = &texel_buffer_view,
2230
   };
2231
   v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
2232

2233
   /* Push command buffer state before starting meta operation */
2234
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
2235
   uint32_t dirty_dynamic_state = 0;
2236

2237
   /* Bind common state for all layers and regions  */
2238
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2239
   v3dv_CmdBindPipeline(_cmd_buffer,
2240
                        VK_PIPELINE_BIND_POINT_GRAPHICS,
2241
                        pipeline->pipeline);
2242

2243
   v3dv_CmdBindDescriptorSets(_cmd_buffer,
2244
                              VK_PIPELINE_BIND_POINT_GRAPHICS,
2245
                              cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2246
                              0, 1, &set,
2247
                              0, NULL);
2248

2249
   /* Setup framebuffer.
2250
    *
2251
    * For 3D images, this creates a layered framebuffer with a number of
2252
    * layers matching the depth extent of the 3D image.
2253
    */
2254
   uint32_t fb_width = u_minify(image->extent.width, resource->mipLevel);
2255
   uint32_t fb_height = u_minify(image->extent.height, resource->mipLevel);
2256
   VkImageViewCreateInfo image_view_info = {
2257
      .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2258
      .image = v3dv_image_to_handle(image),
2259
      .viewType = v3dv_image_type_to_view_type(image->type),
2260
      .format = dst_format,
2261
      .subresourceRange = {
2262
         .aspectMask = aspect,
2263
         .baseMipLevel = resource->mipLevel,
2264
         .levelCount = 1,
2265
         .baseArrayLayer = resource->baseArrayLayer,
2266
         .layerCount = num_layers,
2267
      },
2268
   };
2269
   VkImageView image_view;
2270
   result = v3dv_CreateImageView(_device, &image_view_info,
2271
                                 &cmd_buffer->device->vk.alloc, &image_view);
2272
   if (result != VK_SUCCESS)
2273
      goto fail;
2274

2275
   v3dv_cmd_buffer_add_private_obj(
2276
      cmd_buffer, (uintptr_t)image_view,
2277
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
2278

2279
   VkFramebufferCreateInfo fb_info = {
2280
      .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
2281
      .renderPass = pipeline->pass,
2282
      .attachmentCount = 1,
2283
      .pAttachments = &image_view,
2284
      .width = fb_width,
2285
      .height = fb_height,
2286
      .layers = num_layers,
2287
   };
2288

2289
   VkFramebuffer fb;
2290
   result = v3dv_CreateFramebuffer(_device, &fb_info,
2291
                                   &cmd_buffer->device->vk.alloc, &fb);
2292
   if (result != VK_SUCCESS)
2293
      goto fail;
2294

2295
    v3dv_cmd_buffer_add_private_obj(
2296
       cmd_buffer, (uintptr_t)fb,
2297
       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
2298

2299
   /* For each layer */
2300
   for (uint32_t l = 0; l < num_layers; l++) {
2301
       /* Start render pass for this layer.
2302
        *
2303
        * If the we only have one region to copy, then we might be able to
2304
        * skip the TLB load if it is aligned to tile boundaries. All layers
2305
        * copy the same area, so we only need to check this once.
2306
        */
2307
      bool can_skip_tlb_load = false;
2308
      VkRect2D render_area;
2309
      if (region_count == 1) {
2310
         render_area.offset.x = regions[0].imageOffset.x;
2311
         render_area.offset.y = regions[0].imageOffset.y;
2312
         render_area.extent.width = regions[0].imageExtent.width;
2313
         render_area.extent.height = regions[0].imageExtent.height;
2314

2315
         if (l == 0) {
2316
            struct v3dv_render_pass *pipeline_pass =
2317
               v3dv_render_pass_from_handle(pipeline->pass);
2318
            can_skip_tlb_load =
2319
               cmask == full_cmask &&
2320
               v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
2321
                                                 v3dv_framebuffer_from_handle(fb),
2322
                                                 pipeline_pass, 0);
2323
         }
2324
      } else {
2325
         render_area.offset.x = 0;
2326
         render_area.offset.y = 0;
2327
         render_area.extent.width = fb_width;
2328
         render_area.extent.height = fb_height;
2329
      }
2330

2331
      VkRenderPassBeginInfo rp_info = {
2332
         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
2333
         .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
2334
                                           pipeline->pass,
2335
         .framebuffer = fb,
2336
         .renderArea = render_area,
2337
         .clearValueCount = 0,
2338
      };
2339

2340
      v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
2341
      struct v3dv_job *job = cmd_buffer->state.job;
2342
      if (!job)
2343
         goto fail;
2344

2345
      /* If we are using a layered copy we need to specify the layer for the
2346
       * Geometry Shader.
2347
       */
2348
      if (num_layers > 1) {
2349
         uint32_t layer = resource->baseArrayLayer + l;
2350
         v3dv_CmdPushConstants(_cmd_buffer,
2351
                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2352
                               VK_SHADER_STAGE_GEOMETRY_BIT,
2353
                               24, 4, &layer);
2354
      }
2355

2356
      /* For each region */
2357
      dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
2358
      for (uint32_t r = 0; r < region_count; r++) {
2359
         const VkBufferImageCopy2KHR *region = &regions[r];
2360

2361
         /* Obtain the 2D buffer region spec */
2362
         uint32_t buf_width, buf_height;
2363
         if (region->bufferRowLength == 0)
2364
             buf_width = region->imageExtent.width;
2365
         else
2366
             buf_width = region->bufferRowLength;
2367

2368
         if (region->bufferImageHeight == 0)
2369
             buf_height = region->imageExtent.height;
2370
         else
2371
             buf_height = region->bufferImageHeight;
2372

2373
         const VkViewport viewport = {
2374
            .x = region->imageOffset.x,
2375
            .y = region->imageOffset.y,
2376
            .width = region->imageExtent.width,
2377
            .height = region->imageExtent.height,
2378
            .minDepth = 0.0f,
2379
            .maxDepth = 1.0f
2380
         };
2381
         v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
2382
         const VkRect2D scissor = {
2383
            .offset = { region->imageOffset.x, region->imageOffset.y },
2384
            .extent = { region->imageExtent.width, region->imageExtent.height }
2385
         };
2386
         v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
2387

2388
         const VkDeviceSize buf_offset =
2389
            region->bufferOffset / buffer_bpp  + l * buf_height * buf_width;
2390
         uint32_t push_data[6] = {
2391
            region->imageOffset.x,
2392
            region->imageOffset.y,
2393
            region->imageOffset.x + region->imageExtent.width - 1,
2394
            region->imageOffset.y + region->imageExtent.height - 1,
2395
            buf_width,
2396
            buf_offset,
2397
         };
2398

2399
         v3dv_CmdPushConstants(_cmd_buffer,
2400
                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2401
                               VK_SHADER_STAGE_FRAGMENT_BIT,
2402
                               0, sizeof(push_data), &push_data);
2403

2404
         v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
2405
      } /* For each region */
2406

2407
      v3dv_CmdEndRenderPass(_cmd_buffer);
2408
   } /* For each layer */
2409

2410
fail:
2411
   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
2412
   return handled;
2413
}
2414

2415
/**
2416
 * Returns true if the implementation supports the requested operation (even if
2417
 * it failed to process it, for example, due to an out-of-memory error).
2418
 */
2419
static bool
2420
copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2421
                          VkImageAspectFlags aspect,
2422
                          struct v3dv_image *image,
2423
                          VkFormat dst_format,
2424
                          VkFormat src_format,
2425
                          struct v3dv_buffer *buffer,
2426
                          uint32_t buffer_bpp,
2427
                          VkColorComponentFlags cmask,
2428
                          VkComponentMapping *cswizzle,
2429
                          uint32_t region_count,
2430
                          const VkBufferImageCopy2KHR *regions)
2431
{
2432
   /* Since we can't sample linear images we need to upload the linear
2433
    * buffer to a tiled image that we can use as a blit source, which
2434
    * is slow.
2435
    */
2436
   perf_debug("Falling back to blit path for buffer to image copy.\n");
2437

2438
   struct v3dv_device *device = cmd_buffer->device;
2439
   VkDevice _device = v3dv_device_to_handle(device);
2440
   bool handled = true;
2441

2442
   /* Allocate memory for the tiled image. Since we copy layer by layer
2443
    * we allocate memory to hold a full layer, which is the worse case.
2444
    * For that we create a dummy image with that spec, get memory requirements
2445
    * for it and use that information to create the memory allocation.
2446
    * We will then reuse this memory store for all the regions we want to
2447
    * copy.
2448
    */
2449
   VkImage dummy_image;
2450
   VkImageCreateInfo dummy_info = {
2451
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2452
      .imageType = VK_IMAGE_TYPE_2D,
2453
      .format = src_format,
2454
      .extent = { image->extent.width, image->extent.height, 1 },
2455
      .mipLevels = 1,
2456
      .arrayLayers = 1,
2457
      .samples = VK_SAMPLE_COUNT_1_BIT,
2458
      .tiling = VK_IMAGE_TILING_OPTIMAL,
2459
      .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2460
               VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2461
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2462
      .queueFamilyIndexCount = 0,
2463
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2464
   };
2465
   VkResult result =
2466
      v3dv_CreateImage(_device, &dummy_info, &device->vk.alloc, &dummy_image);
2467
   if (result != VK_SUCCESS)
2468
      return handled;
2469

2470
   VkMemoryRequirements reqs;
2471
   vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs);
2472
   v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc);
2473

2474
   VkDeviceMemory mem;
2475
   VkMemoryAllocateInfo alloc_info = {
2476
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2477
      .allocationSize = reqs.size,
2478
      .memoryTypeIndex = 0,
2479
   };
2480
   result = v3dv_AllocateMemory(_device, &alloc_info, &device->vk.alloc, &mem);
2481
   if (result != VK_SUCCESS)
2482
      return handled;
2483

2484
   v3dv_cmd_buffer_add_private_obj(
2485
      cmd_buffer, (uintptr_t)mem,
2486
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2487

2488
   /* Obtain the layer count.
2489
    *
2490
    * If we are batching (region_count > 1) all our regions have the same
2491
    * image subresource so we can take this from the first region.
2492
    */
2493
   uint32_t num_layers;
2494
   if (image->type != VK_IMAGE_TYPE_3D)
2495
      num_layers = regions[0].imageSubresource.layerCount;
2496
   else
2497
      num_layers = regions[0].imageExtent.depth;
2498
   assert(num_layers > 0);
2499

2500
   /* Sanity check: we can only batch multiple regions together if they have
2501
    * the same framebuffer (so the same layer).
2502
    */
2503
   assert(num_layers == 1 || region_count == 1);
2504

2505
   const uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
2506
   const uint32_t block_height = vk_format_get_blockheight(image->vk_format);
2507

2508
   /* Copy regions by uploading each region to a temporary tiled image using
2509
    * the memory we have just allocated as storage.
2510
    */
2511
   for (uint32_t r = 0; r < region_count; r++) {
2512
      const VkBufferImageCopy2KHR *region = &regions[r];
2513

2514
      /* Obtain the 2D buffer region spec */
2515
      uint32_t buf_width, buf_height;
2516
      if (region->bufferRowLength == 0)
2517
          buf_width = region->imageExtent.width;
2518
      else
2519
          buf_width = region->bufferRowLength;
2520

2521
      if (region->bufferImageHeight == 0)
2522
          buf_height = region->imageExtent.height;
2523
      else
2524
          buf_height = region->bufferImageHeight;
2525

2526
      /* If the image is compressed, the bpp refers to blocks, not pixels */
2527
      buf_width = buf_width / block_width;
2528
      buf_height = buf_height / block_height;
2529

2530
      for (uint32_t i = 0; i < num_layers; i++) {
2531
         /* Create the tiled image */
2532
         VkImageCreateInfo image_info = {
2533
            .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2534
            .imageType = VK_IMAGE_TYPE_2D,
2535
            .format = src_format,
2536
            .extent = { buf_width, buf_height, 1 },
2537
            .mipLevels = 1,
2538
            .arrayLayers = 1,
2539
            .samples = VK_SAMPLE_COUNT_1_BIT,
2540
            .tiling = VK_IMAGE_TILING_OPTIMAL,
2541
            .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2542
                     VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2543
            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2544
            .queueFamilyIndexCount = 0,
2545
            .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2546
         };
2547

2548
         VkImage buffer_image;
2549
         VkResult result =
2550
            v3dv_CreateImage(_device, &image_info, &device->vk.alloc,
2551
                             &buffer_image);
2552
         if (result != VK_SUCCESS)
2553
            return handled;
2554

2555
         v3dv_cmd_buffer_add_private_obj(
2556
            cmd_buffer, (uintptr_t)buffer_image,
2557
            (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
2558

2559
         result = vk_common_BindImageMemory(_device, buffer_image, mem, 0);
2560
         if (result != VK_SUCCESS)
2561
            return handled;
2562

2563
         /* Upload buffer contents for the selected layer */
2564
         const VkDeviceSize buf_offset_bytes =
2565
            region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
2566
         const VkBufferImageCopy2KHR buffer_image_copy = {
2567
            .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR,
2568
            .bufferOffset = buf_offset_bytes,
2569
            .bufferRowLength = region->bufferRowLength / block_width,
2570
            .bufferImageHeight = region->bufferImageHeight / block_height,
2571
            .imageSubresource = {
2572
               .aspectMask = aspect,
2573
               .mipLevel = 0,
2574
               .baseArrayLayer = 0,
2575
               .layerCount = 1,
2576
            },
2577
            .imageOffset = { 0, 0, 0 },
2578
            .imageExtent = { buf_width, buf_height, 1 }
2579
         };
2580
         handled =
2581
            create_tiled_image_from_buffer(cmd_buffer,
2582
                                           v3dv_image_from_handle(buffer_image),
2583
                                           buffer, &buffer_image_copy);
2584
         if (!handled) {
2585
            /* This is unexpected, we should have setup the upload to be
2586
             * conformant to a TFU or TLB copy.
2587
             */
2588
            unreachable("Unable to copy buffer to image through TLB");
2589
            return false;
2590
         }
2591

2592
         /* Blit-copy the requested image extent from the buffer image to the
2593
          * destination image.
2594
          *
2595
          * Since we are copying, the blit must use the same format on the
2596
          * destination and source images to avoid format conversions. The
2597
          * only exception is copying stencil, which we upload to a R8UI source
2598
          * image, but that we need to blit to a S8D24 destination (the only
2599
          * stencil format we support).
2600
          */
2601
         const VkImageBlit2KHR blit_region = {
2602
            .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
2603
            .srcSubresource = {
2604
               .aspectMask = aspect,
2605
               .mipLevel = 0,
2606
               .baseArrayLayer = 0,
2607
               .layerCount = 1,
2608
            },
2609
            .srcOffsets = {
2610
               { 0, 0, 0 },
2611
               { region->imageExtent.width, region->imageExtent.height, 1 },
2612
            },
2613
            .dstSubresource = {
2614
               .aspectMask = aspect,
2615
               .mipLevel = region->imageSubresource.mipLevel,
2616
               .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
2617
               .layerCount = 1,
2618
            },
2619
            .dstOffsets = {
2620
               {
2621
                  DIV_ROUND_UP(region->imageOffset.x, block_width),
2622
                  DIV_ROUND_UP(region->imageOffset.y, block_height),
2623
                  region->imageOffset.z + i,
2624
               },
2625
               {
2626
                  DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
2627
                               block_width),
2628
                  DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
2629
                               block_height),
2630
                  region->imageOffset.z + i + 1,
2631
               },
2632
            },
2633
         };
2634

2635
         handled = blit_shader(cmd_buffer,
2636
                               image, dst_format,
2637
                               v3dv_image_from_handle(buffer_image), src_format,
2638
                               cmask, cswizzle,
2639
                               &blit_region, VK_FILTER_NEAREST, true);
2640
         if (!handled) {
2641
            /* This is unexpected, we should have a supported blit spec */
2642
            unreachable("Unable to blit buffer to destination image");
2643
            return false;
2644
         }
2645
      }
2646
   }
2647

2648
   return handled;
2649
}
2650

2651
/**
2652
 * Returns true if the implementation supports the requested operation (even if
2653
 * it failed to process it, for example, due to an out-of-memory error).
2654
 */
2655
static bool
2656
copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
2657
                            struct v3dv_image *image,
2658
                            struct v3dv_buffer *buffer,
2659
                            uint32_t region_count,
2660
                            const VkBufferImageCopy2KHR *regions,
2661
                            bool use_texel_buffer)
2662
{
2663
   /* We can only call this with region_count > 1 if we can batch the regions
2664
    * together, in which case they share the same image subresource, and so
2665
    * the same aspect.
2666
    */
2667
   VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
2668

2669
   /* Generally, the bpp of the data in the buffer matches that of the
2670
    * destination image. The exception is the case where we are uploading
2671
    * stencil (8bpp) to a combined d24s8 image (32bpp).
2672
    */
2673
   uint32_t buf_bpp = image->cpp;
2674

2675
   /* We are about to upload the buffer data to an image so we can then
2676
    * blit that to our destination region. Because we are going to implement
2677
    * the copy as a blit, we want our blit source and destination formats to be
2678
    * the same (to avoid any format conversions), so we choose a canonical
2679
    * format that matches the destination image bpp.
2680
    */
2681
   VkComponentMapping ident_swizzle = {
2682
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
2683
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
2684
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
2685
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
2686
   };
2687

2688
   VkComponentMapping cswizzle = ident_swizzle;
2689
   VkColorComponentFlags cmask = 0; /* Write all components */
2690
   VkFormat src_format;
2691
   VkFormat dst_format;
2692
   switch (buf_bpp) {
2693
   case 16:
2694
      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2695
      src_format = VK_FORMAT_R32G32B32A32_UINT;
2696
      dst_format = src_format;
2697
      break;
2698
   case 8:
2699
      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2700
      src_format = VK_FORMAT_R16G16B16A16_UINT;
2701
      dst_format = src_format;
2702
      break;
2703
   case 4:
2704
      switch (aspect) {
2705
      case VK_IMAGE_ASPECT_COLOR_BIT:
2706
         src_format = VK_FORMAT_R8G8B8A8_UINT;
2707
         dst_format = src_format;
2708
         break;
2709
      case VK_IMAGE_ASPECT_DEPTH_BIT:
2710
         assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
2711
                image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
2712
                image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
2713
         src_format = VK_FORMAT_R8G8B8A8_UINT;
2714
         dst_format = src_format;
2715
         aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2716

2717
         /* For D24 formats, the Vulkan spec states that the depth component
2718
          * in the buffer is stored in the 24-LSB, but V3D wants it in the
2719
          * 24-MSB.
2720
          */
2721
         if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
2722
             image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) {
2723
            cmask = VK_COLOR_COMPONENT_G_BIT |
2724
                    VK_COLOR_COMPONENT_B_BIT |
2725
                    VK_COLOR_COMPONENT_A_BIT;
2726
            cswizzle.r = VK_COMPONENT_SWIZZLE_R;
2727
            cswizzle.g = VK_COMPONENT_SWIZZLE_R;
2728
            cswizzle.b = VK_COMPONENT_SWIZZLE_G;
2729
            cswizzle.a = VK_COMPONENT_SWIZZLE_B;
2730
         }
2731
         break;
2732
      case VK_IMAGE_ASPECT_STENCIL_BIT:
2733
         /* Since we don't support separate stencil this is always a stencil
2734
          * copy to a combined depth/stencil image. Because we don't support
2735
          * separate stencil images, we interpret the buffer data as a
2736
          * color R8UI image, and implement the blit as a compatible color
2737
          * blit to an RGBA8UI destination masking out writes to components
2738
          * GBA (which map to the D24 component of a S8D24 image).
2739
          */
2740
         assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
2741
         buf_bpp = 1;
2742
         src_format = VK_FORMAT_R8_UINT;
2743
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
2744
         cmask = VK_COLOR_COMPONENT_R_BIT;
2745
         aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2746
         break;
2747
      default:
2748
         unreachable("unsupported aspect");
2749
         return false;
2750
      };
2751
      break;
2752
   case 2:
2753
      aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2754
      src_format = VK_FORMAT_R16_UINT;
2755
      dst_format = src_format;
2756
      break;
2757
   case 1:
2758
      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2759
      src_format = VK_FORMAT_R8_UINT;
2760
      dst_format = src_format;
2761
      break;
2762
   default:
2763
      unreachable("unsupported bit-size");
2764
      return false;
2765
   }
2766

2767
   if (use_texel_buffer) {
2768
      return texel_buffer_shader_copy(cmd_buffer, aspect, image,
2769
                                      dst_format, src_format,
2770
                                      buffer, buf_bpp,
2771
                                      cmask, &cswizzle,
2772
                                      region_count, regions);
2773
   } else {
2774
      return copy_buffer_to_image_blit(cmd_buffer, aspect, image,
2775
                                       dst_format, src_format,
2776
                                       buffer, buf_bpp,
2777
                                       cmask, &cswizzle,
2778
                                       region_count, regions);
2779
   }
2780
}
2781

2782
/**
2783
 * Returns true if the implementation supports the requested operation (even if
2784
 * it failed to process it, for example, due to an out-of-memory error).
2785
 */
2786
static bool
2787
copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
2788
                         struct v3dv_image *image,
2789
                         struct v3dv_buffer *buffer,
2790
                         const VkBufferImageCopy2KHR *region)
2791
{
2792
   /* FIXME */
2793
   if (vk_format_is_depth_or_stencil(image->vk_format))
2794
      return false;
2795

2796
   if (vk_format_is_compressed(image->vk_format))
2797
      return false;
2798

2799
   if (image->tiling == VK_IMAGE_TILING_LINEAR)
2800
      return false;
2801

2802
   uint32_t buffer_width, buffer_height;
2803
   if (region->bufferRowLength == 0)
2804
      buffer_width = region->imageExtent.width;
2805
   else
2806
      buffer_width = region->bufferRowLength;
2807

2808
   if (region->bufferImageHeight == 0)
2809
      buffer_height = region->imageExtent.height;
2810
   else
2811
      buffer_height = region->bufferImageHeight;
2812

2813
   uint32_t buffer_stride = buffer_width * image->cpp;
2814
   uint32_t buffer_layer_stride = buffer_stride * buffer_height;
2815

2816
   uint32_t num_layers;
2817
   if (image->type != VK_IMAGE_TYPE_3D)
2818
      num_layers = region->imageSubresource.layerCount;
2819
   else
2820
      num_layers = region->imageExtent.depth;
2821
   assert(num_layers > 0);
2822

2823
   struct v3dv_job *job =
2824
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
2825
                                     V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
2826
                                     cmd_buffer, -1);
2827
   if (!job)
2828
      return true;
2829

2830
   job->cpu.copy_buffer_to_image.image = image;
2831
   job->cpu.copy_buffer_to_image.buffer = buffer;
2832
   job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride;
2833
   job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride;
2834
   job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset;
2835
   job->cpu.copy_buffer_to_image.image_extent = region->imageExtent;
2836
   job->cpu.copy_buffer_to_image.image_offset = region->imageOffset;
2837
   job->cpu.copy_buffer_to_image.mip_level =
2838
      region->imageSubresource.mipLevel;
2839
   job->cpu.copy_buffer_to_image.base_layer =
2840
      region->imageSubresource.baseArrayLayer;
2841
   job->cpu.copy_buffer_to_image.layer_count = num_layers;
2842

2843
   list_addtail(&job->list_link, &cmd_buffer->jobs);
2844

2845
   return true;
2846
}
2847

2848
VKAPI_ATTR void VKAPI_CALL
2849
v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
2850
                              const VkCopyBufferToImageInfo2KHR *info)
2851
{
2852
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2853
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
2854
   V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage);
2855

2856
   assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
2857

2858
   uint32_t r = 0;
2859
   while (r < info->regionCount) {
2860
      /* The TFU and TLB paths can only copy one region at a time and the region
2861
       * needs to start at the origin. We try these first for the common case
2862
       * where we are copying full images, since they should be the fastest.
2863
       */
2864
      uint32_t batch_size = 1;
2865
      if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r]))
2866
         goto handled;
2867

2868
      if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r]))
2869
         goto handled;
2870

2871
      /* Otherwise, we are copying subrects, so we fallback to copying
2872
       * via shader and texel buffers and we try to batch the regions
2873
       * if possible. We can only batch copies if they have the same
2874
       * framebuffer spec, which is mostly determined by the image
2875
       * subresource of the region.
2876
       */
2877
      const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource;
2878
      for (uint32_t s = r + 1; s < info->regionCount; s++) {
2879
         const VkImageSubresourceLayers *rsc_s =
2880
            &info->pRegions[s].imageSubresource;
2881

2882
         if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0)
2883
            break;
2884

2885
         /* For 3D images we also need to check the depth extent */
2886
         if (image->type == VK_IMAGE_TYPE_3D &&
2887
             info->pRegions[s].imageExtent.depth !=
2888
             info->pRegions[r].imageExtent.depth) {
2889
               break;
2890
         }
2891

2892
         batch_size++;
2893
      }
2894

2895
      if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
2896
                                      batch_size, &info->pRegions[r], true)) {
2897
         goto handled;
2898
      }
2899

2900
      /* If we still could not copy, fallback to slower paths.
2901
       *
2902
       * FIXME: we could try to batch these too, but since they are bound to be
2903
       * slow it might not be worth it and we should instead put more effort
2904
       * in handling more cases with the other paths.
2905
       */
2906
      if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer,
2907
                                   &info->pRegions[r])) {
2908
         batch_size = 1;
2909
         goto handled;
2910
      }
2911

2912
      if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
2913
                                      batch_size, &info->pRegions[r], false)) {
2914
         goto handled;
2915
      }
2916

2917
      unreachable("Unsupported buffer to image copy.");
2918

2919
handled:
2920
      r += batch_size;
2921
   }
2922
}
2923

2924
static void
2925
compute_blit_3d_layers(const VkOffset3D *offsets,
2926
                       uint32_t *min_layer, uint32_t *max_layer,
2927
                       bool *mirror_z);
2928

2929
/**
2930
 * Returns true if the implementation supports the requested operation (even if
2931
 * it failed to process it, for example, due to an out-of-memory error).
2932
 *
2933
 * The TFU blit path doesn't handle scaling so the blit filter parameter can
2934
 * be ignored.
2935
 */
2936
static bool
2937
blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
2938
         struct v3dv_image *dst,
2939
         struct v3dv_image *src,
2940
         const VkImageBlit2KHR *region)
2941
{
2942
   assert(dst->samples == VK_SAMPLE_COUNT_1_BIT);
2943
   assert(src->samples == VK_SAMPLE_COUNT_1_BIT);
2944

2945
   /* Format must match */
2946
   if (src->vk_format != dst->vk_format)
2947
      return false;
2948

2949
   /* Destination can't be raster format */
2950
   if (dst->tiling == VK_IMAGE_TILING_LINEAR)
2951
      return false;
2952

2953
   /* Source region must start at (0,0) */
2954
   if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
2955
      return false;
2956

2957
   /* Destination image must be complete */
2958
   if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
2959
      return false;
2960

2961
   const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
2962
   const uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level);
2963
   const uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level);
2964
   if (region->dstOffsets[1].x < dst_width - 1||
2965
       region->dstOffsets[1].y < dst_height - 1) {
2966
      return false;
2967
   }
2968

2969
   /* No XY scaling */
2970
   if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
2971
       region->srcOffsets[1].y != region->dstOffsets[1].y) {
2972
      return false;
2973
   }
2974

2975
   /* If the format is D24S8 both aspects need to be copied, since the TFU
2976
    * can't be programmed to copy only one aspect of the image.
2977
    */
2978
   if (dst->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
2979
       const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
2980
                                             VK_IMAGE_ASPECT_STENCIL_BIT;
2981
       if (region->dstSubresource.aspectMask != ds_aspects)
2982
          return false;
2983
   }
2984

2985
   /* Our TFU blits only handle exact copies (it requires same formats
2986
    * on input and output, no scaling, etc), so there is no pixel format
2987
    * conversions and we can rewrite the format to use one that is TFU
2988
    * compatible based on its texel size.
2989
    */
2990
   const struct v3dv_format *format =
2991
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
2992
                                     dst->cpp, NULL);
2993

2994
   /* Emit a TFU job for each layer to blit */
2995
   assert(region->dstSubresource.layerCount ==
2996
          region->srcSubresource.layerCount);
2997

2998
   uint32_t min_dst_layer;
2999
   uint32_t max_dst_layer;
3000
   bool dst_mirror_z = false;
3001
   if (dst->type == VK_IMAGE_TYPE_3D) {
3002
      compute_blit_3d_layers(region->dstOffsets,
3003
                             &min_dst_layer, &max_dst_layer,
3004
                             &dst_mirror_z);
3005
   } else {
3006
      min_dst_layer = region->dstSubresource.baseArrayLayer;
3007
      max_dst_layer = min_dst_layer + region->dstSubresource.layerCount;
3008
   }
3009

3010
   uint32_t min_src_layer;
3011
   uint32_t max_src_layer;
3012
   bool src_mirror_z = false;
3013
   if (src->type == VK_IMAGE_TYPE_3D) {
3014
      compute_blit_3d_layers(region->srcOffsets,
3015
                             &min_src_layer, &max_src_layer,
3016
                             &src_mirror_z);
3017
   } else {
3018
      min_src_layer = region->srcSubresource.baseArrayLayer;
3019
      max_src_layer = min_src_layer + region->srcSubresource.layerCount;
3020
   }
3021

3022
   /* No Z scaling for 3D images (for non-3D images both src and dst must
3023
    * have the same layerCount).
3024
    */
3025
   if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
3026
      return false;
3027

3028
   const uint32_t layer_count = max_dst_layer - min_dst_layer;
3029
   const uint32_t src_mip_level = region->srcSubresource.mipLevel;
3030
   for (uint32_t i = 0; i < layer_count; i++) {
3031
      /* Since the TFU path doesn't handle scaling, Z mirroring for 3D images
3032
       * only involves reversing the order of the slices.
3033
       */
3034
      const uint32_t dst_layer =
3035
         dst_mirror_z ? max_dst_layer - i - 1: min_dst_layer + i;
3036
      const uint32_t src_layer =
3037
         src_mirror_z ? max_src_layer - i - 1: min_src_layer + i;
3038
      v3dv_X(cmd_buffer->device, cmd_buffer_emit_tfu_job)
3039
         (cmd_buffer, dst, dst_mip_level, dst_layer,
3040
          src, src_mip_level, src_layer,
3041
          dst_width, dst_height, format);
3042
   }
3043

3044
   return true;
3045
}
3046

3047
static bool
3048
format_needs_software_int_clamp(VkFormat format)
3049
{
3050
   switch (format) {
3051
      case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3052
      case VK_FORMAT_A2R10G10B10_SINT_PACK32:
3053
      case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3054
      case VK_FORMAT_A2B10G10R10_SINT_PACK32:
3055
         return true;
3056
      default:
3057
         return false;
3058
   };
3059
}
3060

3061
static void
3062
get_blit_pipeline_cache_key(VkFormat dst_format,
3063
                            VkFormat src_format,
3064
                            VkColorComponentFlags cmask,
3065
                            VkSampleCountFlagBits dst_samples,
3066
                            VkSampleCountFlagBits src_samples,
3067
                            uint8_t *key)
3068
{
3069
   memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
3070

3071
   uint32_t *p = (uint32_t *) key;
3072

3073
   *p = dst_format;
3074
   p++;
3075

3076
   /* Generally, when blitting from a larger format to a smaller format
3077
    * the hardware takes care of clamping the source to the RT range.
3078
    * Specifically, for integer formats, this is done by using
3079
    * V3D_RENDER_TARGET_CLAMP_INT in the render target setup, however, this
3080
    * clamps to the bit-size of the render type, and some formats, such as
3081
    * rgb10a2_uint have a 16-bit type, so it won't do what we need and we
3082
    * require to clamp in software. In these cases, we need to amend the blit
3083
    * shader with clamp code that depends on both the src and dst formats, so
3084
    * we need the src format to be part of the key.
3085
    */
3086
   *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
3087
   p++;
3088

3089
   *p = cmask;
3090
   p++;
3091

3092
   *p = (dst_samples << 8) | src_samples;
3093
   p++;
3094

3095
   assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
3096
}
3097

3098
static bool
3099
create_blit_render_pass(struct v3dv_device *device,
3100
                        VkFormat dst_format,
3101
                        VkFormat src_format,
3102
                        VkRenderPass *pass_load,
3103
                        VkRenderPass *pass_no_load)
3104
{
3105
   const bool is_color_blit = vk_format_is_color(dst_format);
3106

3107
   /* Attachment load operation is specified below */
3108
   VkAttachmentDescription att = {
3109
      .format = dst_format,
3110
      .samples = VK_SAMPLE_COUNT_1_BIT,
3111
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
3112
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3113
      .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
3114
   };
3115

3116
   VkAttachmentReference att_ref = {
3117
      .attachment = 0,
3118
      .layout = VK_IMAGE_LAYOUT_GENERAL,
3119
   };
3120

3121
   VkSubpassDescription subpass = {
3122
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
3123
      .inputAttachmentCount = 0,
3124
      .colorAttachmentCount = is_color_blit ? 1 : 0,
3125
      .pColorAttachments = is_color_blit ? &att_ref : NULL,
3126
      .pResolveAttachments = NULL,
3127
      .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
3128
      .preserveAttachmentCount = 0,
3129
      .pPreserveAttachments = NULL,
3130
   };
3131

3132
   VkRenderPassCreateInfo info = {
3133
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
3134
      .attachmentCount = 1,
3135
      .pAttachments = &att,
3136
      .subpassCount = 1,
3137
      .pSubpasses = &subpass,
3138
      .dependencyCount = 0,
3139
      .pDependencies = NULL,
3140
   };
3141

3142
   VkResult result;
3143
   att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
3144
   result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
3145
                                  &info, &device->vk.alloc, pass_load);
3146
   if (result != VK_SUCCESS)
3147
      return false;
3148

3149
   att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
3150
   result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
3151
                                  &info, &device->vk.alloc, pass_no_load);
3152
   return result == VK_SUCCESS;
3153
}
3154

3155
static nir_ssa_def *
3156
gen_rect_vertices(nir_builder *b)
3157
{
3158
   nir_ssa_def *vertex_id = nir_load_vertex_id(b);
3159

3160
   /* vertex 0: -1.0, -1.0
3161
    * vertex 1: -1.0,  1.0
3162
    * vertex 2:  1.0, -1.0
3163
    * vertex 3:  1.0,  1.0
3164
    *
3165
    * so:
3166
    *
3167
    * channel 0 is vertex_id < 2 ? -1.0 :  1.0
3168
    * channel 1 is vertex id & 1 ?  1.0 : -1.0
3169
    */
3170

3171
   nir_ssa_def *one = nir_imm_int(b, 1);
3172
   nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
3173
   nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3174

3175
   nir_ssa_def *comp[4];
3176
   comp[0] = nir_bcsel(b, c0cmp,
3177
                       nir_imm_float(b, -1.0f),
3178
                       nir_imm_float(b, 1.0f));
3179

3180
   comp[1] = nir_bcsel(b, c1cmp,
3181
                       nir_imm_float(b, 1.0f),
3182
                       nir_imm_float(b, -1.0f));
3183
   comp[2] = nir_imm_float(b, 0.0f);
3184
   comp[3] = nir_imm_float(b, 1.0f);
3185
   return nir_vec(b, comp, 4);
3186
}
3187

3188
static nir_ssa_def *
3189
gen_tex_coords(nir_builder *b)
3190
{
3191
   nir_ssa_def *tex_box =
3192
      nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
3193

3194
   nir_ssa_def *tex_z =
3195
      nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
3196

3197
   nir_ssa_def *vertex_id = nir_load_vertex_id(b);
3198

3199
   /* vertex 0: src0_x, src0_y
3200
    * vertex 1: src0_x, src1_y
3201
    * vertex 2: src1_x, src0_y
3202
    * vertex 3: src1_x, src1_y
3203
    *
3204
    * So:
3205
    *
3206
    * channel 0 is vertex_id < 2 ? src0_x : src1_x
3207
    * channel 1 is vertex id & 1 ? src1_y : src0_y
3208
    */
3209

3210
   nir_ssa_def *one = nir_imm_int(b, 1);
3211
   nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
3212
   nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3213

3214
   nir_ssa_def *comp[4];
3215
   comp[0] = nir_bcsel(b, c0cmp,
3216
                       nir_channel(b, tex_box, 0),
3217
                       nir_channel(b, tex_box, 2));
3218

3219
   comp[1] = nir_bcsel(b, c1cmp,
3220
                       nir_channel(b, tex_box, 3),
3221
                       nir_channel(b, tex_box, 1));
3222
   comp[2] = tex_z;
3223
   comp[3] = nir_imm_float(b, 1.0f);
3224
   return nir_vec(b, comp, 4);
3225
}
3226

3227
static nir_ssa_def *
3228
build_nir_tex_op_read(struct nir_builder *b,
3229
                      nir_ssa_def *tex_pos,
3230
                      enum glsl_base_type tex_type,
3231
                      enum glsl_sampler_dim dim)
3232
{
3233
   assert(dim != GLSL_SAMPLER_DIM_MS);
3234

3235
   const struct glsl_type *sampler_type =
3236
      glsl_sampler_type(dim, false, false, tex_type);
3237
   nir_variable *sampler =
3238
      nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3239
   sampler->data.descriptor_set = 0;
3240
   sampler->data.binding = 0;
3241

3242
   nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3243
   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3244
   tex->sampler_dim = dim;
3245
   tex->op = nir_texop_tex;
3246
   tex->src[0].src_type = nir_tex_src_coord;
3247
   tex->src[0].src = nir_src_for_ssa(tex_pos);
3248
   tex->src[1].src_type = nir_tex_src_texture_deref;
3249
   tex->src[1].src = nir_src_for_ssa(tex_deref);
3250
   tex->src[2].src_type = nir_tex_src_sampler_deref;
3251
   tex->src[2].src = nir_src_for_ssa(tex_deref);
3252
   tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3253
   tex->is_array = glsl_sampler_type_is_array(sampler_type);
3254
   tex->coord_components = tex_pos->num_components;
3255

3256
   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3257
   nir_builder_instr_insert(b, &tex->instr);
3258
   return &tex->dest.ssa;
3259
}
3260

3261
static nir_ssa_def *
3262
build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3263
                                 nir_variable *sampler,
3264
                                 nir_ssa_def *tex_deref,
3265
                                 enum glsl_base_type tex_type,
3266
                                 nir_ssa_def *tex_pos,
3267
                                 nir_ssa_def *sample_idx)
3268
{
3269
   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4);
3270
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3271
   tex->op = nir_texop_txf_ms;
3272
   tex->src[0].src_type = nir_tex_src_coord;
3273
   tex->src[0].src = nir_src_for_ssa(tex_pos);
3274
   tex->src[1].src_type = nir_tex_src_texture_deref;
3275
   tex->src[1].src = nir_src_for_ssa(tex_deref);
3276
   tex->src[2].src_type = nir_tex_src_sampler_deref;
3277
   tex->src[2].src = nir_src_for_ssa(tex_deref);
3278
   tex->src[3].src_type = nir_tex_src_ms_index;
3279
   tex->src[3].src = nir_src_for_ssa(sample_idx);
3280
   tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3281
   tex->is_array = false;
3282
   tex->coord_components = tex_pos->num_components;
3283

3284
   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3285
   nir_builder_instr_insert(b, &tex->instr);
3286
   return &tex->dest.ssa;
3287
}
3288

3289
/* Fetches all samples at the given position and averages them */
3290
static nir_ssa_def *
3291
build_nir_tex_op_ms_resolve(struct nir_builder *b,
3292
                            nir_ssa_def *tex_pos,
3293
                            enum glsl_base_type tex_type,
3294
                            VkSampleCountFlagBits src_samples)
3295
{
3296
   assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3297
   const struct glsl_type *sampler_type =
3298
      glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3299
   nir_variable *sampler =
3300
      nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3301
   sampler->data.descriptor_set = 0;
3302
   sampler->data.binding = 0;
3303

3304
   const bool is_int = glsl_base_type_is_integer(tex_type);
3305

3306
   nir_ssa_def *tmp = NULL;
3307
   nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3308
   for (uint32_t i = 0; i < src_samples; i++) {
3309
      nir_ssa_def *s =
3310
         build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3311
                                          tex_type, tex_pos,
3312
                                          nir_imm_int(b, i));
3313

3314
      /* For integer formats, the multisample resolve operation is expected to
3315
       * return one of the samples, we just return the first one.
3316
       */
3317
      if (is_int)
3318
         return s;
3319

3320
      tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3321
   }
3322

3323
   assert(!is_int);
3324
   return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples));
3325
}
3326

3327
/* Fetches the current sample (gl_SampleID) at the given position */
3328
static nir_ssa_def *
3329
build_nir_tex_op_ms_read(struct nir_builder *b,
3330
                         nir_ssa_def *tex_pos,
3331
                         enum glsl_base_type tex_type)
3332
{
3333
   const struct glsl_type *sampler_type =
3334
      glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3335
   nir_variable *sampler =
3336
      nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3337
   sampler->data.descriptor_set = 0;
3338
   sampler->data.binding = 0;
3339

3340
   nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3341

3342
   return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3343
                                           tex_type, tex_pos,
3344
                                           nir_load_sample_id(b));
3345
}
3346

3347
static nir_ssa_def *
3348
build_nir_tex_op(struct nir_builder *b,
3349
                 struct v3dv_device *device,
3350
                 nir_ssa_def *tex_pos,
3351
                 enum glsl_base_type tex_type,
3352
                 VkSampleCountFlagBits dst_samples,
3353
                 VkSampleCountFlagBits src_samples,
3354
                 enum glsl_sampler_dim dim)
3355
{
3356
   switch (dim) {
3357
   case GLSL_SAMPLER_DIM_MS:
3358
      assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3359
      /* For multisampled texture sources we need to use fetching instead of
3360
       * normalized texture coordinates. We already configured our blit
3361
       * coordinates to be in texel units, but here we still need to convert
3362
       * them from floating point to integer.
3363
       */
3364
      tex_pos = nir_f2i32(b, tex_pos);
3365

3366
      if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3367
         return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3368
      else
3369
         return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3370
   default:
3371
      assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3372
      return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3373
   }
3374
}
3375

3376
static nir_shader *
3377
get_blit_vs()
3378
{
3379
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3380
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
3381
                                                  "meta blit vs");
3382

3383
   const struct glsl_type *vec4 = glsl_vec4_type();
3384

3385
   nir_variable *vs_out_pos =
3386
      nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3387
   vs_out_pos->data.location = VARYING_SLOT_POS;
3388

3389
   nir_variable *vs_out_tex_coord =
3390
      nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3391
   vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3392
   vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3393

3394
   nir_ssa_def *pos = gen_rect_vertices(&b);
3395
   nir_store_var(&b, vs_out_pos, pos, 0xf);
3396

3397
   nir_ssa_def *tex_coord = gen_tex_coords(&b);
3398
   nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3399

3400
   return b.shader;
3401
}
3402

3403
static uint32_t
3404
get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3405
{
3406
   switch (sampler_dim) {
3407
   case GLSL_SAMPLER_DIM_1D: return 0x1;
3408
   case GLSL_SAMPLER_DIM_2D: return 0x3;
3409
   case GLSL_SAMPLER_DIM_MS: return 0x3;
3410
   case GLSL_SAMPLER_DIM_3D: return 0x7;
3411
   default:
3412
      unreachable("invalid sampler dim");
3413
   };
3414
}
3415

3416
static nir_shader *
3417
get_color_blit_fs(struct v3dv_device *device,
3418
                  VkFormat dst_format,
3419
                  VkFormat src_format,
3420
                  VkSampleCountFlagBits dst_samples,
3421
                  VkSampleCountFlagBits src_samples,
3422
                  enum glsl_sampler_dim sampler_dim)
3423
{
3424
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3425
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
3426
                                                  "meta blit fs");
3427

3428
   const struct glsl_type *vec4 = glsl_vec4_type();
3429

3430
   nir_variable *fs_in_tex_coord =
3431
      nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
3432
   fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;
3433

3434
   const struct glsl_type *fs_out_type =
3435
      vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
3436
      vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
3437
                                      glsl_vec4_type();
3438

3439
   enum glsl_base_type src_base_type =
3440
      vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
3441
      vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
3442
                                      GLSL_TYPE_FLOAT;
3443

3444
   nir_variable *fs_out_color =
3445
      nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
3446
   fs_out_color->data.location = FRAG_RESULT_DATA0;
3447

3448
   nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
3449
   const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
3450
   tex_coord = nir_channels(&b, tex_coord, channel_mask);
3451

3452
   nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
3453
                                         dst_samples, src_samples, sampler_dim);
3454

3455
   /* For integer textures, if the bit-size of the destination is too small to
3456
    * hold source value, Vulkan (CTS) expects the implementation to clamp to the
3457
    * maximum value the destination can hold. The hardware can clamp to the
3458
    * render target type, which usually matches the component bit-size, but
3459
    * there are some cases that won't match, such as rgb10a2, which has a 16-bit
3460
    * render target type, so in these cases we need to clamp manually.
3461
    */
3462
   if (format_needs_software_int_clamp(dst_format)) {
3463
      assert(vk_format_is_int(dst_format));
3464
      enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
3465
      enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
3466

3467
      nir_ssa_def *c[4];
3468
      for (uint32_t i = 0; i < 4; i++) {
3469
         c[i] = nir_channel(&b, color, i);
3470

3471
         const uint32_t src_bit_size =
3472
            util_format_get_component_bits(src_pformat,
3473
                                           UTIL_FORMAT_COLORSPACE_RGB,
3474
                                           i);
3475
         const uint32_t dst_bit_size =
3476
            util_format_get_component_bits(dst_pformat,
3477
                                           UTIL_FORMAT_COLORSPACE_RGB,
3478
                                           i);
3479

3480
         if (dst_bit_size >= src_bit_size)
3481
            continue;
3482

3483
         assert(dst_bit_size > 0);
3484
         if (util_format_is_pure_uint(dst_pformat)) {
3485
            nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
3486
            c[i] = nir_umin(&b, c[i], max);
3487
         } else {
3488
            nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
3489
            nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
3490
            c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
3491
         }
3492
      }
3493

3494
      color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
3495
   }
3496

3497
   nir_store_var(&b, fs_out_color, color, 0xf);
3498

3499
   return b.shader;
3500
}
3501

3502
static bool
3503
create_pipeline(struct v3dv_device *device,
3504
                struct v3dv_render_pass *pass,
3505
                struct nir_shader *vs_nir,
3506
                struct nir_shader *gs_nir,
3507
                struct nir_shader *fs_nir,
3508
                const VkPipelineVertexInputStateCreateInfo *vi_state,
3509
                const VkPipelineDepthStencilStateCreateInfo *ds_state,
3510
                const VkPipelineColorBlendStateCreateInfo *cb_state,
3511
                const VkPipelineMultisampleStateCreateInfo *ms_state,
3512
                const VkPipelineLayout layout,
3513
                VkPipeline *pipeline)
3514
{
3515
   struct vk_shader_module vs_m;
3516
   struct vk_shader_module gs_m;
3517
   struct vk_shader_module fs_m;
3518

3519
   uint32_t num_stages = gs_nir ? 3 : 2;
3520

3521
   v3dv_shader_module_internal_init(device, &vs_m, vs_nir);
3522
   v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
3523

3524
   VkPipelineShaderStageCreateInfo stages[3] = {
3525
      {
3526
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3527
         .stage = VK_SHADER_STAGE_VERTEX_BIT,
3528
         .module = vk_shader_module_to_handle(&vs_m),
3529
         .pName = "main",
3530
      },
3531
      {
3532
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3533
         .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
3534
         .module = vk_shader_module_to_handle(&fs_m),
3535
         .pName = "main",
3536
      },
3537
      {
3538
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3539
         .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
3540
         .module = VK_NULL_HANDLE,
3541
         .pName = "main",
3542
      },
3543
   };
3544

3545
   if (gs_nir) {
3546
      v3dv_shader_module_internal_init(device, &gs_m, gs_nir);
3547
      stages[2].module = vk_shader_module_to_handle(&gs_m);
3548
   }
3549

3550
   VkGraphicsPipelineCreateInfo info = {
3551
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
3552

3553
      .stageCount = num_stages,
3554
      .pStages = stages,
3555

3556
      .pVertexInputState = vi_state,
3557

3558
      .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
3559
         .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
3560
         .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
3561
         .primitiveRestartEnable = false,
3562
      },
3563

3564
      .pViewportState = &(VkPipelineViewportStateCreateInfo) {
3565
         .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
3566
         .viewportCount = 1,
3567
         .scissorCount = 1,
3568
      },
3569

3570
      .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
3571
         .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
3572
         .rasterizerDiscardEnable = false,
3573
         .polygonMode = VK_POLYGON_MODE_FILL,
3574
         .cullMode = VK_CULL_MODE_NONE,
3575
         .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
3576
         .depthBiasEnable = false,
3577
      },
3578

3579
      .pMultisampleState = ms_state,
3580

3581
      .pDepthStencilState = ds_state,
3582

3583
      .pColorBlendState = cb_state,
3584

3585
      /* The meta clear pipeline declares all state as dynamic.
3586
       * As a consequence, vkCmdBindPipeline writes no dynamic state
3587
       * to the cmd buffer. Therefore, at the end of the meta clear,
3588
       * we need only restore dynamic state that was vkCmdSet.
3589
       */
3590
      .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
3591
         .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
3592
         .dynamicStateCount = 6,
3593
         .pDynamicStates = (VkDynamicState[]) {
3594
            VK_DYNAMIC_STATE_VIEWPORT,
3595
            VK_DYNAMIC_STATE_SCISSOR,
3596
            VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
3597
            VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
3598
            VK_DYNAMIC_STATE_STENCIL_REFERENCE,
3599
            VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3600
            VK_DYNAMIC_STATE_DEPTH_BIAS,
3601
            VK_DYNAMIC_STATE_LINE_WIDTH,
3602
         },
3603
      },
3604

3605
      .flags = 0,
3606
      .layout = layout,
3607
      .renderPass = v3dv_render_pass_to_handle(pass),
3608
      .subpass = 0,
3609
   };
3610

3611
   VkResult result =
3612
      v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
3613
                                   VK_NULL_HANDLE,
3614
                                   1, &info,
3615
                                   &device->vk.alloc,
3616
                                   pipeline);
3617

3618
   ralloc_free(vs_nir);
3619
   ralloc_free(fs_nir);
3620

3621
   return result == VK_SUCCESS;
3622
}
3623

3624
static enum glsl_sampler_dim
3625
get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
3626
{
3627
   /* From the Vulkan 1.0 spec, VkImageCreateInfo Validu Usage:
3628
    *
3629
    *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
3630
    *    VK_IMAGE_TYPE_2D, ..."
3631
    */
3632
   assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);
3633

3634
   switch (type) {
3635
   case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
3636
   case VK_IMAGE_TYPE_2D:
3637
      return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
3638
                                                    GLSL_SAMPLER_DIM_MS;
3639
   case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
3640
   default:
3641
      unreachable("Invalid image type");
3642
   }
3643
}
3644

3645
static bool
3646
create_blit_pipeline(struct v3dv_device *device,
3647
                     VkFormat dst_format,
3648
                     VkFormat src_format,
3649
                     VkColorComponentFlags cmask,
3650
                     VkImageType src_type,
3651
                     VkSampleCountFlagBits dst_samples,
3652
                     VkSampleCountFlagBits src_samples,
3653
                     VkRenderPass _pass,
3654
                     VkPipelineLayout pipeline_layout,
3655
                     VkPipeline *pipeline)
3656
{
3657
   struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
3658

3659
   /* We always rewrite depth/stencil blits to compatible color blits */
3660
   assert(vk_format_is_color(dst_format));
3661
   assert(vk_format_is_color(src_format));
3662

3663
   const enum glsl_sampler_dim sampler_dim =
3664
      get_sampler_dim(src_type, src_samples);
3665

3666
   nir_shader *vs_nir = get_blit_vs();
3667
   nir_shader *fs_nir =
3668
      get_color_blit_fs(device, dst_format, src_format,
3669
                        dst_samples, src_samples, sampler_dim);
3670

3671
   const VkPipelineVertexInputStateCreateInfo vi_state = {
3672
      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
3673
      .vertexBindingDescriptionCount = 0,
3674
      .vertexAttributeDescriptionCount = 0,
3675
   };
3676

3677
   VkPipelineDepthStencilStateCreateInfo ds_state = {
3678
      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
3679
   };
3680

3681
   VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
3682
   blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
3683
      .blendEnable = false,
3684
      .colorWriteMask = cmask,
3685
   };
3686

3687
   const VkPipelineColorBlendStateCreateInfo cb_state = {
3688
      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
3689
      .logicOpEnable = false,
3690
      .attachmentCount = 1,
3691
      .pAttachments = blend_att_state
3692
   };
3693

3694
   const VkPipelineMultisampleStateCreateInfo ms_state = {
3695
      .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
3696
      .rasterizationSamples = dst_samples,
3697
      .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
3698
      .pSampleMask = NULL,
3699
      .alphaToCoverageEnable = false,
3700
      .alphaToOneEnable = false,
3701
   };
3702

3703
   return create_pipeline(device,
3704
                          pass,
3705
                          vs_nir, NULL, fs_nir,
3706
                          &vi_state,
3707
                          &ds_state,
3708
                          &cb_state,
3709
                          &ms_state,
3710
                          pipeline_layout,
3711
                          pipeline);
3712
}
3713

3714
/**
3715
 * Return a pipeline suitable for blitting the requested aspect given the
3716
 * destination and source formats.
3717
 */
3718
static bool
3719
get_blit_pipeline(struct v3dv_device *device,
3720
                  VkFormat dst_format,
3721
                  VkFormat src_format,
3722
                  VkColorComponentFlags cmask,
3723
                  VkImageType src_type,
3724
                  VkSampleCountFlagBits dst_samples,
3725
                  VkSampleCountFlagBits src_samples,
3726
                  struct v3dv_meta_blit_pipeline **pipeline)
3727
{
3728
   bool ok = true;
3729

3730
   uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
3731
   get_blit_pipeline_cache_key(dst_format, src_format, cmask,
3732
                               dst_samples, src_samples, key);
3733
   mtx_lock(&device->meta.mtx);
3734
   struct hash_entry *entry =
3735
      _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
3736
   if (entry) {
3737
      mtx_unlock(&device->meta.mtx);
3738
      *pipeline = entry->data;
3739
      return true;
3740
   }
3741

3742
   *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
3743
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
3744

3745
   if (*pipeline == NULL)
3746
      goto fail;
3747

3748
   ok = create_blit_render_pass(device, dst_format, src_format,
3749
                                &(*pipeline)->pass,
3750
                                &(*pipeline)->pass_no_load);
3751
   if (!ok)
3752
      goto fail;
3753

3754
   /* Create the pipeline using one of the render passes, they are both
3755
    * compatible, so we don't care which one we use here.
3756
    */
3757
   ok = create_blit_pipeline(device,
3758
                             dst_format,
3759
                             src_format,
3760
                             cmask,
3761
                             src_type,
3762
                             dst_samples,
3763
                             src_samples,
3764
                             (*pipeline)->pass,
3765
                             device->meta.blit.p_layout,
3766
                             &(*pipeline)->pipeline);
3767
   if (!ok)
3768
      goto fail;
3769

3770
   memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
3771
   _mesa_hash_table_insert(device->meta.blit.cache[src_type],
3772
                           &(*pipeline)->key, *pipeline);
3773

3774
   mtx_unlock(&device->meta.mtx);
3775
   return true;
3776

3777
fail:
3778
   mtx_unlock(&device->meta.mtx);
3779

3780
   VkDevice _device = v3dv_device_to_handle(device);
3781
   if (*pipeline) {
3782
      if ((*pipeline)->pass)
3783
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
3784
      if ((*pipeline)->pass_no_load)
3785
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->vk.alloc);
3786
      if ((*pipeline)->pipeline)
3787
         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
3788
      vk_free(&device->vk.alloc, *pipeline);
3789
      *pipeline = NULL;
3790
   }
3791

3792
   return false;
3793
}
3794

3795
static void
3796
compute_blit_box(const VkOffset3D *offsets,
3797
                 uint32_t image_w, uint32_t image_h,
3798
                 uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
3799
                 bool *mirror_x, bool *mirror_y)
3800
{
3801
   if (offsets[1].x >= offsets[0].x) {
3802
      *mirror_x = false;
3803
      *x = MIN2(offsets[0].x, image_w - 1);
3804
      *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
3805
   } else {
3806
      *mirror_x = true;
3807
      *x = MIN2(offsets[1].x, image_w - 1);
3808
      *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
3809
   }
3810
   if (offsets[1].y >= offsets[0].y) {
3811
      *mirror_y = false;
3812
      *y = MIN2(offsets[0].y, image_h - 1);
3813
      *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
3814
   } else {
3815
      *mirror_y = true;
3816
      *y = MIN2(offsets[1].y, image_h - 1);
3817
      *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
3818
   }
3819
}
3820

3821
static void
3822
compute_blit_3d_layers(const VkOffset3D *offsets,
3823
                       uint32_t *min_layer, uint32_t *max_layer,
3824
                       bool *mirror_z)
3825
{
3826
   if (offsets[1].z >= offsets[0].z) {
3827
      *mirror_z = false;
3828
      *min_layer = offsets[0].z;
3829
      *max_layer = offsets[1].z;
3830
   } else {
3831
      *mirror_z = true;
3832
      *min_layer = offsets[1].z;
3833
      *max_layer = offsets[0].z;
3834
   }
3835
}
3836

3837
static VkResult
3838
create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
3839
{
3840
   /* If this is not the first pool we create for this command buffer
3841
    * size it based on the size of the currently exhausted pool.
3842
    */
3843
   uint32_t descriptor_count = 64;
3844
   if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
3845
      struct v3dv_descriptor_pool *exhausted_pool =
3846
         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
3847
      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
3848
   }
3849

3850
   /* Create the descriptor pool */
3851
   cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
3852
   VkDescriptorPoolSize pool_size = {
3853
      .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
3854
      .descriptorCount = descriptor_count,
3855
   };
3856
   VkDescriptorPoolCreateInfo info = {
3857
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
3858
      .maxSets = descriptor_count,
3859
      .poolSizeCount = 1,
3860
      .pPoolSizes = &pool_size,
3861
      .flags = 0,
3862
   };
3863
   VkResult result =
3864
      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
3865
                                &info,
3866
                                &cmd_buffer->device->vk.alloc,
3867
                                &cmd_buffer->meta.blit.dspool);
3868

3869
   if (result == VK_SUCCESS) {
3870
      assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
3871
      const VkDescriptorPool _pool = cmd_buffer->meta.blit.dspool;
3872

3873
      v3dv_cmd_buffer_add_private_obj(
3874
         cmd_buffer, (uintptr_t) _pool,
3875
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
3876

3877
      struct v3dv_descriptor_pool *pool =
3878
         v3dv_descriptor_pool_from_handle(_pool);
3879
      pool->is_driver_internal = true;
3880
   }
3881

3882
   return result;
3883
}
3884

3885
static VkResult
3886
allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
3887
                                    VkDescriptorSet *set)
3888
{
3889
   /* Make sure we have a descriptor pool */
3890
   VkResult result;
3891
   if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
3892
      result = create_blit_descriptor_pool(cmd_buffer);
3893
      if (result != VK_SUCCESS)
3894
         return result;
3895
   }
3896
   assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
3897

3898
   /* Allocate descriptor set */
3899
   struct v3dv_device *device = cmd_buffer->device;
3900
   VkDevice _device = v3dv_device_to_handle(device);
3901
   VkDescriptorSetAllocateInfo info = {
3902
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
3903
      .descriptorPool = cmd_buffer->meta.blit.dspool,
3904
      .descriptorSetCount = 1,
3905
      .pSetLayouts = &device->meta.blit.ds_layout,
3906
   };
3907
   result = v3dv_AllocateDescriptorSets(_device, &info, set);
3908

3909
   /* If we ran out of pool space, grow the pool and try again */
3910
   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
3911
      result = create_blit_descriptor_pool(cmd_buffer);
3912
      if (result == VK_SUCCESS) {
3913
         info.descriptorPool = cmd_buffer->meta.blit.dspool;
3914
         result = v3dv_AllocateDescriptorSets(_device, &info, set);
3915
      }
3916
   }
3917

3918
   return result;
3919
}
3920

3921
/**
3922
 * Returns true if the implementation supports the requested operation (even if
3923
 * it failed to process it, for example, due to an out-of-memory error).
3924
 *
3925
 * The caller can specify the channels on the destination to be written via the
3926
 * cmask parameter (which can be 0 to default to all channels), as well as a
3927
 * swizzle to apply to the source via the cswizzle parameter  (which can be NULL
3928
 * to use the default identity swizzle).
3929
 */
3930
static bool
3931
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
3932
            struct v3dv_image *dst,
3933
            VkFormat dst_format,
3934
            struct v3dv_image *src,
3935
            VkFormat src_format,
3936
            VkColorComponentFlags cmask,
3937
            VkComponentMapping *cswizzle,
3938
            const VkImageBlit2KHR *_region,
3939
            VkFilter filter,
3940
            bool dst_is_padded_image)
3941
{
3942
   bool handled = true;
3943
   VkResult result;
3944
   uint32_t dirty_dynamic_state = 0;
3945

3946
   /* We don't support rendering to linear depth/stencil, this should have
3947
    * been rewritten to a compatible color blit by the caller.
3948
    */
3949
   assert(dst->tiling != VK_IMAGE_TILING_LINEAR ||
3950
          !vk_format_is_depth_or_stencil(dst_format));
3951

3952
   /* Can't sample from linear images */
3953
   if (src->tiling == VK_IMAGE_TILING_LINEAR && src->type != VK_IMAGE_TYPE_1D)
3954
      return false;
3955

3956
   VkImageBlit2KHR region = *_region;
3957
   /* Rewrite combined D/S blits to compatible color blits */
3958
   if (vk_format_is_depth_or_stencil(dst_format)) {
3959
      assert(src_format == dst_format);
3960
      assert(cmask == 0);
3961
      switch(dst_format) {
3962
      case VK_FORMAT_D16_UNORM:
3963
         dst_format = VK_FORMAT_R16_UINT;
3964
         break;
3965
      case VK_FORMAT_D32_SFLOAT:
3966
         dst_format = VK_FORMAT_R32_UINT;
3967
         break;
3968
      case VK_FORMAT_X8_D24_UNORM_PACK32:
3969
      case VK_FORMAT_D24_UNORM_S8_UINT:
3970
         if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3971
            cmask |= VK_COLOR_COMPONENT_G_BIT |
3972
                     VK_COLOR_COMPONENT_B_BIT |
3973
                     VK_COLOR_COMPONENT_A_BIT;
3974
         }
3975
         if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3976
            assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
3977
            cmask |= VK_COLOR_COMPONENT_R_BIT;
3978
         }
3979
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
3980
         break;
3981
      default:
3982
         unreachable("Unsupported depth/stencil format");
3983
      };
3984
      src_format = dst_format;
3985
      region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
3986
      region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
3987
   }
3988

3989
   const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
3990
                                            VK_COLOR_COMPONENT_G_BIT |
3991
                                            VK_COLOR_COMPONENT_B_BIT |
3992
                                            VK_COLOR_COMPONENT_A_BIT;
3993
   if (cmask == 0)
3994
      cmask = full_cmask;
3995

3996
   VkComponentMapping ident_swizzle = {
3997
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
3998
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
3999
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
4000
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
4001
   };
4002
   if (!cswizzle)
4003
      cswizzle = &ident_swizzle;
4004

4005
   /* When we get here from a copy between compressed / uncompressed images
4006
    * we choose to specify the destination blit region based on the size
4007
    * semantics of the source image of the copy (see copy_image_blit), so we
4008
    * need to apply those same semantics here when we compute the size of the
4009
    * destination image level.
4010
    */
4011
   const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format);
4012
   const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format);
4013
   const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format);
4014
   const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format);
4015
   const uint32_t dst_level_w =
4016
      u_minify(DIV_ROUND_UP(dst->extent.width * src_block_w, dst_block_w),
4017
               region.dstSubresource.mipLevel);
4018
   const uint32_t dst_level_h =
4019
      u_minify(DIV_ROUND_UP(dst->extent.height * src_block_h, dst_block_h),
4020
               region.dstSubresource.mipLevel);
4021

4022
   const uint32_t src_level_w =
4023
      u_minify(src->extent.width, region.srcSubresource.mipLevel);
4024
   const uint32_t src_level_h =
4025
      u_minify(src->extent.height, region.srcSubresource.mipLevel);
4026
   const uint32_t src_level_d =
4027
      u_minify(src->extent.depth, region.srcSubresource.mipLevel);
4028

4029
   uint32_t dst_x, dst_y, dst_w, dst_h;
4030
   bool dst_mirror_x, dst_mirror_y;
4031
   compute_blit_box(region.dstOffsets,
4032
                    dst_level_w, dst_level_h,
4033
                    &dst_x, &dst_y, &dst_w, &dst_h,
4034
                    &dst_mirror_x, &dst_mirror_y);
4035

4036
   uint32_t src_x, src_y, src_w, src_h;
4037
   bool src_mirror_x, src_mirror_y;
4038
   compute_blit_box(region.srcOffsets,
4039
                    src_level_w, src_level_h,
4040
                    &src_x, &src_y, &src_w, &src_h,
4041
                    &src_mirror_x, &src_mirror_y);
4042

4043
   uint32_t min_dst_layer;
4044
   uint32_t max_dst_layer;
4045
   bool dst_mirror_z = false;
4046
   if (dst->type != VK_IMAGE_TYPE_3D) {
4047
      min_dst_layer = region.dstSubresource.baseArrayLayer;
4048
      max_dst_layer = min_dst_layer + region.dstSubresource.layerCount;
4049
   } else {
4050
      compute_blit_3d_layers(region.dstOffsets,
4051
                             &min_dst_layer, &max_dst_layer,
4052
                             &dst_mirror_z);
4053
   }
4054

4055
   uint32_t min_src_layer;
4056
   uint32_t max_src_layer;
4057
   bool src_mirror_z = false;
4058
   if (src->type != VK_IMAGE_TYPE_3D) {
4059
      min_src_layer = region.srcSubresource.baseArrayLayer;
4060
      max_src_layer = min_src_layer + region.srcSubresource.layerCount;
4061
   } else {
4062
      compute_blit_3d_layers(region.srcOffsets,
4063
                             &min_src_layer, &max_src_layer,
4064
                             &src_mirror_z);
4065
   }
4066

4067
   uint32_t layer_count = max_dst_layer - min_dst_layer;
4068

4069
   /* Translate source blit coordinates to normalized texture coordinates for
4070
    * single sampled textures. For multisampled textures we require
4071
    * unnormalized coordinates, since we can only do texelFetch on them.
4072
    */
4073
   float coords[4] =  {
4074
      (float)src_x,
4075
      (float)src_y,
4076
      (float)(src_x + src_w),
4077
      (float)(src_y + src_h),
4078
   };
4079

4080
   if (src->samples == VK_SAMPLE_COUNT_1_BIT) {
4081
      coords[0] /= (float)src_level_w;
4082
      coords[1] /= (float)src_level_h;
4083
      coords[2] /= (float)src_level_w;
4084
      coords[3] /= (float)src_level_h;
4085
   }
4086

4087
   /* Handle mirroring */
4088
   const bool mirror_x = dst_mirror_x != src_mirror_x;
4089
   const bool mirror_y = dst_mirror_y != src_mirror_y;
4090
   const bool mirror_z = dst_mirror_z != src_mirror_z;
4091
   float tex_coords[5] = {
4092
      !mirror_x ? coords[0] : coords[2],
4093
      !mirror_y ? coords[1] : coords[3],
4094
      !mirror_x ? coords[2] : coords[0],
4095
      !mirror_y ? coords[3] : coords[1],
4096
      /* Z coordinate for 3D blit sources, to be filled for each
4097
       * destination layer
4098
       */
4099
      0.0f
4100
   };
4101

4102
   /* For blits from 3D images we also need to compute the slice coordinate to
4103
    * sample from, which will change for each layer in the destination.
4104
    * Compute the step we should increase for each iteration.
4105
    */
4106
   const float src_z_step =
4107
      (float)(max_src_layer - min_src_layer) / (float)layer_count;
4108

4109
   /* Get the blit pipeline */
4110
   struct v3dv_meta_blit_pipeline *pipeline = NULL;
4111
   bool ok = get_blit_pipeline(cmd_buffer->device,
4112
                               dst_format, src_format, cmask, src->type,
4113
                               dst->samples, src->samples,
4114
                               &pipeline);
4115
   if (!ok)
4116
      return handled;
4117
   assert(pipeline && pipeline->pipeline &&
4118
          pipeline->pass && pipeline->pass_no_load);
4119

4120
   struct v3dv_device *device = cmd_buffer->device;
4121
   assert(device->meta.blit.ds_layout);
4122

4123
   VkDevice _device = v3dv_device_to_handle(device);
4124
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
4125

4126
   /* Create sampler for blit source image */
4127
   VkSamplerCreateInfo sampler_info = {
4128
      .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
4129
      .magFilter = filter,
4130
      .minFilter = filter,
4131
      .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4132
      .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4133
      .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4134
      .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
4135
   };
4136
   VkSampler sampler;
4137
   result = v3dv_CreateSampler(_device, &sampler_info, &device->vk.alloc,
4138
                               &sampler);
4139
   if (result != VK_SUCCESS)
4140
      goto fail;
4141

4142
   v3dv_cmd_buffer_add_private_obj(
4143
      cmd_buffer, (uintptr_t)sampler,
4144
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);
4145

4146
   /* Push command buffer state before starting meta operation */
4147
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
4148

4149
   /* Push state that is common for all layers */
4150
   v3dv_CmdBindPipeline(_cmd_buffer,
4151
                        VK_PIPELINE_BIND_POINT_GRAPHICS,
4152
                        pipeline->pipeline);
4153

4154
   const VkViewport viewport = {
4155
      .x = dst_x,
4156
      .y = dst_y,
4157
      .width = dst_w,
4158
      .height = dst_h,
4159
      .minDepth = 0.0f,
4160
      .maxDepth = 1.0f
4161
   };
4162
   v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
4163

4164
   const VkRect2D scissor = {
4165
      .offset = { dst_x, dst_y },
4166
      .extent = { dst_w, dst_h }
4167
   };
4168
   v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
4169

4170
   bool can_skip_tlb_load = false;
4171
   const VkRect2D render_area = {
4172
      .offset = { dst_x, dst_y },
4173
      .extent = { dst_w, dst_h },
4174
   };
4175

4176
   /* Record per-layer commands */
4177
   VkImageAspectFlags aspects = region.dstSubresource.aspectMask;
4178
   for (uint32_t i = 0; i < layer_count; i++) {
4179
      /* Setup framebuffer */
4180
      VkImageViewCreateInfo dst_image_view_info = {
4181
         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4182
         .image = v3dv_image_to_handle(dst),
4183
         .viewType = v3dv_image_type_to_view_type(dst->type),
4184
         .format = dst_format,
4185
         .subresourceRange = {
4186
            .aspectMask = aspects,
4187
            .baseMipLevel = region.dstSubresource.mipLevel,
4188
            .levelCount = 1,
4189
            .baseArrayLayer = min_dst_layer + i,
4190
            .layerCount = 1
4191
         },
4192
      };
4193
      VkImageView dst_image_view;
4194
      result = v3dv_CreateImageView(_device, &dst_image_view_info,
4195
                                    &device->vk.alloc, &dst_image_view);
4196
      if (result != VK_SUCCESS)
4197
         goto fail;
4198

4199
      v3dv_cmd_buffer_add_private_obj(
4200
         cmd_buffer, (uintptr_t)dst_image_view,
4201
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4202

4203
      VkFramebufferCreateInfo fb_info = {
4204
         .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
4205
         .renderPass = pipeline->pass,
4206
         .attachmentCount = 1,
4207
         .pAttachments = &dst_image_view,
4208
         .width = dst_x + dst_w,
4209
         .height = dst_y + dst_h,
4210
         .layers = 1,
4211
      };
4212

4213
      VkFramebuffer fb;
4214
      result = v3dv_CreateFramebuffer(_device, &fb_info,
4215
                                      &cmd_buffer->device->vk.alloc, &fb);
4216
      if (result != VK_SUCCESS)
4217
         goto fail;
4218

4219
      struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
4220
      framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
4221
                                      fb_info.height == dst_level_h &&
4222
                                      dst_is_padded_image;
4223

4224
      v3dv_cmd_buffer_add_private_obj(
4225
         cmd_buffer, (uintptr_t)fb,
4226
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
4227

4228
      /* Setup descriptor set for blit source texture. We don't have to
4229
       * register the descriptor as a private command buffer object since
4230
       * all descriptors will be freed automatically with the descriptor
4231
       * pool.
4232
       */
4233
      VkDescriptorSet set;
4234
      result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
4235
      if (result != VK_SUCCESS)
4236
         goto fail;
4237

4238
      VkImageViewCreateInfo src_image_view_info = {
4239
         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4240
         .image = v3dv_image_to_handle(src),
4241
         .viewType = v3dv_image_type_to_view_type(src->type),
4242
         .format = src_format,
4243
         .components = *cswizzle,
4244
         .subresourceRange = {
4245
            .aspectMask = aspects,
4246
            .baseMipLevel = region.srcSubresource.mipLevel,
4247
            .levelCount = 1,
4248
            .baseArrayLayer =
4249
               src->type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
4250
            .layerCount = 1
4251
         },
4252
      };
4253
      VkImageView src_image_view;
4254
      result = v3dv_CreateImageView(_device, &src_image_view_info,
4255
                                    &device->vk.alloc, &src_image_view);
4256
      if (result != VK_SUCCESS)
4257
         goto fail;
4258

4259
      v3dv_cmd_buffer_add_private_obj(
4260
         cmd_buffer, (uintptr_t)src_image_view,
4261
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4262

4263
      VkDescriptorImageInfo image_info = {
4264
         .sampler = sampler,
4265
         .imageView = src_image_view,
4266
         .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
4267
      };
4268
      VkWriteDescriptorSet write = {
4269
         .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
4270
         .dstSet = set,
4271
         .dstBinding = 0,
4272
         .dstArrayElement = 0,
4273
         .descriptorCount = 1,
4274
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4275
         .pImageInfo = &image_info,
4276
      };
4277
      v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
4278

4279
      v3dv_CmdBindDescriptorSets(_cmd_buffer,
4280
                                 VK_PIPELINE_BIND_POINT_GRAPHICS,
4281
                                 device->meta.blit.p_layout,
4282
                                 0, 1, &set,
4283
                                 0, NULL);
4284

4285
      /* If the region we are about to blit is tile-aligned, then we can
4286
       * use the render pass version that won't pre-load the tile buffer
4287
       * with the dst image contents before the blit. The exception is when we
4288
       * don't have a full color mask, since in that case we need to preserve
4289
       * the original value of some of the color components.
4290
       *
4291
       * Since all layers have the same area, we only need to compute this for
4292
       * the first.
4293
       */
4294
      if (i == 0) {
4295
         struct v3dv_render_pass *pipeline_pass =
4296
            v3dv_render_pass_from_handle(pipeline->pass);
4297
         can_skip_tlb_load =
4298
            cmask == full_cmask &&
4299
            v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
4300
                                              framebuffer, pipeline_pass, 0);
4301
      }
4302

4303
      /* Record blit */
4304
      VkRenderPassBeginInfo rp_info = {
4305
         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
4306
         .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
4307
                                           pipeline->pass,
4308
         .framebuffer = fb,
4309
         .renderArea = render_area,
4310
         .clearValueCount = 0,
4311
      };
4312

4313
      v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
4314
      struct v3dv_job *job = cmd_buffer->state.job;
4315
      if (!job)
4316
         goto fail;
4317

4318
      /* For 3D blits we need to compute the source slice to blit from (the Z
4319
       * coordinate of the source sample operation). We want to choose this
4320
       * based on the ratio of the depth of the source and the destination
4321
       * images, picking the coordinate in the middle of each step.
4322
       */
4323
      if (src->type == VK_IMAGE_TYPE_3D) {
4324
         tex_coords[4] =
4325
            !mirror_z ?
4326
            (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
4327
            (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
4328
      }
4329

4330
      v3dv_CmdPushConstants(_cmd_buffer,
4331
                            device->meta.blit.p_layout,
4332
                            VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
4333
                            &tex_coords);
4334

4335
      v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
4336

4337
      v3dv_CmdEndRenderPass(_cmd_buffer);
4338
      dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
4339
   }
4340

4341
fail:
4342
   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
4343

4344
   return handled;
4345
}
4346

4347
VKAPI_ATTR void VKAPI_CALL
4348
v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
4349
                      const VkBlitImageInfo2KHR *pBlitImageInfo)
4350
{
4351
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4352
   V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
4353
   V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);
4354

4355
    /* This command can only happen outside a render pass */
4356
   assert(cmd_buffer->state.pass == NULL);
4357
   assert(cmd_buffer->state.job == NULL);
4358

4359
   /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
4360
   assert(dst->samples == VK_SAMPLE_COUNT_1_BIT &&
4361
          src->samples == VK_SAMPLE_COUNT_1_BIT);
4362

4363
   /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
4364
   assert(!vk_format_is_compressed(dst->vk_format));
4365

4366
   for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
4367
      if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i]))
4368
         continue;
4369
      if (blit_shader(cmd_buffer,
4370
                      dst, dst->vk_format,
4371
                      src, src->vk_format,
4372
                      0, NULL,
4373
                      &pBlitImageInfo->pRegions[i],
4374
                      pBlitImageInfo->filter, true)) {
4375
         continue;
4376
      }
4377
      unreachable("Unsupported blit operation");
4378
   }
4379
}
4380

4381
static bool
4382
resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
4383
                  struct v3dv_image *dst,
4384
                  struct v3dv_image *src,
4385
                  const VkImageResolve2KHR *region)
4386
{
4387
   if (!can_use_tlb(src, &region->srcOffset, NULL) ||
4388
       !can_use_tlb(dst, &region->dstOffset, NULL)) {
4389
      return false;
4390
   }
4391

4392
   if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format))
4393
      return false;
4394

4395
   const VkFormat fb_format = src->vk_format;
4396

4397
   uint32_t num_layers;
4398
   if (dst->type != VK_IMAGE_TYPE_3D)
4399
      num_layers = region->dstSubresource.layerCount;
4400
   else
4401
      num_layers = region->extent.depth;
4402
   assert(num_layers > 0);
4403

4404
   struct v3dv_job *job =
4405
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
4406
   if (!job)
4407
      return true;
4408

4409
   const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format);
4410
   const uint32_t block_h = vk_format_get_blockheight(dst->vk_format);
4411
   const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
4412
   const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
4413

4414
   uint32_t internal_type, internal_bpp;
4415
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
4416
      (fb_format, region->srcSubresource.aspectMask,
4417
       &internal_type, &internal_bpp);
4418

4419
   v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, true);
4420

4421
   struct framebuffer_data framebuffer;
4422
   v3dv_X(job->device, setup_framebuffer_data)(&framebuffer, fb_format, internal_type,
4423
                                               &job->frame_tiling);
4424

4425
   v3dv_X(job->device, job_emit_binning_flush)(job);
4426
   v3dv_X(job->device, job_emit_resolve_image_rcl)(job, dst, src, &framebuffer, region);
4427

4428
   v3dv_cmd_buffer_finish_job(cmd_buffer);
4429
   return true;
4430
}
4431

4432
static bool
4433
resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
4434
                   struct v3dv_image *dst,
4435
                   struct v3dv_image *src,
4436
                   const VkImageResolve2KHR *region)
4437
{
4438
   const VkImageBlit2KHR blit_region = {
4439
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR,
4440
      .srcSubresource = region->srcSubresource,
4441
      .srcOffsets = {
4442
         region->srcOffset,
4443
         {
4444
            region->srcOffset.x + region->extent.width,
4445
            region->srcOffset.y + region->extent.height,
4446
         }
4447
      },
4448
      .dstSubresource = region->dstSubresource,
4449
      .dstOffsets = {
4450
         region->dstOffset,
4451
         {
4452
            region->dstOffset.x + region->extent.width,
4453
            region->dstOffset.y + region->extent.height,
4454
         }
4455
      },
4456
   };
4457
   return blit_shader(cmd_buffer,
4458
                      dst, dst->vk_format,
4459
                      src, src->vk_format,
4460
                      0, NULL,
4461
                      &blit_region, VK_FILTER_NEAREST, true);
4462
}
4463

4464
VKAPI_ATTR void VKAPI_CALL
4465
v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
4466
                         const VkResolveImageInfo2KHR *info)
4467

4468
{
4469
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4470
   V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
4471
   V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
4472

4473
    /* This command can only happen outside a render pass */
4474
   assert(cmd_buffer->state.pass == NULL);
4475
   assert(cmd_buffer->state.job == NULL);
4476

4477
   assert(src->samples == VK_SAMPLE_COUNT_4_BIT);
4478
   assert(dst->samples == VK_SAMPLE_COUNT_1_BIT);
4479

4480
   for (uint32_t i = 0; i < info->regionCount; i++) {
4481
      if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
4482
         continue;
4483
      if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
4484
         continue;
4485
      unreachable("Unsupported multismaple resolve operation");
4486
   }
4487
}
4488

4489
Product

Resources

Company