// Source: src/amd/compiler/aco_statistics.cpp (Mesa 21.2, virgl branch)
/*1* Copyright © 2020 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_ir.h"2526#include "util/crc32.h"2728#include <algorithm>29#include <deque>30#include <set>31#include <vector>3233namespace aco {3435/* sgpr_presched/vgpr_presched */36void37collect_presched_stats(Program* program)38{39RegisterDemand presched_demand;40for (Block& block : program->blocks)41presched_demand.update(block.register_demand);42program->statistics[statistic_sgpr_presched] = presched_demand.sgpr;43program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;44}4546class BlockCycleEstimator {47public:48enum resource {49null = 0,50scalar,51branch_sendmsg,52valu,53valu_complex,54lds,55export_gds,56vmem,57resource_count,58};5960BlockCycleEstimator(Program* program_) : program(program_) {}6162Program* program;6364int32_t cur_cycle = 0;65int32_t res_available[(int)BlockCycleEstimator::resource_count] = 
{0};66unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};67int32_t reg_available[512] = {0};68std::deque<int32_t> lgkm;69std::deque<int32_t> exp;70std::deque<int32_t> vm;71std::deque<int32_t> vs;7273unsigned predict_cost(aco_ptr<Instruction>& instr);74void add(aco_ptr<Instruction>& instr);75void join(const BlockCycleEstimator& other);7677private:78unsigned get_waitcnt_cost(wait_imm imm);79unsigned get_dependency_cost(aco_ptr<Instruction>& instr);8081void use_resources(aco_ptr<Instruction>& instr);82int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);83};8485struct wait_counter_info {86wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)87: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)88{}8990unsigned vm;91unsigned exp;92unsigned lgkm;93unsigned vs;94};9596struct perf_info {97int latency;9899BlockCycleEstimator::resource rsrc0;100unsigned cost0;101102BlockCycleEstimator::resource rsrc1;103unsigned cost1;104};105106static perf_info107get_perf_info(Program* program, aco_ptr<Instruction>& instr)108{109instr_class cls = instr_info.classes[(int)instr->opcode];110111#define WAIT(res) BlockCycleEstimator::res, 0112#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt113114if (program->chip_class >= GFX10) {115/* fp64 might be incorrect */116switch (cls) {117case instr_class::valu32:118case instr_class::valu_convert32:119case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};120case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};121case instr_class::valu_quarter_rate32:122return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};123case instr_class::valu_transcendental32:124return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};125case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};126case instr_class::valu_double_add:127return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};128case instr_class::valu_double_convert:129return {22, WAIT_USE(valu, 
16), WAIT_USE(valu_complex, 16)};130case instr_class::valu_double_transcendental:131return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};132case instr_class::salu: return {2, WAIT_USE(scalar, 1)};133case instr_class::smem: return {0, WAIT_USE(scalar, 1)};134case instr_class::branch:135case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};136case instr_class::ds:137return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}138: perf_info{0, WAIT_USE(lds, 1)};139case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};140case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};141case instr_class::barrier:142case instr_class::waitcnt:143case instr_class::other:144default: return {0};145}146} else {147switch (cls) {148case instr_class::valu32: return {4, WAIT_USE(valu, 4)};149case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};150case instr_class::valu64: return {8, WAIT_USE(valu, 8)};151case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};152case instr_class::valu_fma:153return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}154: perf_info{16, WAIT_USE(valu, 16)};155case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};156case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};157case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};158case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};159case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};160case instr_class::salu: return {4, WAIT_USE(scalar, 4)};161case instr_class::smem: return {4, WAIT_USE(scalar, 4)};162case instr_class::branch:163return {8, WAIT_USE(branch_sendmsg, 8)};164return {4, WAIT_USE(branch_sendmsg, 4)};165case instr_class::ds:166return instr->ds().gds ? 
perf_info{4, WAIT_USE(export_gds, 4)}167: perf_info{4, WAIT_USE(lds, 4)};168case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};169case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};170case instr_class::barrier:171case instr_class::waitcnt:172case instr_class::other:173default: return {4};174}175}176177#undef WAIT_USE178#undef WAIT179}180181void182BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)183{184perf_info perf = get_perf_info(program, instr);185186if (perf.rsrc0 != resource_count) {187res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;188res_usage[(int)perf.rsrc0] += perf.cost0;189}190191if (perf.rsrc1 != resource_count) {192res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;193res_usage[(int)perf.rsrc1] += perf.cost1;194}195}196197int32_t198BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)199{200perf_info perf = get_perf_info(program, instr);201202int32_t cost = 0;203if (perf.rsrc0 != resource_count)204cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);205if (perf.rsrc1 != resource_count)206cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);207208return cost;209}210211static wait_counter_info212get_wait_counter_info(aco_ptr<Instruction>& instr)213{214/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance215* depends a lot on the situation. */216217if (instr->isEXP())218return wait_counter_info(0, 16, 0, 0);219220if (instr->isFlatLike()) {221unsigned lgkm = instr->isFlat() ? 20 : 0;222if (!instr->definitions.empty())223return wait_counter_info(230, 0, lgkm, 0);224else225return wait_counter_info(0, 0, lgkm, 230);226}227228if (instr->isSMEM()) {229if (instr->definitions.empty())230return wait_counter_info(0, 0, 200, 0);231if (instr->operands.empty()) /* s_memtime and s_memrealtime */232return wait_counter_info(0, 0, 1, 0);233234bool likely_desc_load = instr->operands[0].size() == 2;235bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 
3 : 4);236bool const_offset =237instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());238239if (likely_desc_load || const_offset)240return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */241242return wait_counter_info(0, 0, 200, 0);243}244245if (instr->format == Format::DS)246return wait_counter_info(0, 0, 20, 0);247248if (instr->isVMEM() && !instr->definitions.empty())249return wait_counter_info(320, 0, 0, 0);250251if (instr->isVMEM() && instr->definitions.empty())252return wait_counter_info(0, 0, 0, 320);253254return wait_counter_info(0, 0, 0, 0);255}256257static wait_imm258get_wait_imm(Program* program, aco_ptr<Instruction>& instr)259{260if (instr->opcode == aco_opcode::s_endpgm) {261return wait_imm(0, 0, 0, 0);262} else if (instr->opcode == aco_opcode::s_waitcnt) {263return wait_imm(GFX10_3, instr->sopp().imm);264} else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {265return wait_imm(0, 0, 0, instr->sopk().imm);266} else {267unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14;268unsigned max_exp_cnt = 6;269unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14;270unsigned max_vs_cnt = 62;271272wait_counter_info wait_info = get_wait_counter_info(instr);273wait_imm imm;274imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter;275imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter;276imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter;277imm.vs = wait_info.vs ? 
max_vs_cnt : wait_imm::unset_counter;278return imm;279}280}281282unsigned283BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)284{285int deps_available = cur_cycle;286287wait_imm imm = get_wait_imm(program, instr);288if (imm.vm != wait_imm::unset_counter) {289for (int i = 0; i < (int)vm.size() - imm.vm; i++)290deps_available = MAX2(deps_available, vm[i]);291}292if (imm.exp != wait_imm::unset_counter) {293for (int i = 0; i < (int)exp.size() - imm.exp; i++)294deps_available = MAX2(deps_available, exp[i]);295}296if (imm.lgkm != wait_imm::unset_counter) {297for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++)298deps_available = MAX2(deps_available, lgkm[i]);299}300if (imm.vs != wait_imm::unset_counter) {301for (int i = 0; i < (int)vs.size() - imm.vs; i++)302deps_available = MAX2(deps_available, vs[i]);303}304305if (instr->opcode == aco_opcode::s_endpgm) {306for (unsigned i = 0; i < 512; i++)307deps_available = MAX2(deps_available, reg_available[i]);308} else if (program->chip_class >= GFX10) {309for (Operand& op : instr->operands) {310if (op.isConstant() || op.isUndefined())311continue;312for (unsigned i = 0; i < op.size(); i++)313deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);314}315}316317if (program->chip_class < GFX10)318deps_available = align(deps_available, 4);319320return deps_available - cur_cycle;321}322323unsigned324BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)325{326int32_t dep = get_dependency_cost(instr);327return dep + std::max(cycles_until_res_available(instr) - dep, 0);328}329330static bool331is_vector(aco_opcode op)332{333switch (instr_info.classes[(int)op]) {334case instr_class::valu32:335case instr_class::valu_convert32:336case instr_class::valu_fma:337case instr_class::valu_double:338case instr_class::valu_double_add:339case instr_class::valu_double_convert:340case instr_class::valu_double_transcendental:341case instr_class::vmem:342case instr_class::ds:343case 
instr_class::exp:344case instr_class::valu64:345case instr_class::valu_quarter_rate32:346case instr_class::valu_transcendental32: return true;347default: return false;348}349}350351void352BlockCycleEstimator::add(aco_ptr<Instruction>& instr)353{354perf_info perf = get_perf_info(program, instr);355356cur_cycle += get_dependency_cost(instr);357358unsigned start;359bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 &&360is_vector(instr->opcode) && program->workgroup_size > 32;361for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {362cur_cycle += cycles_until_res_available(instr);363364start = cur_cycle;365use_resources(instr);366367/* GCN is in-order and doesn't begin the next instruction until the current one finishes */368cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency;369}370371wait_imm imm = get_wait_imm(program, instr);372while (lgkm.size() > imm.lgkm)373lgkm.pop_front();374while (exp.size() > imm.exp)375exp.pop_front();376while (vm.size() > imm.vm)377vm.pop_front();378while (vs.size() > imm.vs)379vs.pop_front();380381wait_counter_info wait_info = get_wait_counter_info(instr);382if (wait_info.exp)383exp.push_back(cur_cycle + wait_info.exp);384if (wait_info.lgkm)385lgkm.push_back(cur_cycle + wait_info.lgkm);386if (wait_info.vm)387vm.push_back(cur_cycle + wait_info.vm);388if (wait_info.vs)389vs.push_back(cur_cycle + wait_info.vs);390391/* This is inaccurate but shouldn't affect anything after waitcnt insertion.392* Before waitcnt insertion, this is necessary to consider memory operations.393*/394int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm);395int32_t result_available = start + MAX2(perf.latency, latency);396397for (Definition& def : instr->definitions) {398int32_t* available = ®_available[def.physReg().reg()];399for (unsigned i = 0; i < def.size(); i++)400available[i] = MAX2(available[i], result_available);401}402}403404static void405join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int 
cycle_diff)406{407for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)408queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);409for (int i = pred.size() - queue.size() - 1; i >= 0; i--)410queue.push_front(pred[i] + cycle_diff);411}412413void414BlockCycleEstimator::join(const BlockCycleEstimator& pred)415{416assert(cur_cycle == 0);417418for (unsigned i = 0; i < (unsigned)resource_count; i++) {419assert(res_usage[i] == 0);420res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);421}422423for (unsigned i = 0; i < 512; i++)424reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);425426join_queue(lgkm, pred.lgkm, -pred.cur_cycle);427join_queue(exp, pred.exp, -pred.cur_cycle);428join_queue(vm, pred.vm, -pred.cur_cycle);429join_queue(vs, pred.vs, -pred.cur_cycle);430}431432/* instructions/branches/vmem_clauses/smem_clauses/cycles */433void434collect_preasm_stats(Program* program)435{436for (Block& block : program->blocks) {437std::set<Instruction*> vmem_clause;438std::set<Instruction*> smem_clause;439440program->statistics[statistic_instructions] += block.instructions.size();441442for (aco_ptr<Instruction>& instr : block.instructions) {443if (instr->isSOPP() && instr->sopp().block != -1)444program->statistics[statistic_branches]++;445446if (instr->opcode == aco_opcode::p_constaddr)447program->statistics[statistic_instructions] += 2;448449if (instr->isVMEM() && !instr->operands.empty()) {450if (std::none_of(vmem_clause.begin(), vmem_clause.end(),451[&](Instruction* other)452{ return should_form_clause(instr.get(), other); }))453program->statistics[statistic_vmem_clauses]++;454vmem_clause.insert(instr.get());455} else {456vmem_clause.clear();457}458459if (instr->isSMEM() && !instr->operands.empty()) {460if (std::none_of(smem_clause.begin(), smem_clause.end(),461[&](Instruction* other)462{ return should_form_clause(instr.get(), other); 
}))463program->statistics[statistic_smem_clauses]++;464smem_clause.insert(instr.get());465} else {466smem_clause.clear();467}468}469}470471double latency = 0;472double usage[(int)BlockCycleEstimator::resource_count] = {0};473std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);474475for (Block& block : program->blocks) {476BlockCycleEstimator& block_est = blocks[block.index];477for (unsigned pred : block.linear_preds)478block_est.join(blocks[pred]);479480for (aco_ptr<Instruction>& instr : block.instructions) {481unsigned before = block_est.cur_cycle;482block_est.add(instr);483instr->pass_flags = block_est.cur_cycle - before;484}485486/* TODO: it would be nice to be able to consider estimated loop trip487* counts used for loop unrolling.488*/489490/* TODO: estimate the trip_count of divergent loops (those which break491* divergent) higher than of uniform loops492*/493494/* Assume loops execute 8-2 times, uniform branches are taken 50% the time,495* and any lane in the wave takes a side of a divergent branch 75% of the496* time.497*/498double iter = 1.0;499iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;500iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;501iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;502iter *= pow(0.5, block.uniform_if_depth);503iter *= pow(0.75, block.divergent_if_logical_depth);504505bool divergent_if_linear_else =506block.logical_preds.empty() && block.linear_preds.size() == 1 &&507block.linear_succs.size() == 1 &&508program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);509if (divergent_if_linear_else)510iter *= 0.25;511512latency += block_est.cur_cycle * iter;513for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)514usage[i] += block_est.res_usage[i] * iter;515}516517/* This likely exaggerates the effectiveness of parallelism because it518* ignores instruction ordering. 
It can assume there might be SALU/VALU/etc519* work to from other waves while one is idle but that might not be the case520* because those other waves have not reached such a point yet.521*/522523double parallelism = program->num_waves;524for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {525if (usage[i] > 0.0)526parallelism = MIN2(parallelism, latency / usage[i]);527}528double waves_per_cycle = 1.0 / latency * parallelism;529double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);530531double max_utilization = 1.0;532if (program->workgroup_size != UINT_MAX)533max_utilization =534program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);535wave64_per_cycle *= max_utilization;536537program->statistics[statistic_latency] = round(latency);538program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);539540if (debug_flags & DEBUG_PERF_INFO) {541aco_print_program(program, stderr, print_no_ssa | print_perf_info);542543fprintf(stderr, "num_waves: %u\n", program->num_waves);544fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);545fprintf(stderr, "branch_sendmsg_usage: %f\n",546usage[(int)BlockCycleEstimator::branch_sendmsg]);547fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);548fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);549fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);550fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);551fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);552fprintf(stderr, "latency: %f\n", latency);553fprintf(stderr, "parallelism: %f\n", parallelism);554fprintf(stderr, "max_utilization: %f\n", max_utilization);555fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);556fprintf(stderr, "\n");557}558}559560void561collect_postasm_stats(Program* program, const 
std::vector<uint32_t>& code)562{563program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);564}565566} // namespace aco567568569