Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/dnn/src/opencl/softmax_loss.cl
16337 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2017, Intel Corporation, all rights reserved.
14
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
// * Redistribution's of source code must retain the above copyright notice,
21
// this list of conditions and the following disclaimer.
22
//
23
// * Redistribution's in binary form must reproduce the above copyright notice,
24
// this list of conditions and the following disclaimer in the documentation
25
// and/or other materials provided with the distribution.
26
//
27
// * The name of the copyright holders may not be used to endorse or promote products
28
// derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42
43
#define CONCAT(A,B) A##_##B
44
#define TEMPLATE(name,type) CONCAT(name,type)
45
46
#if defined(cl_intel_subgroups)
47
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
48
#endif
49
50
#if defined(cl_khr_fp16)
51
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
52
#endif
53
54
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
55
const int spatial_dim,
56
__global Dtype* scale,
57
__global const Dtype* data,
58
__global Dtype* out,
59
__local Dtype *out_tmp,
60
__local Dtype *scale_tmp,
61
__local Dtype *group_tmp) {
62
63
int n = get_global_id(1);
64
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
65
get_global_size(0), ++s) {
66
Dtype maxval = -DTYPE_MAX;
67
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
68
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
69
maxval = max((Dtype)tmp, (Dtype)maxval);
70
}
71
maxval = sub_group_reduce_max(maxval);
72
//if (get_sub_group_local_id() == 0)
73
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
74
}
75
76
barrier(CLK_LOCAL_MEM_FENCE);
77
78
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
79
get_global_size(0)) {
80
int s = index / get_max_sub_group_size();
81
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
82
//if (get_sub_group_local_id() == 0)
83
scale_tmp[s] = maxval;
84
}
85
86
barrier(CLK_LOCAL_MEM_FENCE);
87
88
for (int index = get_global_id(0); index < channels * spatial_dim;
89
index += get_global_size(0)) {
90
int s = index % spatial_dim;
91
out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);
92
}
93
barrier(CLK_LOCAL_MEM_FENCE);
94
95
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
96
get_global_size(0), ++s) {
97
Dtype sum = 0;
98
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
99
sum += out_tmp[c * spatial_dim + s];
100
}
101
sum = sub_group_reduce_add(sum);
102
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
103
}
104
barrier(CLK_LOCAL_MEM_FENCE);
105
106
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
107
get_global_size(0)) {
108
int s = index / get_max_sub_group_size();
109
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
110
//if (get_sub_group_local_id() == 0)
111
scale_tmp[s] = sum;
112
}
113
barrier(CLK_LOCAL_MEM_FENCE);
114
115
for (int index = get_global_id(0); index < channels * spatial_dim;
116
index += get_global_size(0)) {
117
int s = index % spatial_dim;
118
Dtype v = out_tmp[index] / scale_tmp[s];
119
#ifdef LOG_SOFTMAX
120
v = log(v);
121
#endif
122
out[n * channels * spatial_dim + index] = v;
123
}
124
}
125
126
__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
127
const int spatial_dim,
128
__global Dtype* scale,
129
__global const Dtype* data,
130
__global Dtype* out) {
131
132
int n = get_global_id(1);
133
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
134
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
135
get_global_size(0), ++s) {
136
Dtype maxval = -DTYPE_MAX;
137
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
138
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
139
maxval = max((Dtype)tmp, (Dtype)maxval);
140
}
141
maxval = sub_group_reduce_max(maxval);
142
//if (get_sub_group_local_id() == 0)
143
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
144
}
145
barrier(CLK_GLOBAL_MEM_FENCE);
146
147
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
148
get_global_size(0)) {
149
int s = index / get_max_sub_group_size();
150
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
151
//if (get_sub_group_local_id() == 0)
152
scale[n * spatial_dim + s] = maxval;
153
}
154
155
barrier(CLK_GLOBAL_MEM_FENCE);
156
157
for (int index = get_global_id(0); index < channels * spatial_dim;
158
index += get_global_size(0)) {
159
int s = index % spatial_dim;
160
out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);
161
}
162
barrier(CLK_GLOBAL_MEM_FENCE);
163
164
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
165
get_global_size(0), ++s) {
166
Dtype sum = 0;
167
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
168
sum += out[n * channels * spatial_dim + c * spatial_dim + s];
169
}
170
sum = sub_group_reduce_add(sum);
171
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
172
}
173
barrier(CLK_GLOBAL_MEM_FENCE);
174
175
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
176
get_global_size(0)) {
177
int s = index / get_max_sub_group_size();
178
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
179
//if (get_sub_group_local_id() == 0)
180
scale[n * spatial_dim + s] = sum;
181
}
182
barrier(CLK_GLOBAL_MEM_FENCE);
183
184
for (int index = get_global_id(0); index < channels * spatial_dim;
185
index += get_global_size(0)) {
186
int s = index % spatial_dim;
187
Dtype v = out[n * channels * spatial_dim + index] / scale[n * spatial_dim + s];
188
#ifdef LOG_SOFTMAX
189
v = log(v);
190
#endif
191
out[n * channels * spatial_dim + index] = v;
192
}
193
}
194
195