Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
9913 views
1
// Copyright 2014 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// MIPS version of dsp functions
11
//
12
// Author(s): Djordje Pesut ([email protected])
13
// Jovan Zelincevic ([email protected])
14
15
#include "src/dsp/dsp.h"
16
17
#if defined(WEBP_USE_MIPS_DSP_R2)
18
19
#include "src/dsp/mips_macro.h"
20
21
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
22
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
23
24
// DC-only inverse transform.
//
// Reads in[0], forms (in[0] + 4) >> 3 replicated in both halfwords of a
// register, and combines that value with the four dst rows (row stride BPS)
// through the load/convert/store helper macros.
// NOTE(review): the helper macros come from mips_macro.h (not visible here);
// going by their names the result is a saturated sum written back to dst.
//
//   in  - coefficient block; only in[0] is read.
//   dst - destination pixels, read and written in place.
static void TransformDC(const int16_t* WEBP_RESTRICT in,
                        uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

  __asm__ volatile (
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    // temp5 = { (in[0] + 4) >> 3, (in[0] + 4) >> 3 } as two packed halfwords.
    "lh %[temp5], 0(%[in]) \n\t"
    "addiu %[temp5], %[temp5], 4 \n\t"
    "ins %[temp5], %[temp5], 16, 16 \n\t"
    "shra.ph %[temp5], %[temp5], 3 \n\t"
    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
                            temp3, temp1, temp2, temp3, temp4)
    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_10()
    : [in]"r"(in), [dst]"r"(dst)
    : "memory"
  );
}
48
49
// Inverse transform for blocks where only in[0], in[1] and in[4] are used.
//
// The scalar terms are prepared in C (a = in[0] + 4 and the MUL1/MUL2 products
// of in[1] and in[4]); the assembly then replicates/packs them into halfword
// pairs and combines them with the four dst rows (row stride BPS).
// NOTE(review): the helper macros are defined in mips_macro.h (not visible
// here); their exact arithmetic should be confirmed there.
//
//   in  - coefficient block; in[0], in[1] and in[4] are read.
//   dst - destination pixels, read and written in place.
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
  int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);  // modified by the asm ("+&r")
  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    // Pack d4 into the upper halfword of c4.
    "ins %[c4], %[d4], 16, 16 \n\t"
    "replv.ph %[temp1], %[a] \n\t"
    "replv.ph %[temp4], %[d1] \n\t"
    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
    "replv.ph %[temp5], %[c1] \n\t"
    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
                            temp11, temp17, temp3, temp5, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
                          temp4, temp7, temp6, temp10, temp9)
    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
                     temp7, temp6, dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18(),
      [c4]"+&r"(c4)
    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
    : "memory"
  );
}
85
86
// Full inverse transform of one coefficient block, added into dst.
//
// Coefficients are fetched from 'in' with unaligned word loads and the
// LOAD_IN_X2 helper, processed in two passes (the second one is marked
// "horizontal" below and adds the rounding constant 4 via repl.ph), and the
// result is combined with four dst rows (row stride BPS).
// kC1 / kC2 are supplied as input operands for the multiply helpers, and the
// hi/lo multiplier registers are declared clobbered.
// NOTE(review): the helper macros are defined in mips_macro.h (not visible
// here); their exact arithmetic should be confirmed there.
//
//   in  - coefficient block (byte offsets 0..31 are referenced).
//   dst - destination pixels, read and written in place.
static void TransformOne(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    "ulw %[temp1], 0(%[in]) \n\t"
    "ulw %[temp2], 16(%[in]) \n\t"
    LOAD_IN_X2(temp5, temp6, 24, 26)
    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    LOAD_IN_X2(temp1, temp2, 8, 10)
    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
                  temp13, temp11, temp14, temp12)
    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    "ulw %[temp17], 4(%[in]) \n\t"
    "ulw %[temp18], 20(%[in]) \n\t"
    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    LOAD_IN_X2(temp17, temp18, 12, 14)
    LOAD_IN_X2(temp9, temp10, 28, 30)
    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
                  temp15, temp4, temp16, temp17)
    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)

    // horizontal
    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    "repl.ph %[temp2], 0x4 \n\t"
    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
    "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
                  temp6, temp17, temp8, temp18)
    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
                  temp18, temp12, temp17, temp16)
    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
                   temp6)
    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                          temp16, temp11, temp10, temp15, temp14)
    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                            temp11, temp10, temp11, temp14, temp15)
    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18()
    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
    : "memory", "hi", "lo"
  );
}
153
154
// Applies TransformOne() to the block at (in, dst) and, when 'do_two' is
// non-zero, to a second block located at (in + 16, dst + 4).
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}
161
162
static WEBP_INLINE void FilterLoop26(uint8_t* p,
163
int hstride, int vstride, int size,
164
int thresh, int ithresh, int hev_thresh) {
165
const int thresh2 = 2 * thresh + 1;
166
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
167
int temp10, temp11, temp12, temp13, temp14, temp15;
168
169
__asm__ volatile (
170
".set push \n\t"
171
".set noreorder \n\t"
172
"1: \n\t"
173
"negu %[temp1], %[hstride] \n\t"
174
"addiu %[size], %[size], -1 \n\t"
175
"sll %[temp2], %[hstride], 1 \n\t"
176
"sll %[temp3], %[temp1], 1 \n\t"
177
"addu %[temp4], %[temp2], %[hstride] \n\t"
178
"addu %[temp5], %[temp3], %[temp1] \n\t"
179
"lbu %[temp7], 0(%[p]) \n\t"
180
"sll %[temp6], %[temp3], 1 \n\t"
181
"lbux %[temp8], %[temp5](%[p]) \n\t"
182
"lbux %[temp9], %[temp3](%[p]) \n\t"
183
"lbux %[temp10], %[temp1](%[p]) \n\t"
184
"lbux %[temp11], %[temp6](%[p]) \n\t"
185
"lbux %[temp12], %[hstride](%[p]) \n\t"
186
"lbux %[temp13], %[temp2](%[p]) \n\t"
187
"lbux %[temp14], %[temp4](%[p]) \n\t"
188
"subu %[temp1], %[temp10], %[temp7] \n\t"
189
"subu %[temp2], %[temp9], %[temp12] \n\t"
190
"absq_s.w %[temp3], %[temp1] \n\t"
191
"absq_s.w %[temp4], %[temp2] \n\t"
192
"negu %[temp1], %[temp1] \n\t"
193
"sll %[temp3], %[temp3], 2 \n\t"
194
"addu %[temp15], %[temp3], %[temp4] \n\t"
195
"subu %[temp3], %[temp15], %[thresh2] \n\t"
196
"sll %[temp6], %[temp1], 1 \n\t"
197
"bgtz %[temp3], 3f \n\t"
198
" subu %[temp4], %[temp11], %[temp8] \n\t"
199
"absq_s.w %[temp4], %[temp4] \n\t"
200
"shll_s.w %[temp2], %[temp2], 24 \n\t"
201
"subu %[temp4], %[temp4], %[ithresh] \n\t"
202
"bgtz %[temp4], 3f \n\t"
203
" subu %[temp3], %[temp8], %[temp9] \n\t"
204
"absq_s.w %[temp3], %[temp3] \n\t"
205
"subu %[temp3], %[temp3], %[ithresh] \n\t"
206
"bgtz %[temp3], 3f \n\t"
207
" subu %[temp5], %[temp9], %[temp10] \n\t"
208
"absq_s.w %[temp3], %[temp5] \n\t"
209
"absq_s.w %[temp5], %[temp5] \n\t"
210
"subu %[temp3], %[temp3], %[ithresh] \n\t"
211
"bgtz %[temp3], 3f \n\t"
212
" subu %[temp3], %[temp14], %[temp13] \n\t"
213
"absq_s.w %[temp3], %[temp3] \n\t"
214
"slt %[temp5], %[hev_thresh], %[temp5] \n\t"
215
"subu %[temp3], %[temp3], %[ithresh] \n\t"
216
"bgtz %[temp3], 3f \n\t"
217
" subu %[temp3], %[temp13], %[temp12] \n\t"
218
"absq_s.w %[temp3], %[temp3] \n\t"
219
"sra %[temp4], %[temp2], 24 \n\t"
220
"subu %[temp3], %[temp3], %[ithresh] \n\t"
221
"bgtz %[temp3], 3f \n\t"
222
" subu %[temp15], %[temp12], %[temp7] \n\t"
223
"absq_s.w %[temp3], %[temp15] \n\t"
224
"absq_s.w %[temp15], %[temp15] \n\t"
225
"subu %[temp3], %[temp3], %[ithresh] \n\t"
226
"bgtz %[temp3], 3f \n\t"
227
" slt %[temp15], %[hev_thresh], %[temp15] \n\t"
228
"addu %[temp3], %[temp6], %[temp1] \n\t"
229
"or %[temp2], %[temp5], %[temp15] \n\t"
230
"addu %[temp5], %[temp4], %[temp3] \n\t"
231
"beqz %[temp2], 4f \n\t"
232
" shra_r.w %[temp1], %[temp5], 3 \n\t"
233
"addiu %[temp2], %[temp5], 3 \n\t"
234
"sra %[temp2], %[temp2], 3 \n\t"
235
"shll_s.w %[temp1], %[temp1], 27 \n\t"
236
"shll_s.w %[temp2], %[temp2], 27 \n\t"
237
"subu %[temp3], %[p], %[hstride] \n\t"
238
"sra %[temp1], %[temp1], 27 \n\t"
239
"sra %[temp2], %[temp2], 27 \n\t"
240
"subu %[temp1], %[temp7], %[temp1] \n\t"
241
"addu %[temp2], %[temp10], %[temp2] \n\t"
242
"lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
243
"lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
244
"sb %[temp2], 0(%[temp3]) \n\t"
245
"j 3f \n\t"
246
" sb %[temp1], 0(%[p]) \n\t"
247
"4: \n\t"
248
"shll_s.w %[temp5], %[temp5], 24 \n\t"
249
"subu %[temp14], %[p], %[hstride] \n\t"
250
"subu %[temp11], %[temp14], %[hstride] \n\t"
251
"sra %[temp6], %[temp5], 24 \n\t"
252
"sll %[temp1], %[temp6], 3 \n\t"
253
"subu %[temp15], %[temp11], %[hstride] \n\t"
254
"addu %[temp2], %[temp6], %[temp1] \n\t"
255
"sll %[temp3], %[temp2], 1 \n\t"
256
"addu %[temp4], %[temp3], %[temp2] \n\t"
257
"addiu %[temp2], %[temp2], 63 \n\t"
258
"addiu %[temp3], %[temp3], 63 \n\t"
259
"addiu %[temp4], %[temp4], 63 \n\t"
260
"sra %[temp2], %[temp2], 7 \n\t"
261
"sra %[temp3], %[temp3], 7 \n\t"
262
"sra %[temp4], %[temp4], 7 \n\t"
263
"addu %[temp1], %[temp8], %[temp2] \n\t"
264
"addu %[temp5], %[temp9], %[temp3] \n\t"
265
"addu %[temp6], %[temp10], %[temp4] \n\t"
266
"subu %[temp8], %[temp7], %[temp4] \n\t"
267
"subu %[temp7], %[temp12], %[temp3] \n\t"
268
"addu %[temp10], %[p], %[hstride] \n\t"
269
"subu %[temp9], %[temp13], %[temp2] \n\t"
270
"addu %[temp12], %[temp10], %[hstride] \n\t"
271
"lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
272
"lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
273
"lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
274
"lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
275
"lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
276
"lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
277
"sb %[temp2], 0(%[temp15]) \n\t"
278
"sb %[temp3], 0(%[temp11]) \n\t"
279
"sb %[temp4], 0(%[temp14]) \n\t"
280
"sb %[temp5], 0(%[p]) \n\t"
281
"sb %[temp6], 0(%[temp10]) \n\t"
282
"sb %[temp8], 0(%[temp12]) \n\t"
283
"3: \n\t"
284
"bgtz %[size], 1b \n\t"
285
" addu %[p], %[p], %[vstride] \n\t"
286
".set pop \n\t"
287
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
288
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
289
[temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
290
[temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
291
[temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
292
[size]"+&r"(size), [p]"+&r"(p)
293
: [hstride]"r"(hstride), [thresh2]"r"(thresh2),
294
[ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
295
[VP8kclip1]"r"(VP8kclip1)
296
: "memory"
297
);
298
}
299
300
static WEBP_INLINE void FilterLoop24(uint8_t* p,
301
int hstride, int vstride, int size,
302
int thresh, int ithresh, int hev_thresh) {
303
int p0, q0, p1, q1, p2, q2, p3, q3;
304
int step1, step2, temp1, temp2, temp3, temp4;
305
uint8_t* pTemp0;
306
uint8_t* pTemp1;
307
const int thresh2 = 2 * thresh + 1;
308
309
__asm__ volatile (
310
".set push \n\t"
311
".set noreorder \n\t"
312
"bltz %[size], 3f \n\t"
313
" nop \n\t"
314
"2: \n\t"
315
"negu %[step1], %[hstride] \n\t"
316
"lbu %[q0], 0(%[p]) \n\t"
317
"lbux %[p0], %[step1](%[p]) \n\t"
318
"subu %[step1], %[step1], %[hstride] \n\t"
319
"lbux %[q1], %[hstride](%[p]) \n\t"
320
"subu %[temp1], %[p0], %[q0] \n\t"
321
"lbux %[p1], %[step1](%[p]) \n\t"
322
"addu %[step2], %[hstride], %[hstride] \n\t"
323
"absq_s.w %[temp2], %[temp1] \n\t"
324
"subu %[temp3], %[p1], %[q1] \n\t"
325
"absq_s.w %[temp4], %[temp3] \n\t"
326
"sll %[temp2], %[temp2], 2 \n\t"
327
"addu %[temp2], %[temp2], %[temp4] \n\t"
328
"subu %[temp4], %[temp2], %[thresh2] \n\t"
329
"subu %[step1], %[step1], %[hstride] \n\t"
330
"bgtz %[temp4], 0f \n\t"
331
" lbux %[p2], %[step1](%[p]) \n\t"
332
"subu %[step1], %[step1], %[hstride] \n\t"
333
"lbux %[q2], %[step2](%[p]) \n\t"
334
"lbux %[p3], %[step1](%[p]) \n\t"
335
"subu %[temp4], %[p2], %[p1] \n\t"
336
"addu %[step2], %[step2], %[hstride] \n\t"
337
"subu %[temp2], %[p3], %[p2] \n\t"
338
"absq_s.w %[temp4], %[temp4] \n\t"
339
"absq_s.w %[temp2], %[temp2] \n\t"
340
"lbux %[q3], %[step2](%[p]) \n\t"
341
"subu %[temp4], %[temp4], %[ithresh] \n\t"
342
"negu %[temp1], %[temp1] \n\t"
343
"bgtz %[temp4], 0f \n\t"
344
" subu %[temp2], %[temp2], %[ithresh] \n\t"
345
"subu %[p3], %[p1], %[p0] \n\t"
346
"bgtz %[temp2], 0f \n\t"
347
" absq_s.w %[p3], %[p3] \n\t"
348
"subu %[temp4], %[q3], %[q2] \n\t"
349
"subu %[pTemp0], %[p], %[hstride] \n\t"
350
"absq_s.w %[temp4], %[temp4] \n\t"
351
"subu %[temp2], %[p3], %[ithresh] \n\t"
352
"sll %[step1], %[temp1], 1 \n\t"
353
"bgtz %[temp2], 0f \n\t"
354
" subu %[temp4], %[temp4], %[ithresh] \n\t"
355
"subu %[temp2], %[q2], %[q1] \n\t"
356
"bgtz %[temp4], 0f \n\t"
357
" absq_s.w %[temp2], %[temp2] \n\t"
358
"subu %[q3], %[q1], %[q0] \n\t"
359
"absq_s.w %[q3], %[q3] \n\t"
360
"subu %[temp2], %[temp2], %[ithresh] \n\t"
361
"addu %[temp1], %[temp1], %[step1] \n\t"
362
"bgtz %[temp2], 0f \n\t"
363
" subu %[temp4], %[q3], %[ithresh] \n\t"
364
"slt %[p3], %[hev_thresh], %[p3] \n\t"
365
"bgtz %[temp4], 0f \n\t"
366
" slt %[q3], %[hev_thresh], %[q3] \n\t"
367
"or %[q3], %[q3], %[p3] \n\t"
368
"bgtz %[q3], 1f \n\t"
369
" shra_r.w %[temp2], %[temp1], 3 \n\t"
370
"addiu %[temp1], %[temp1], 3 \n\t"
371
"sra %[temp1], %[temp1], 3 \n\t"
372
"shll_s.w %[temp2], %[temp2], 27 \n\t"
373
"shll_s.w %[temp1], %[temp1], 27 \n\t"
374
"addu %[pTemp1], %[p], %[hstride] \n\t"
375
"sra %[temp2], %[temp2], 27 \n\t"
376
"sra %[temp1], %[temp1], 27 \n\t"
377
"addiu %[step1], %[temp2], 1 \n\t"
378
"sra %[step1], %[step1], 1 \n\t"
379
"addu %[p0], %[p0], %[temp1] \n\t"
380
"addu %[p1], %[p1], %[step1] \n\t"
381
"subu %[q0], %[q0], %[temp2] \n\t"
382
"subu %[q1], %[q1], %[step1] \n\t"
383
"lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
384
"lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
385
"lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
386
"sb %[temp2], 0(%[pTemp0]) \n\t"
387
"lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
388
"subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
389
"sb %[temp3], 0(%[p]) \n\t"
390
"sb %[temp4], 0(%[pTemp1]) \n\t"
391
"j 0f \n\t"
392
" sb %[temp1], 0(%[pTemp0]) \n\t"
393
"1: \n\t"
394
"shll_s.w %[temp3], %[temp3], 24 \n\t"
395
"sra %[temp3], %[temp3], 24 \n\t"
396
"addu %[temp1], %[temp1], %[temp3] \n\t"
397
"shra_r.w %[temp2], %[temp1], 3 \n\t"
398
"addiu %[temp1], %[temp1], 3 \n\t"
399
"shll_s.w %[temp2], %[temp2], 27 \n\t"
400
"sra %[temp1], %[temp1], 3 \n\t"
401
"shll_s.w %[temp1], %[temp1], 27 \n\t"
402
"sra %[temp2], %[temp2], 27 \n\t"
403
"sra %[temp1], %[temp1], 27 \n\t"
404
"addu %[p0], %[p0], %[temp1] \n\t"
405
"subu %[q0], %[q0], %[temp2] \n\t"
406
"lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
407
"lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
408
"sb %[temp2], 0(%[p]) \n\t"
409
"sb %[temp1], 0(%[pTemp0]) \n\t"
410
"0: \n\t"
411
"subu %[size], %[size], 1 \n\t"
412
"bgtz %[size], 2b \n\t"
413
" addu %[p], %[p], %[vstride] \n\t"
414
"3: \n\t"
415
".set pop \n\t"
416
: [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
417
[p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
418
[step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
419
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
420
[pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
421
[size]"+&r"(size)
422
: [vstride]"r"(vstride), [ithresh]"r"(ithresh),
423
[hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
424
[VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
425
: "memory"
426
);
427
}
428
429
// on macroblock edges

// Runs FilterLoop26 over 16 consecutive bytes starting at p (step 1 along the
// edge), with 'stride' as the distance between the samples being filtered.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
434
435
// Runs FilterLoop26 over 16 positions spaced 'stride' bytes apart starting at
// p, filtering adjacent bytes (sample distance 1) at each position.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
439
440
// 8-pixels wide variant, for chroma filtering
441
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
442
int stride, int thresh, int ithresh, int hev_thresh) {
443
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
444
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
445
}
446
447
// Applies FilterLoop26 to 8 positions spaced 'stride' bytes apart in the u
// plane and then in the v plane, filtering adjacent bytes at each position.
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
452
453
// on three inner edges

// Applies FilterLoop24 (16 consecutive bytes each) at p + 4*stride,
// p + 8*stride and p + 12*stride.
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;  // move down to the next inner edge
    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
}
462
463
// Applies FilterLoop24 (16 positions spaced 'stride' bytes apart each) at
// p + 4, p + 8 and p + 12.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;  // move right to the next inner edge
    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
471
472
// Applies FilterLoop24 to 8 consecutive bytes at u + 4*stride and then at
// v + 4*stride.
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
477
478
// Applies FilterLoop24 to 8 positions spaced 'stride' bytes apart, starting at
// u + 4 and then at v + 4.
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
483
484
//------------------------------------------------------------------------------
485
// Simple In-loop filtering (Paragraph 15.2)
486
487
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
488
int i;
489
const int thresh2 = 2 * thresh + 1;
490
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
491
uint8_t* p1 = p - stride;
492
__asm__ volatile (
493
".set push \n\t"
494
".set noreorder \n\t"
495
"li %[i], 16 \n\t"
496
"0: \n\t"
497
"negu %[temp4], %[stride] \n\t"
498
"sll %[temp5], %[temp4], 1 \n\t"
499
"lbu %[temp2], 0(%[p]) \n\t"
500
"lbux %[temp3], %[stride](%[p]) \n\t"
501
"lbux %[temp1], %[temp4](%[p]) \n\t"
502
"lbux %[temp0], %[temp5](%[p]) \n\t"
503
"subu %[temp7], %[temp1], %[temp2] \n\t"
504
"subu %[temp6], %[temp0], %[temp3] \n\t"
505
"absq_s.w %[temp4], %[temp7] \n\t"
506
"absq_s.w %[temp5], %[temp6] \n\t"
507
"sll %[temp4], %[temp4], 2 \n\t"
508
"subu %[temp5], %[temp5], %[thresh2] \n\t"
509
"addu %[temp5], %[temp4], %[temp5] \n\t"
510
"negu %[temp8], %[temp7] \n\t"
511
"bgtz %[temp5], 1f \n\t"
512
" addiu %[i], %[i], -1 \n\t"
513
"sll %[temp4], %[temp8], 1 \n\t"
514
"shll_s.w %[temp5], %[temp6], 24 \n\t"
515
"addu %[temp3], %[temp4], %[temp8] \n\t"
516
"sra %[temp5], %[temp5], 24 \n\t"
517
"addu %[temp3], %[temp3], %[temp5] \n\t"
518
"addiu %[temp7], %[temp3], 3 \n\t"
519
"sra %[temp7], %[temp7], 3 \n\t"
520
"shra_r.w %[temp8], %[temp3], 3 \n\t"
521
"shll_s.w %[temp0], %[temp7], 27 \n\t"
522
"shll_s.w %[temp4], %[temp8], 27 \n\t"
523
"sra %[temp0], %[temp0], 27 \n\t"
524
"sra %[temp4], %[temp4], 27 \n\t"
525
"addu %[temp7], %[temp1], %[temp0] \n\t"
526
"subu %[temp2], %[temp2], %[temp4] \n\t"
527
"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
528
"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
529
"sb %[temp3], 0(%[p1]) \n\t"
530
"sb %[temp4], 0(%[p]) \n\t"
531
"1: \n\t"
532
"addiu %[p1], %[p1], 1 \n\t"
533
"bgtz %[i], 0b \n\t"
534
" addiu %[p], %[p], 1 \n\t"
535
" .set pop \n\t"
536
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
537
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
538
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
539
[p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
540
: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
541
: "memory"
542
);
543
}
544
545
// Emits four "lbu" byte loads:
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
// TEMPn and SRC are asm operand names; the offsets are pasted into the
// instruction text, so they must be integer literals.
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
  "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"          \
  "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"          \
  "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"          \
  "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
555
556
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
557
int i;
558
const int thresh2 = 2 * thresh + 1;
559
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
560
__asm__ volatile (
561
".set push \n\t"
562
".set noreorder \n\t"
563
"li %[i], 16 \n\t"
564
"0: \n\t"
565
LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
566
"subu %[temp7], %[temp1], %[temp2] \n\t"
567
"subu %[temp6], %[temp0], %[temp3] \n\t"
568
"absq_s.w %[temp4], %[temp7] \n\t"
569
"absq_s.w %[temp5], %[temp6] \n\t"
570
"sll %[temp4], %[temp4], 2 \n\t"
571
"addu %[temp5], %[temp4], %[temp5] \n\t"
572
"subu %[temp5], %[temp5], %[thresh2] \n\t"
573
"negu %[temp8], %[temp7] \n\t"
574
"bgtz %[temp5], 1f \n\t"
575
" addiu %[i], %[i], -1 \n\t"
576
"sll %[temp4], %[temp8], 1 \n\t"
577
"shll_s.w %[temp5], %[temp6], 24 \n\t"
578
"addu %[temp3], %[temp4], %[temp8] \n\t"
579
"sra %[temp5], %[temp5], 24 \n\t"
580
"addu %[temp3], %[temp3], %[temp5] \n\t"
581
"addiu %[temp7], %[temp3], 3 \n\t"
582
"sra %[temp7], %[temp7], 3 \n\t"
583
"shra_r.w %[temp8], %[temp3], 3 \n\t"
584
"shll_s.w %[temp0], %[temp7], 27 \n\t"
585
"shll_s.w %[temp4], %[temp8], 27 \n\t"
586
"sra %[temp0], %[temp0], 27 \n\t"
587
"sra %[temp4], %[temp4], 27 \n\t"
588
"addu %[temp7], %[temp1], %[temp0] \n\t"
589
"subu %[temp2], %[temp2], %[temp4] \n\t"
590
"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
591
"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
592
"sb %[temp3], -1(%[p]) \n\t"
593
"sb %[temp4], 0(%[p]) \n\t"
594
"1: \n\t"
595
"bgtz %[i], 0b \n\t"
596
" addu %[p], %[p], %[stride] \n\t"
597
".set pop \n\t"
598
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
599
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
600
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
601
[p]"+&r"(p), [i]"=&r"(i)
602
: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
603
: "memory"
604
);
605
}
606
607
// Applies SimpleVFilter16 at p + 4*stride, p + 8*stride and p + 12*stride.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;  // move down to the next inner edge
    SimpleVFilter16(p, stride, thresh);
  }
}
614
615
// Applies SimpleHFilter16 at p + 4, p + 8 and p + 12.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;  // move right to the next inner edge
    SimpleHFilter16(p, stride, thresh);
  }
}
622
623
// Emits two unaligned word stores ("usw"):
// DST[A * BPS] = TEMP0
// DST[B + C * BPS] = TEMP1
// TEMPn and DST are asm operand names; A, B and C are pasted into the
// instruction text, so they must be integer literals.
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
  "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t"                  \
  "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
628
629
static void VE4(uint8_t* dst) { // vertical
630
const uint8_t* top = dst - BPS;
631
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
632
__asm__ volatile (
633
"ulw %[temp0], -1(%[top]) \n\t"
634
"ulh %[temp1], 3(%[top]) \n\t"
635
"preceu.ph.qbr %[temp2], %[temp0] \n\t"
636
"preceu.ph.qbl %[temp3], %[temp0] \n\t"
637
"preceu.ph.qbr %[temp4], %[temp1] \n\t"
638
"packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
639
"packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
640
"shll.ph %[temp5], %[temp5], 1 \n\t"
641
"shll.ph %[temp6], %[temp6], 1 \n\t"
642
"addq.ph %[temp2], %[temp5], %[temp2] \n\t"
643
"addq.ph %[temp6], %[temp6], %[temp4] \n\t"
644
"addq.ph %[temp2], %[temp2], %[temp3] \n\t"
645
"addq.ph %[temp6], %[temp6], %[temp3] \n\t"
646
"shra_r.ph %[temp2], %[temp2], 2 \n\t"
647
"shra_r.ph %[temp6], %[temp6], 2 \n\t"
648
"precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
649
STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
650
STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
651
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
652
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
653
[temp6]"=&r"(temp6)
654
: [top]"r"(top), [dst]"r"(dst)
655
: "memory"
656
);
657
}
658
659
static void DC4(uint8_t* dst) { // DC
660
int temp0, temp1, temp2, temp3, temp4;
661
__asm__ volatile (
662
"ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
663
LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
664
"ins %[temp1], %[temp2], 8, 8 \n\t"
665
"ins %[temp1], %[temp3], 16, 8 \n\t"
666
"ins %[temp1], %[temp4], 24, 8 \n\t"
667
"raddu.w.qb %[temp0], %[temp0] \n\t"
668
"raddu.w.qb %[temp1], %[temp1] \n\t"
669
"addu %[temp0], %[temp0], %[temp1] \n\t"
670
"shra_r.w %[temp0], %[temp0], 3 \n\t"
671
"replv.qb %[temp0], %[temp0] \n\t"
672
STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
673
STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
674
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
675
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
676
: [dst]"r"(dst)
677
: "memory"
678
);
679
}
680
681
static void RD4(uint8_t* dst) { // Down-right
682
int temp0, temp1, temp2, temp3, temp4;
683
int temp5, temp6, temp7, temp8;
684
__asm__ volatile (
685
LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
686
"ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
687
"ins %[temp1], %[temp0], 16, 16 \n\t"
688
"preceu.ph.qbr %[temp5], %[temp7] \n\t"
689
"ins %[temp2], %[temp1], 16, 16 \n\t"
690
"preceu.ph.qbl %[temp4], %[temp7] \n\t"
691
"ins %[temp3], %[temp2], 16, 16 \n\t"
692
"shll.ph %[temp2], %[temp2], 1 \n\t"
693
"addq.ph %[temp3], %[temp3], %[temp1] \n\t"
694
"packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
695
"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
696
"addq.ph %[temp1], %[temp1], %[temp5] \n\t"
697
"shll.ph %[temp6], %[temp6], 1 \n\t"
698
"addq.ph %[temp1], %[temp1], %[temp6] \n\t"
699
"packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
700
"addq.ph %[temp8], %[temp5], %[temp4] \n\t"
701
"shra_r.ph %[temp3], %[temp3], 2 \n\t"
702
"shll.ph %[temp0], %[temp0], 1 \n\t"
703
"shra_r.ph %[temp1], %[temp1], 2 \n\t"
704
"addq.ph %[temp8], %[temp0], %[temp8] \n\t"
705
"lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
706
"precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
707
"shra_r.ph %[temp8], %[temp8], 2 \n\t"
708
"ins %[temp7], %[temp5], 0, 8 \n\t"
709
"precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
710
"raddu.w.qb %[temp4], %[temp7] \n\t"
711
"precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
712
"shra_r.w %[temp4], %[temp4], 2 \n\t"
713
STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
714
"prepend %[temp2], %[temp8], 8 \n\t"
715
"prepend %[temp6], %[temp4], 8 \n\t"
716
STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
717
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
718
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
719
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
720
: [dst]"r"(dst)
721
: "memory"
722
);
723
}
724
725
// Emits two unaligned word loads ("ulw"):
// TEMP0 = SRC[A * BPS]
// TEMP1 = SRC[B + C * BPS]
// TEMPn and SRC are asm operand names; A, B and C are pasted into the
// instruction text, so they must be integer literals.
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
  "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t"                  \
  "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
730
731
static void LD4(uint8_t* dst) { // Down-Left
732
int temp0, temp1, temp2, temp3, temp4;
733
int temp5, temp6, temp7, temp8, temp9;
734
__asm__ volatile (
735
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
736
"preceu.ph.qbl %[temp2], %[temp0] \n\t"
737
"preceu.ph.qbr %[temp3], %[temp0] \n\t"
738
"preceu.ph.qbr %[temp4], %[temp1] \n\t"
739
"preceu.ph.qbl %[temp5], %[temp1] \n\t"
740
"packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
741
"packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
742
"packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
743
"shll.ph %[temp6], %[temp6], 1 \n\t"
744
"addq.ph %[temp9], %[temp2], %[temp6] \n\t"
745
"shll.ph %[temp7], %[temp7], 1 \n\t"
746
"addq.ph %[temp9], %[temp9], %[temp3] \n\t"
747
"shll.ph %[temp8], %[temp8], 1 \n\t"
748
"shra_r.ph %[temp9], %[temp9], 2 \n\t"
749
"addq.ph %[temp3], %[temp4], %[temp7] \n\t"
750
"addq.ph %[temp0], %[temp5], %[temp8] \n\t"
751
"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
752
"addq.ph %[temp0], %[temp0], %[temp4] \n\t"
753
"shra_r.ph %[temp3], %[temp3], 2 \n\t"
754
"shra_r.ph %[temp0], %[temp0], 2 \n\t"
755
"srl %[temp1], %[temp1], 24 \n\t"
756
"sll %[temp1], %[temp1], 1 \n\t"
757
"raddu.w.qb %[temp5], %[temp5] \n\t"
758
"precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
759
"precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
760
"addu %[temp1], %[temp1], %[temp5] \n\t"
761
"shra_r.w %[temp1], %[temp1], 2 \n\t"
762
STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
763
"prepend %[temp9], %[temp0], 8 \n\t"
764
"prepend %[temp3], %[temp1], 8 \n\t"
765
STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
766
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
767
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
768
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
769
[temp9]"=&r"(temp9)
770
: [dst]"r"(dst)
771
: "memory"
772
);
773
}
774
775
//------------------------------------------------------------------------------
776
// Chroma
777
778
static void DC8uv(uint8_t* dst) { // DC
779
int temp0, temp1, temp2, temp3, temp4;
780
int temp5, temp6, temp7, temp8, temp9;
781
__asm__ volatile (
782
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
783
LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
784
LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
785
"raddu.w.qb %[temp0], %[temp0] \n\t"
786
"raddu.w.qb %[temp1], %[temp1] \n\t"
787
"addu %[temp2], %[temp2], %[temp3] \n\t"
788
"addu %[temp4], %[temp4], %[temp5] \n\t"
789
"addu %[temp6], %[temp6], %[temp7] \n\t"
790
"addu %[temp8], %[temp8], %[temp9] \n\t"
791
"addu %[temp0], %[temp0], %[temp1] \n\t"
792
"addu %[temp2], %[temp2], %[temp4] \n\t"
793
"addu %[temp6], %[temp6], %[temp8] \n\t"
794
"addu %[temp0], %[temp0], %[temp2] \n\t"
795
"addu %[temp0], %[temp0], %[temp6] \n\t"
796
"shra_r.w %[temp0], %[temp0], 4 \n\t"
797
"replv.qb %[temp0], %[temp0] \n\t"
798
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
799
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
800
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
801
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
802
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
803
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
804
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
805
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
806
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
807
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
808
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
809
[temp9]"=&r"(temp9)
810
: [dst]"r"(dst)
811
: "memory"
812
);
813
}
814
815
// Chroma DC predictor for blocks without left samples: only the two words
// fetched by LOAD_8_BYTES contribute. Their byte sums are added, shifted
// right by 3 with rounding, and the resulting byte is replicated and stored
// eight times.
// NOTE(review): LOAD_8_BYTES / STORE_8_BYTES are defined earlier in the file;
// LOAD_8_BYTES presumably reads the 8 samples of the row above -- confirm.
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
816
int temp0, temp1;
817
__asm__ volatile (
818
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
819
// raddu.w.qb (MIPS DSP) sums the four unsigned bytes of each word.
"raddu.w.qb %[temp0], %[temp0] \n\t"
820
"raddu.w.qb %[temp1], %[temp1] \n\t"
821
"addu %[temp0], %[temp0], %[temp1] \n\t"
822
// Rounding shift right by 3, then broadcast the low byte to every lane.
"shra_r.w %[temp0], %[temp0], 3 \n\t"
823
"replv.qb %[temp0], %[temp0] \n\t"
824
// One store per value 0..7 of the third/fifth argument (presumably the row).
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
825
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
826
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
827
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
828
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
829
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
830
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
831
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
832
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
833
: [dst]"r"(dst)
834
: "memory"
835
);
836
}
837
838
// Chroma DC predictor for blocks without top samples: only the eight values
// fetched by the two LOAD_4_BYTES calls contribute. They are summed, shifted
// right by 3 with rounding, and the resulting byte is replicated and stored
// eight times.
// NOTE(review): LOAD_4_BYTES / STORE_8_BYTES are defined earlier in the file;
// LOAD_4_BYTES presumably reads single bytes from the column to the left of
// the block -- confirm against the macro definitions.
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
839
int temp0, temp1, temp2, temp3, temp4;
840
int temp5, temp6, temp7, temp8;
841
__asm__ volatile (
842
LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
843
// temp1 receives the eighth value here; temp0 is not written until the
// final addu below.
LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
844
// Pairwise reduction of the eight values.
"addu %[temp2], %[temp2], %[temp3] \n\t"
845
"addu %[temp4], %[temp4], %[temp5] \n\t"
846
"addu %[temp6], %[temp6], %[temp7] \n\t"
847
"addu %[temp8], %[temp8], %[temp1] \n\t"
848
"addu %[temp2], %[temp2], %[temp4] \n\t"
849
"addu %[temp6], %[temp6], %[temp8] \n\t"
850
"addu %[temp0], %[temp6], %[temp2] \n\t"
851
// Rounding shift right by 3, then broadcast the low byte to every lane.
"shra_r.w %[temp0], %[temp0], 3 \n\t"
852
"replv.qb %[temp0], %[temp0] \n\t"
853
// One store per value 0..7 of the third/fifth argument (presumably the row).
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
854
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
855
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
856
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
857
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
858
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
859
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
860
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
861
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
862
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
863
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
864
: [dst]"r"(dst)
865
: "memory"
866
);
867
}
868
869
// Retire the byte load/store helper macros; nothing below this point uses them.
#undef LOAD_8_BYTES
870
#undef STORE_8_BYTES
871
#undef LOAD_4_BYTES
872
873
// Assembly fragment shared by CLIP_8B_TO_DST. It must be pasted inside an asm
// statement that defines the operands temp0..temp3 and dst_1.
//   1. preceu.ph.qbl / preceu.ph.qbr widen the four bytes held in temp0 into
//      two registers of unsigned halfwords (temp2 = upper pair, temp0 = lower
//      pair).
//   2. addu.ph adds the packed halfwords of dst_1 to every widened value.
//   3. shll_s.ph by 7 followed by precrqu_s.qb.ph narrows the halfwords back
//      to four bytes in temp0 using saturating instructions; per the macro
//      name this is meant to clip each sum to the byte range (see the MIPS
//      DSP ASE manual for the exact saturation rules).
// SIZE is stringized into assembler '.if' directives, so the choice is made at
// assembly time: with SIZE == 8 the same steps are also applied to temp1
// (using temp3 as scratch); any other value processes temp0 only. The visible
// callers pass 4 or 8.
#define CLIPPING(SIZE) \
874
"preceu.ph.qbl %[temp2], %[temp0] \n\t" \
875
"preceu.ph.qbr %[temp0], %[temp0] \n\t" \
876
".if " #SIZE " == 8 \n\t" \
877
"preceu.ph.qbl %[temp3], %[temp1] \n\t" \
878
"preceu.ph.qbr %[temp1], %[temp1] \n\t" \
879
".endif \n\t" \
880
"addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
881
"addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
882
".if " #SIZE " == 8 \n\t" \
883
"addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
884
"addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
885
".endif \n\t" \
886
"shll_s.ph %[temp2], %[temp2], 7 \n\t" \
887
"shll_s.ph %[temp0], %[temp0], 7 \n\t" \
888
".if " #SIZE " == 8 \n\t" \
889
"shll_s.ph %[temp3], %[temp3], 7 \n\t" \
890
"shll_s.ph %[temp1], %[temp1], 7 \n\t" \
891
".endif \n\t" \
892
"precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
893
".if " #SIZE " == 8 \n\t" \
894
"precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
895
".endif \n\t"
896
897
898
// Produces one output row of SIZE bytes at DST from the SIZE bytes at TOP.
//   - dst_1 starts as DST[-1] duplicated into both 16-bit halves of a word.
//   - subu.ph subtracts top_1 from it halfword-wise, so dst_1 becomes the
//     per-row offset (DST[-1] minus the value packed in top_1).
//   - The row is then handled in words: ulw loads from TOP, CLIPPING adds the
//     offset and narrows back to bytes, usw stores to DST.
// The assembler-time '.if' selects the width: SIZE < 8 handles 4 bytes,
// otherwise 8 bytes, plus a second group of 8 (offsets 8 and 12) when
// SIZE == 16.
// NOTE: 'top_1' is not a macro parameter; a variable with that exact name
// must be in scope where the macro is expanded (CLIP_TO_DST provides it).
// dst_1 is a read-write early-clobber operand because the asm updates it in
// place; the "memory" clobber covers the unaligned loads and stores.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
899
int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
900
int temp0, temp1, temp2, temp3; \
901
__asm__ volatile ( \
902
".if " #SIZE " < 8 \n\t" \
903
"ulw %[temp0], 0(%[top]) \n\t" \
904
"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
905
CLIPPING(4) \
906
"usw %[temp0], 0(%[dst]) \n\t" \
907
".else \n\t" \
908
"ulw %[temp0], 0(%[top]) \n\t" \
909
"ulw %[temp1], 4(%[top]) \n\t" \
910
"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
911
CLIPPING(8) \
912
"usw %[temp0], 0(%[dst]) \n\t" \
913
"usw %[temp1], 4(%[dst]) \n\t" \
914
".if " #SIZE " == 16 \n\t" \
915
"ulw %[temp0], 8(%[top]) \n\t" \
916
"ulw %[temp1], 12(%[top]) \n\t" \
917
CLIPPING(8) \
918
"usw %[temp0], 8(%[dst]) \n\t" \
919
"usw %[temp1], 12(%[dst]) \n\t" \
920
".endif \n\t" \
921
".endif \n\t" \
922
: [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
923
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
924
: [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
925
: "memory" \
926
); \
927
} while (0)
928
929
// Fills SIZE rows starting at DST, one CLIP_8B_TO_DST call per row.
//   - 'top' is the row located BPS bytes before DST and stays fixed for the
//     whole loop, so every output row is derived from that same row.
//   - 'top_1' packs top[-1] into both 16-bit halves of a word; the name is
//     significant because CLIP_8B_TO_DST refers to it directly.
//   - DST is advanced by BPS after each row, so the argument must be a
//     modifiable pointer variable and is left pointing SIZE rows further on.
#define CLIP_TO_DST(DST, SIZE) do { \
930
int y; \
931
const uint8_t* top = (DST) - BPS; \
932
const int top_1 = ((int)top[-1] << 16) + top[-1]; \
933
for (y = 0; y < (SIZE); ++y) { \
934
CLIP_8B_TO_DST((DST), top, (SIZE)); \
935
(DST) += BPS; \
936
} \
937
} while (0)
938
939
// Generates a static function TrueMotion<SIZE>(uint8_t* DST) whose whole body
// is CLIP_TO_DST(DST, SIZE). DST doubles as the parameter name so the macro
// body can advance it row by row.
#define TRUE_MOTION(DST, SIZE) \
940
static void TrueMotion##SIZE(uint8_t* (DST)) { \
941
CLIP_TO_DST((DST), (SIZE)); \
942
}
943
944
// Instantiate TrueMotion4, TrueMotion8 and TrueMotion16.
TRUE_MOTION(dst, 4)
945
TRUE_MOTION(dst, 8)
946
TRUE_MOTION(dst, 16)
947
948
// The generator macros are only needed for the three instantiations above.
#undef TRUE_MOTION
949
#undef CLIP_TO_DST
950
#undef CLIP_8B_TO_DST
951
#undef CLIPPING
952
953
//------------------------------------------------------------------------------
954
// Entry point
955
956
extern void VP8DspInitMIPSdspR2(void);
957
958
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
959
VP8TransformDC = TransformDC;
960
VP8TransformAC3 = TransformAC3;
961
VP8Transform = TransformTwo;
962
963
VP8VFilter16 = VFilter16;
964
VP8HFilter16 = HFilter16;
965
VP8VFilter8 = VFilter8;
966
VP8HFilter8 = HFilter8;
967
VP8VFilter16i = VFilter16i;
968
VP8HFilter16i = HFilter16i;
969
VP8VFilter8i = VFilter8i;
970
VP8HFilter8i = HFilter8i;
971
VP8SimpleVFilter16 = SimpleVFilter16;
972
VP8SimpleHFilter16 = SimpleHFilter16;
973
VP8SimpleVFilter16i = SimpleVFilter16i;
974
VP8SimpleHFilter16i = SimpleHFilter16i;
975
976
VP8PredLuma4[0] = DC4;
977
VP8PredLuma4[1] = TrueMotion4;
978
VP8PredLuma4[2] = VE4;
979
VP8PredLuma4[4] = RD4;
980
VP8PredLuma4[6] = LD4;
981
982
VP8PredChroma8[0] = DC8uv;
983
VP8PredChroma8[1] = TrueMotion8;
984
VP8PredChroma8[4] = DC8uvNoTop;
985
VP8PredChroma8[5] = DC8uvNoLeft;
986
987
VP8PredLuma16[1] = TrueMotion16;
988
}
989
990
#else // !WEBP_USE_MIPS_DSP_R2
991
992
// Fallback when the MIPS DSP R2 code above is compiled out.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; it presumably
// emits a placeholder definition of VP8DspInitMIPSdspR2 so the symbol still
// exists -- confirm against its definition.
WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
993
994
#endif // WEBP_USE_MIPS_DSP_R2
995
996