CoCalc -- dec_mips_dsp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
¹⁶³⁴⁸ views
1
// Copyright 2014 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// MIPS version of dsp functions
11
//
12
// Author(s):  Djordje Pesut    ([email protected])
13
//             Jovan Zelincevic ([email protected])
14

15
#include "src/dsp/dsp.h"
16

17
#if defined(WEBP_USE_MIPS_DSP_R2)
18

19
#include "src/dsp/mips_macro.h"
20

21
// 16.16 fixed-point multipliers used by the inverse transforms below.
//   kC1 = 20091 + 65536 = 85627  (kC1 / 65536 ~= 1.30656)
//   kC2 = 35468                  (kC2 / 65536 ~= 0.54120)
// NOTE(review): these look like the VP8 inverse-DCT constants
// sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) -- confirm against the reference
// C implementation / RFC 6386.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

// Fixed-point multiply: the full product shifted right by 16 bits.
// Both arguments are evaluated exactly once.
#define MUL(a, b) (((a) * (b)) >> 16)
25

26
// DC-only inverse transform.
//
// Reads only in[0]. The value (in[0] + 4) >> 3 is replicated into both
// halfwords of temp5 and handed to STORE_SAT_SUM_X2 for every output lane,
// together with the bytes previously loaded from dst.
//
// in  - coefficient block; only the first int16_t is read.
// dst - destination pixels, updated in place.
//
// NOTE(review): LOAD_WITH_OFFSET_X4 / CONVERT_2_BYTES_TO_HALF /
// STORE_SAT_SUM_X2 come from mips_macro.h and are not visible here. From
// their names and arguments they presumably load four rows (BPS apart) of
// four bytes, widen them to halfwords, add the DC term and store the
// saturated bytes back -- confirm against the macro definitions.
static void TransformDC(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

  __asm__ volatile (
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    // temp5 = in[0] + 4, copied into the upper halfword too, then each
    // halfword is arithmetically shifted right by 3.
    "lh               %[temp5],  0(%[in])               \n\t"
    "addiu            %[temp5],  %[temp5],  4           \n\t"
    "ins              %[temp5],  %[temp5],  16, 16      \n\t"
    "shra.ph          %[temp5],  %[temp5],  3           \n\t"
    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
                            temp3, temp1, temp2, temp3, temp4)
    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_10()
    : [in]"r"(in), [dst]"r"(dst)
    : "memory"
  );
}
49

50
// Inverse transform for blocks where only in[0], in[1] and in[4] are used.
//
// The five scalar terms are computed in C:
//   a  = in[0] + 4
//   c4 = MUL(in[4], kC2),  d4 = MUL(in[4], kC1)
//   c1 = MUL(in[1], kC2),  d1 = MUL(in[1], kC1)
// and then combined with the bytes of dst by the assembly below.
//
// in  - coefficient block; only indices 0, 1 and 4 are read.
// dst - destination pixels, updated in place.
//
// NOTE(review): the helper macros come from mips_macro.h and are not visible
// here; judging by their names the result is added to dst with saturation --
// confirm against the macro definitions.
// NOTE(review): MUL(in[k], kC1) multiplies an int16_t by 85627 in plain
// 'int'; for |in[k]| > 25079 the product exceeds INT_MAX (signed overflow).
// Confirm that callers bound the coefficients, or compute the term as
// in[k] + MUL(in[k], 20091), which is the same value without the overflow.
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;
  int c4 = MUL(in[4], kC2);  // not const: the asm packs d4 into its top half
  const int d4 = MUL(in[4], kC1);
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    // c4 = (d4 << 16) | (c4 & 0xffff); a and d1 are replicated into both
    // halfwords of temp1 / temp4.
    "ins              %[c4],      %[d4],     16,       16    \n\t"
    "replv.ph         %[temp1],   %[a]                       \n\t"
    "replv.ph         %[temp4],   %[d1]                      \n\t"
    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
    "replv.ph         %[temp5],   %[c1]                      \n\t"
    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
                            temp11, temp17, temp3, temp5, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
                          temp4, temp7, temp6, temp10, temp9)
    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
                     temp7, temp6, dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18(),
      [c4]"+&r"(c4)
    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
    : "memory"
  );
}
85

86
// Full inverse transform of one coefficient block, added to dst.
//
// The whole computation is a single asm statement built from the helper
// macros in mips_macro.h. kC1 and kC2 are passed in as register operands and
// the HI/LO registers are declared clobbered.
//
// in  - coefficient block; read through unaligned word loads (ulw) at byte
//       offsets 0, 4, 16, 20 and through LOAD_IN_X2 with offsets 8..14 and
//       24..30.
// dst - destination pixels, updated in place.
//
// NOTE(review): the macro bodies are not visible in this file. From the
// names and the "horizontal" marker, the first half is presumably the
// vertical pass and the second half the horizontal pass followed by a
// saturated add into four dst rows spaced BPS apart -- confirm against
// mips_macro.h.
static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    "ulw              %[temp1],   0(%[in])                 \n\t"
    "ulw              %[temp2],   16(%[in])                \n\t"
    LOAD_IN_X2(temp5, temp6, 24, 26)
    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    LOAD_IN_X2(temp1, temp2, 8, 10)
    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
                  temp13, temp11, temp14, temp12)
    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    "ulw              %[temp17],  4(%[in])                 \n\t"
    "ulw              %[temp18],  20(%[in])                \n\t"
    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    LOAD_IN_X2(temp17, temp18, 12, 14)
    LOAD_IN_X2(temp9, temp10, 28, 30)
    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
                  temp15, temp4, temp16, temp17)
    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)

    // horizontal
    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    // temp2 = {4, 4}: constant added to temp1 and temp6 just below.
    "repl.ph          %[temp2],   0x4                      \n\t"
    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
                  temp6, temp17, temp8, temp18)
    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
                  temp18, temp12, temp17, temp16)
    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
                   temp6)
    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                          temp16, temp11, temp10, temp15, temp14)
    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                            temp11, temp10, temp11, temp14, temp15)
    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18()
    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
    : "memory", "hi", "lo"
  );
}
152

153
// Runs TransformOne() on the block at (in, dst) and, when do_two is
// non-zero, on a second block whose coefficients start 16 int16_t further
// on and whose pixels start 4 bytes to the right.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}
159

160
// Edge filter that rewrites either 2 or 6 pixels per position.
//
// For each position the eight bytes p[-4*hstride] .. p[3*hstride] are
// loaded (named p3 p2 p1 p0 | q0 q1 q2 q3 in the comments below). The
// position is left untouched when
//   4*|p0 - q0| + |p1 - q1| > 2*thresh + 1
// or when any neighbouring difference |p3-p2|, |p2-p1|, |p1-p0|, |q3-q2|,
// |q2-q1|, |q1-q0| exceeds ithresh. Otherwise:
//   - if |p1-p0| > hev_thresh or |q1-q0| > hev_thresh, only p0 and q0 are
//     rewritten;
//   - else p2, p1, p0, q0, q1, q2 are rewritten.
// All stored bytes are looked up through the VP8kclip1 table.
//
// p        - first position to filter; advanced by vstride per iteration.
// hstride  - byte distance between consecutive taps across the edge.
// vstride  - byte distance between consecutive positions along the edge.
// size     - number of positions.
//
// NOTE(review): size is decremented at the top and tested at the bottom, so
// the body always runs at least once; every caller visible in this file
// passes 8 or 16.
static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15;

  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
  "1:                                                    \n\t"
    // Tap offsets: temp1 = -h, temp3 = -2h, temp5 = -3h, temp6 = -4h,
    //              temp2 = 2h, temp4 = 3h.
    // Pixels: temp7 = q0, temp8 = p2, temp9 = p1, temp10 = p0, temp11 = p3,
    //         temp12 = q1, temp13 = q2, temp14 = q3.
    "negu      %[temp1],  %[hstride]                     \n\t"
    "addiu     %[size],   %[size],        -1             \n\t"
    "sll       %[temp2],  %[hstride],     1              \n\t"
    "sll       %[temp3],  %[temp1],       1              \n\t"
    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
    "lbu       %[temp7],  0(%[p])                        \n\t"
    "sll       %[temp6],  %[temp3],       1              \n\t"
    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"
    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"
    "lbux      %[temp10], %[temp1](%[p])                 \n\t"
    "lbux      %[temp11], %[temp6](%[p])                 \n\t"
    "lbux      %[temp12], %[hstride](%[p])               \n\t"
    "lbux      %[temp13], %[temp2](%[p])                 \n\t"
    "lbux      %[temp14], %[temp4](%[p])                 \n\t"
    // Edge-strength test: skip when 4*|p0-q0| + |p1-q1| > thresh2.
    // Afterwards temp1 = q0 - p0 and temp6 = 2 * (q0 - p0).
    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"
    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"
    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
    "negu      %[temp1],  %[temp1]                       \n\t"
    "sll       %[temp3],  %[temp3],       2              \n\t"
    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"
    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
    "sll       %[temp6],  %[temp1],       1              \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
    // Interior tests against ithresh; each branch delay slot already
    // starts the next difference. temp2/temp4 end up holding (p1 - q1)
    // saturated to 8 signed bits; temp5 and temp15 become the two
    // hev_thresh comparison flags.
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "bgtz      %[temp4],  3f                             \n\t"
    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "sra       %[temp4],  %[temp2],       24             \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
    "absq_s.w  %[temp15], %[temp15]                      \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"
    // temp5 = 3 * (q0 - p0) + sat8(p1 - q1); temp2 = combined hev flag.
    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"
    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"
    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
    "beqz      %[temp2],  4f                             \n\t"
    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
    // High edge variance: two-pixel update.
    //   temp1 = rounded (temp5 >> 3), temp2 = (temp5 + 3) >> 3, both
    //   saturated to 5 signed bits; q0 -= temp1, p0 += temp2.
    "addiu     %[temp2],  %[temp5],       3              \n\t"
    "sra       %[temp2],  %[temp2],       3              \n\t"
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"
    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"
    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
    "sb        %[temp2],  0(%[temp3])                    \n\t"
    "j         3f                                        \n\t"
    " sb       %[temp1],  0(%[p])                        \n\t"
  "4:                                                    \n\t"
    // Low edge variance: six-pixel update. With a = sat8(temp5):
    //   temp2 = (9a + 63) >> 7, temp3 = (18a + 63) >> 7,
    //   temp4 = (27a + 63) >> 7
    // p2 += temp2, p1 += temp3, p0 += temp4,
    // q0 -= temp4, q1 -= temp3, q2 -= temp2.
    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
    "subu      %[temp14], %[p],           %[hstride]     \n\t"
    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
    "sra       %[temp6],  %[temp5],       24             \n\t"
    "sll       %[temp1],  %[temp6],       3              \n\t"
    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"
    "sll       %[temp3],  %[temp2],       1              \n\t"
    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"
    "addiu     %[temp2],  %[temp2],       63             \n\t"
    "addiu     %[temp3],  %[temp3],       63             \n\t"
    "addiu     %[temp4],  %[temp4],       63             \n\t"
    "sra       %[temp2],  %[temp2],       7              \n\t"
    "sra       %[temp3],  %[temp3],       7              \n\t"
    "sra       %[temp4],  %[temp4],       7              \n\t"
    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
    "addu      %[temp10], %[p],           %[hstride]     \n\t"
    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
    "sb        %[temp2],  0(%[temp15])                   \n\t"
    "sb        %[temp3],  0(%[temp11])                   \n\t"
    "sb        %[temp4],  0(%[temp14])                   \n\t"
    "sb        %[temp5],  0(%[p])                        \n\t"
    "sb        %[temp6],  0(%[temp10])                   \n\t"
    "sb        %[temp8],  0(%[temp12])                   \n\t"
  "3:                                                    \n\t"
    // Next position: p += vstride happens in the branch delay slot.
    "bgtz      %[size],   1b                             \n\t"
    " addu     %[p],      %[p],           %[vstride]     \n\t"
    ".set      pop                                       \n\t"
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
      [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),
      [temp10]"=&r"(temp10), [temp11]"=&r"(temp11), [temp12]"=&r"(temp12),
      [temp13]"=&r"(temp13), [temp14]"=&r"(temp14), [temp15]"=&r"(temp15),
      [size]"+&r"(size), [p]"+&r"(p)
    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
      [ithresh]"r"(ithresh), [vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
      [VP8kclip1]"r"(VP8kclip1)
    : "memory"
  );
}
297

298
// Edge filter that rewrites either 2 or 4 pixels per position.
//
// For each position the bytes p[-4*hstride] .. p[3*hstride] are loaded into
// p3 p2 p1 p0 | q0 q1 q2 q3. The position is left untouched when
//   4*|p0 - q0| + |p1 - q1| > 2*thresh + 1
// or when any of |p3-p2|, |p2-p1|, |p1-p0|, |q3-q2|, |q2-q1|, |q1-q0|
// exceeds ithresh. Otherwise:
//   - if |p1-p0| > hev_thresh or |q1-q0| > hev_thresh, only p0 and q0 are
//     rewritten;
//   - else p1, p0, q0 and q1 are rewritten.
// All stored bytes are looked up through the VP8kclip1 table.
//
// p        - first position to filter; advanced by vstride per iteration.
// hstride  - byte distance between consecutive taps across the edge.
// vstride  - byte distance between consecutive positions along the edge.
// size     - number of positions.
//
// NOTE(review): the entry guard is "bltz", so only a negative size skips the
// loop; size == 0 still runs the body once. Every caller visible in this
// file passes 8 or 16.
static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  int p0, q0, p1, q1, p2, q2, p3, q3;
  int step1, step2, temp1, temp2, temp3, temp4;
  uint8_t* pTemp0;
  uint8_t* pTemp1;
  const int thresh2 = 2 * thresh + 1;

  __asm__ volatile (
    ".set      push                                   \n\t"
    ".set      noreorder                              \n\t"
    "bltz      %[size],    3f                         \n\t"
    " nop                                             \n\t"
  "2:                                                 \n\t"
    // Load p0/q0/p1/q1 and run the edge-strength test. step1 walks the
    // negative tap offsets (-h, -2h, -3h, ...), step2 the positive ones.
    // temp1 = p0 - q0, temp3 = p1 - q1.
    "negu      %[step1],   %[hstride]                 \n\t"
    "lbu       %[q0],      0(%[p])                    \n\t"
    "lbux      %[p0],      %[step1](%[p])             \n\t"
    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
    "lbux      %[q1],      %[hstride](%[p])           \n\t"
    "subu      %[temp1],   %[p0],         %[q0]       \n\t"
    "lbux      %[p1],      %[step1](%[p])             \n\t"
    "addu      %[step2],   %[hstride],    %[hstride]  \n\t"
    "absq_s.w  %[temp2],   %[temp1]                   \n\t"
    "subu      %[temp3],   %[p1],         %[q1]       \n\t"
    "absq_s.w  %[temp4],   %[temp3]                   \n\t"
    "sll       %[temp2],   %[temp2],      2           \n\t"
    "addu      %[temp2],   %[temp2],      %[temp4]    \n\t"
    "subu      %[temp4],   %[temp2],      %[thresh2]  \n\t"
    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
    "bgtz      %[temp4],   0f                         \n\t"
    " lbux     %[p2],      %[step1](%[p])             \n\t"
    // Interior tests against ithresh, interleaved with the remaining loads.
    // temp1 is negated to q0 - p0 on the way; p3 and q3 are reused to hold
    // |p1-p0| and |q1-q0| once their pixel values are no longer needed.
    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
    "lbux      %[q2],      %[step2](%[p])             \n\t"
    "lbux      %[p3],      %[step1](%[p])             \n\t"
    "subu      %[temp4],   %[p2],         %[p1]       \n\t"
    "addu      %[step2],   %[step2],      %[hstride]  \n\t"
    "subu      %[temp2],   %[p3],         %[p2]       \n\t"
    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
    "absq_s.w  %[temp2],   %[temp2]                   \n\t"
    "lbux      %[q3],      %[step2](%[p])             \n\t"
    "subu      %[temp4],   %[temp4],      %[ithresh]  \n\t"
    "negu      %[temp1],   %[temp1]                   \n\t"
    "bgtz      %[temp4],   0f                         \n\t"
    " subu     %[temp2],   %[temp2],      %[ithresh]  \n\t"
    "subu      %[p3],      %[p1],         %[p0]       \n\t"
    "bgtz      %[temp2],   0f                         \n\t"
    " absq_s.w %[p3],      %[p3]                      \n\t"
    "subu      %[temp4],   %[q3],         %[q2]       \n\t"
    "subu      %[pTemp0],  %[p],          %[hstride]  \n\t"
    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
    "subu      %[temp2],   %[p3],         %[ithresh]  \n\t"
    "sll       %[step1],   %[temp1],      1           \n\t"
    "bgtz      %[temp2],   0f                         \n\t"
    " subu     %[temp4],   %[temp4],      %[ithresh]  \n\t"
    "subu      %[temp2],   %[q2],         %[q1]       \n\t"
    "bgtz      %[temp4],   0f                         \n\t"
    " absq_s.w %[temp2],   %[temp2]                   \n\t"
    "subu      %[q3],      %[q1],         %[q0]       \n\t"
    "absq_s.w  %[q3],      %[q3]                      \n\t"
    "subu      %[temp2],   %[temp2],      %[ithresh]  \n\t"
    "addu      %[temp1],   %[temp1],      %[step1]    \n\t"
    "bgtz      %[temp2],   0f                         \n\t"
    " subu     %[temp4],   %[q3],         %[ithresh]  \n\t"
    // temp1 is now 3 * (q0 - p0). q3 |= p3 forms the hev flag.
    "slt       %[p3],      %[hev_thresh], %[p3]       \n\t"
    "bgtz      %[temp4],   0f                         \n\t"
    " slt      %[q3],      %[hev_thresh], %[q3]       \n\t"
    "or        %[q3],      %[q3],         %[p3]       \n\t"
    "bgtz      %[q3],      1f                         \n\t"
    " shra_r.w %[temp2],   %[temp1],      3           \n\t"
    // Low edge variance: four-pixel update.
    //   temp2 = rounded (temp1 >> 3), temp1 = (temp1 + 3) >> 3, both
    //   saturated to 5 signed bits; step1 = (temp2 + 1) >> 1.
    //   p0 += temp1, q0 -= temp2, p1 += step1, q1 -= step1.
    "addiu     %[temp1],   %[temp1],      3           \n\t"
    "sra       %[temp1],   %[temp1],      3           \n\t"
    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
    "addu      %[pTemp1],  %[p],          %[hstride]  \n\t"
    "sra       %[temp2],   %[temp2],      27          \n\t"
    "sra       %[temp1],   %[temp1],      27          \n\t"
    "addiu     %[step1],   %[temp2],      1           \n\t"
    "sra       %[step1],   %[step1],      1           \n\t"
    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
    "addu      %[p1],      %[p1],         %[step1]    \n\t"
    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
    "subu      %[q1],      %[q1],         %[step1]    \n\t"
    "lbux      %[temp2],   %[p0](%[VP8kclip1])        \n\t"
    "lbux      %[temp3],   %[q0](%[VP8kclip1])        \n\t"
    "lbux      %[temp4],   %[q1](%[VP8kclip1])        \n\t"
    "sb        %[temp2],   0(%[pTemp0])               \n\t"
    "lbux      %[temp1],   %[p1](%[VP8kclip1])        \n\t"
    "subu      %[pTemp0],  %[pTemp0],    %[hstride]   \n\t"
    "sb        %[temp3],   0(%[p])                    \n\t"
    "sb        %[temp4],   0(%[pTemp1])               \n\t"
    "j         0f                                     \n\t"
    " sb       %[temp1],   0(%[pTemp0])               \n\t"
  "1:                                                 \n\t"
    // High edge variance: two-pixel update.
    //   temp1 = 3 * (q0 - p0) + sat8(p1 - q1)
    //   temp2 = rounded (temp1 >> 3), temp1 = (temp1 + 3) >> 3, both
    //   saturated to 5 signed bits; p0 += temp1, q0 -= temp2.
    "shll_s.w  %[temp3],   %[temp3],      24          \n\t"
    "sra       %[temp3],   %[temp3],      24          \n\t"
    "addu      %[temp1],   %[temp1],      %[temp3]    \n\t"
    "shra_r.w  %[temp2],   %[temp1],      3           \n\t"
    "addiu     %[temp1],   %[temp1],      3           \n\t"
    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
    "sra       %[temp1],   %[temp1],      3           \n\t"
    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
    "sra       %[temp2],   %[temp2],      27          \n\t"
    "sra       %[temp1],   %[temp1],      27          \n\t"
    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
    "lbux      %[temp1],   %[p0](%[VP8kclip1])        \n\t"
    "lbux      %[temp2],   %[q0](%[VP8kclip1])        \n\t"
    "sb        %[temp2],   0(%[p])                    \n\t"
    "sb        %[temp1],   0(%[pTemp0])               \n\t"
  "0:                                                 \n\t"
    // Next position: p += vstride happens in the branch delay slot.
    "subu      %[size],    %[size],       1           \n\t"
    "bgtz      %[size],    2b                         \n\t"
    " addu     %[p],       %[p],          %[vstride]  \n\t"
  "3:                                                 \n\t"
    ".set      pop                                    \n\t"
    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
      [size]"+&r"(size)
    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
426

427
// on macroblock edges
428
static void VFilter16(uint8_t* p, int stride,
429
                      int thresh, int ithresh, int hev_thresh) {
430
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
431
}
432

433
// Filters 16 positions starting at p with FilterLoop26: the taps of each
// position are 1 byte apart and positions are 'stride' bytes apart.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
437

438
// 8-pixels wide variant, for chroma filtering
439
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
440
                     int thresh, int ithresh, int hev_thresh) {
441
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
442
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
443
}
444

445
// Applies FilterLoop26 to 8 positions of the u plane and then of the v
// plane; taps are 1 byte apart, positions 'stride' bytes apart.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
450

451
// on three inner edges
452
static void VFilter16i(uint8_t* p, int stride,
453
                       int thresh, int ithresh, int hev_thresh) {
454
  int k;
455
  for (k = 3; k > 0; --k) {
456
    p += 4 * stride;
457
    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
458
  }
459
}
460

461
// Runs FilterLoop24 three times, at p + 4, p + 8 and p + 12, each time over
// 16 positions 'stride' bytes apart with taps 1 byte apart.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
469

470
// Applies FilterLoop24 once per chroma plane, starting 4 rows below u and v,
// over 8 consecutive bytes with taps 'stride' bytes apart.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
475

476
// Applies FilterLoop24 once per chroma plane, starting 4 bytes to the right
// of u and v, over 8 positions 'stride' bytes apart with taps 1 byte apart.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
481

482
// The fixed-point MUL() helper is not used past this point in the file.
#undef MUL
483

484
//------------------------------------------------------------------------------
485
// Simple In-loop filtering (Paragraph 15.2)
486

487
// Simple filter over 16 consecutive bytes starting at p, with the four taps
// of each position 'stride' bytes apart.
//
// For each of the 16 positions, p[-2*stride], p[-stride], p[0], p[stride]
// are loaded as p1, p0, q0, q1. The position is skipped when
//   4*|p0 - q0| + |p1 - q1| > 2*thresh + 1.
// Otherwise, with a = 3*(q0 - p0) + sat8(p1 - q1):
//   p0 += sat5((a + 3) >> 3)
//   q0 -= sat5(rounded a >> 3)
// and both results are stored after a lookup through VP8kclip1.
//
// p      - first position; taps above it are reached with negative offsets.
// stride - byte distance between rows.
// thresh - filter threshold; only 2*thresh + 1 is used.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  uint8_t* p1 = p - stride;  // store address for the updated p0
  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "li        %[i],        16                           \n\t"
  "0:                                                    \n\t"
    // temp0 = p1, temp1 = p0, temp2 = q0, temp3 = q1.
    "negu      %[temp4],    %[stride]                    \n\t"
    "sll       %[temp5],    %[temp4],       1            \n\t"
    "lbu       %[temp2],    0(%[p])                      \n\t"
    "lbux      %[temp3],    %[stride](%[p])              \n\t"
    "lbux      %[temp1],    %[temp4](%[p])               \n\t"
    "lbux      %[temp0],    %[temp5](%[p])               \n\t"
    // temp5 = 4*|p0-q0| + |p1-q1| - thresh2; temp8 = q0 - p0.
    "subu      %[temp7],    %[temp1],       %[temp2]     \n\t"
    "subu      %[temp6],    %[temp0],       %[temp3]     \n\t"
    "absq_s.w  %[temp4],    %[temp7]                     \n\t"
    "absq_s.w  %[temp5],    %[temp6]                     \n\t"
    "sll       %[temp4],    %[temp4],       2            \n\t"
    "subu      %[temp5],    %[temp5],       %[thresh2]   \n\t"
    "addu      %[temp5],    %[temp4],       %[temp5]     \n\t"
    "negu      %[temp8],    %[temp7]                     \n\t"
    // The loop counter is decremented in the delay slot, i.e. on both the
    // taken and the not-taken path.
    "bgtz      %[temp5],    1f                           \n\t"
    " addiu    %[i],        %[i],           -1           \n\t"
    // temp3 = 3*(q0 - p0) + sat8(p1 - q1).
    "sll       %[temp4],    %[temp8],       1            \n\t"
    "shll_s.w  %[temp5],    %[temp6],       24           \n\t"
    "addu      %[temp3],    %[temp4],       %[temp8]     \n\t"
    "sra       %[temp5],    %[temp5],       24           \n\t"
    "addu      %[temp3],    %[temp3],       %[temp5]     \n\t"
    // temp0 = sat5((temp3 + 3) >> 3), temp4 = sat5(rounded temp3 >> 3).
    "addiu     %[temp7],    %[temp3],       3            \n\t"
    "sra       %[temp7],    %[temp7],       3            \n\t"
    "shra_r.w  %[temp8],    %[temp3],       3            \n\t"
    "shll_s.w  %[temp0],    %[temp7],       27           \n\t"
    "shll_s.w  %[temp4],    %[temp8],       27           \n\t"
    "sra       %[temp0],    %[temp0],       27           \n\t"
    "sra       %[temp4],    %[temp4],       27           \n\t"
    "addu      %[temp7],    %[temp1],       %[temp0]     \n\t"
    "subu      %[temp2],    %[temp2],       %[temp4]     \n\t"
    "lbux      %[temp3],    %[temp7](%[VP8kclip1])       \n\t"
    "lbux      %[temp4],    %[temp2](%[VP8kclip1])       \n\t"
    "sb        %[temp3],    0(%[p1])                     \n\t"
    "sb        %[temp4],    0(%[p])                      \n\t"
  "1:                                                    \n\t"
    // Advance both pointers by one byte to the next position.
    "addiu     %[p1],       %[p1],          1            \n\t"
    "bgtz      %[i],        0b                           \n\t"
    " addiu    %[p],        %[p],           1            \n\t"
    " .set     pop                                       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
544

545
// Expands to the text of four 'lbu' (load byte unsigned) instructions for
// use inside an asm statement. Each offset is emitted as the literal
// expression "<X>+<X1>*<BPS>", so it is evaluated by the assembler:
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
// TEMPn and SRC are asm operand names; XSTR and BPS must be defined where
// the macro is expanded.
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
555

556
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
557
  int i;
558
  const int thresh2 = 2 * thresh + 1;
559
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
560
  __asm__ volatile (
561
    ".set      push                                     \n\t"
562
    ".set      noreorder                                \n\t"
563
    "li        %[i],       16                           \n\t"
564
  "0:                                                   \n\t"
565
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
566
    "subu      %[temp7],    %[temp1],       %[temp2]    \n\t"
567
    "subu      %[temp6],    %[temp0],       %[temp3]    \n\t"
568
    "absq_s.w  %[temp4],    %[temp7]                    \n\t"
569
    "absq_s.w  %[temp5],    %[temp6]                    \n\t"
570
    "sll       %[temp4],    %[temp4],       2           \n\t"
571
    "addu      %[temp5],    %[temp4],       %[temp5]    \n\t"
572
    "subu      %[temp5],    %[temp5],       %[thresh2]  \n\t"
573
    "negu      %[temp8],    %[temp7]                    \n\t"
574
    "bgtz      %[temp5],    1f                          \n\t"
575
    " addiu    %[i],        %[i],           -1          \n\t"
576
    "sll       %[temp4],    %[temp8],       1           \n\t"
577
    "shll_s.w  %[temp5],    %[temp6],       24          \n\t"
578
    "addu      %[temp3],    %[temp4],       %[temp8]    \n\t"
579
    "sra       %[temp5],    %[temp5],       24          \n\t"
580
    "addu      %[temp3],    %[temp3],       %[temp5]    \n\t"
581
    "addiu     %[temp7],    %[temp3],       3           \n\t"
582
    "sra       %[temp7],    %[temp7],       3           \n\t"
583
    "shra_r.w  %[temp8],    %[temp3],       3           \n\t"
584
    "shll_s.w  %[temp0],    %[temp7],       27          \n\t"
585
    "shll_s.w  %[temp4],    %[temp8],       27          \n\t"
586
    "sra       %[temp0],    %[temp0],       27          \n\t"
587
    "sra       %[temp4],    %[temp4],       27          \n\t"
588
    "addu      %[temp7],    %[temp1],       %[temp0]    \n\t"
589
    "subu      %[temp2],    %[temp2],       %[temp4]    \n\t"
590
    "lbux      %[temp3],    %[temp7](%[VP8kclip1])      \n\t"
591
    "lbux      %[temp4],    %[temp2](%[VP8kclip1])      \n\t"
592
    "sb        %[temp3],    -1(%[p])                    \n\t"
593
    "sb        %[temp4],    0(%[p])                     \n\t"
594
  "1:                                                   \n\t"
595
    "bgtz      %[i],        0b                          \n\t"
596
    " addu     %[p],        %[p],           %[stride]   \n\t"
597
    ".set      pop                                      \n\t"
598
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
599
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
600
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
601
      [p]"+&r"(p), [i]"=&r"(i)
602
    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
603
    : "memory"
604
  );
605
}
606

607
// Runs SimpleVFilter16() three times, at p + 4 * stride, p + 8 * stride and
// p + 12 * stride (the pointer is advanced by four rows before each call).
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16(p, stride, thresh);
  }
}
614

615
// Runs SimpleHFilter16() three times, at p + 4, p + 8 and p + 12 (the pointer
// is advanced by four columns before each call).
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16(p, stride, thresh);
  }
}
622

623
// Emits two 'usw' (unaligned store word) instructions for use inside an
// inline-asm template; each one writes a 4-byte word. TEMPx and DST are the
// names of asm operands (they are stringized):
// DST[A * BPS]     = TEMP0
// DST[B + C * BPS] = TEMP1
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
628

629
static void VE4(uint8_t* dst) {    // vertical
630
  const uint8_t* top = dst - BPS;
631
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
632
  __asm__ volatile (
633
    "ulw             %[temp0],   -1(%[top])              \n\t"
634
    "ulh             %[temp1],   3(%[top])               \n\t"
635
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
636
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
637
    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
638
    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
639
    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
640
    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
641
    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
642
    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
643
    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
644
    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
645
    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
646
    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
647
    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
648
    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
649
    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
650
    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
651
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
652
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
653
      [temp6]"=&r"(temp6)
654
    : [top]"r"(top), [dst]"r"(dst)
655
    : "memory"
656
  );
657
}
658

659
static void DC4(uint8_t* dst) {   // DC
660
  int temp0, temp1, temp2, temp3, temp4;
661
  __asm__ volatile (
662
    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
663
    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
664
    "ins          %[temp1],   %[temp2],    8,     8    \n\t"
665
    "ins          %[temp1],   %[temp3],    16,    8    \n\t"
666
    "ins          %[temp1],   %[temp4],    24,    8    \n\t"
667
    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"
668
    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"
669
    "addu         %[temp0],   %[temp0],    %[temp1]    \n\t"
670
    "shra_r.w     %[temp0],   %[temp0],    3           \n\t"
671
    "replv.qb     %[temp0],   %[temp0]                 \n\t"
672
    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
673
    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
674
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
675
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
676
    : [dst]"r"(dst)
677
    : "memory"
678
  );
679
}
680

681
static void RD4(uint8_t* dst) {   // Down-right
682
  int temp0, temp1, temp2, temp3, temp4;
683
  int temp5, temp6, temp7, temp8;
684
  __asm__ volatile (
685
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
686
    "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
687
    "ins            %[temp1],   %[temp0], 16, 16               \n\t"
688
    "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
689
    "ins            %[temp2],   %[temp1], 16, 16               \n\t"
690
    "preceu.ph.qbl  %[temp4],   %[temp7]                       \n\t"
691
    "ins            %[temp3],   %[temp2], 16, 16               \n\t"
692
    "shll.ph        %[temp2],   %[temp2], 1                    \n\t"
693
    "addq.ph        %[temp3],   %[temp3], %[temp1]             \n\t"
694
    "packrl.ph      %[temp6],   %[temp5], %[temp1]             \n\t"
695
    "addq.ph        %[temp3],   %[temp3], %[temp2]             \n\t"
696
    "addq.ph        %[temp1],   %[temp1], %[temp5]             \n\t"
697
    "shll.ph        %[temp6],   %[temp6], 1                    \n\t"
698
    "addq.ph        %[temp1],   %[temp1], %[temp6]             \n\t"
699
    "packrl.ph      %[temp0],   %[temp4], %[temp5]             \n\t"
700
    "addq.ph        %[temp8],   %[temp5], %[temp4]             \n\t"
701
    "shra_r.ph      %[temp3],   %[temp3], 2                    \n\t"
702
    "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
703
    "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
704
    "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
705
    "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
706
    "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
707
    "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
708
    "ins            %[temp7],   %[temp5], 0,  8                \n\t"
709
    "precr.qb.ph    %[temp2],   %[temp1], %[temp3]             \n\t"
710
    "raddu.w.qb     %[temp4],   %[temp7]                       \n\t"
711
    "precr.qb.ph    %[temp6],   %[temp8], %[temp1]             \n\t"
712
    "shra_r.w       %[temp4],   %[temp4], 2                    \n\t"
713
    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
714
    "prepend        %[temp2],   %[temp8], 8                    \n\t"
715
    "prepend        %[temp6],   %[temp4], 8                    \n\t"
716
    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
717
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
718
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
719
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
720
    : [dst]"r"(dst)
721
    : "memory"
722
  );
723
}
724

725
// Emits two 'ulw' (unaligned load word) instructions for use inside an
// inline-asm template; each one reads a 4-byte word. TEMPx and SRC are the
// names of asm operands (they are stringized):
// TEMP0 = SRC[A * BPS]
// TEMP1 = SRC[B + C * BPS]
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
730

731
static void LD4(uint8_t* dst) {   // Down-Left
732
  int temp0, temp1, temp2, temp3, temp4;
733
  int temp5, temp6, temp7, temp8, temp9;
734
  __asm__ volatile (
735
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
736
    "preceu.ph.qbl   %[temp2],    %[temp0]                     \n\t"
737
    "preceu.ph.qbr   %[temp3],    %[temp0]                     \n\t"
738
    "preceu.ph.qbr   %[temp4],    %[temp1]                     \n\t"
739
    "preceu.ph.qbl   %[temp5],    %[temp1]                     \n\t"
740
    "packrl.ph       %[temp6],    %[temp2],    %[temp3]        \n\t"
741
    "packrl.ph       %[temp7],    %[temp4],    %[temp2]        \n\t"
742
    "packrl.ph       %[temp8],    %[temp5],    %[temp4]        \n\t"
743
    "shll.ph         %[temp6],    %[temp6],    1               \n\t"
744
    "addq.ph         %[temp9],    %[temp2],    %[temp6]        \n\t"
745
    "shll.ph         %[temp7],    %[temp7],    1               \n\t"
746
    "addq.ph         %[temp9],    %[temp9],    %[temp3]        \n\t"
747
    "shll.ph         %[temp8],    %[temp8],    1               \n\t"
748
    "shra_r.ph       %[temp9],    %[temp9],    2               \n\t"
749
    "addq.ph         %[temp3],    %[temp4],    %[temp7]        \n\t"
750
    "addq.ph         %[temp0],    %[temp5],    %[temp8]        \n\t"
751
    "addq.ph         %[temp3],    %[temp3],    %[temp2]        \n\t"
752
    "addq.ph         %[temp0],    %[temp0],    %[temp4]        \n\t"
753
    "shra_r.ph       %[temp3],    %[temp3],    2               \n\t"
754
    "shra_r.ph       %[temp0],    %[temp0],    2               \n\t"
755
    "srl             %[temp1],    %[temp1],    24              \n\t"
756
    "sll             %[temp1],    %[temp1],    1               \n\t"
757
    "raddu.w.qb      %[temp5],    %[temp5]                     \n\t"
758
    "precr.qb.ph     %[temp9],    %[temp3],    %[temp9]        \n\t"
759
    "precr.qb.ph     %[temp3],    %[temp0],    %[temp3]        \n\t"
760
    "addu            %[temp1],    %[temp1],    %[temp5]        \n\t"
761
    "shra_r.w        %[temp1],    %[temp1],    2               \n\t"
762
    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
763
    "prepend         %[temp9],    %[temp0],    8               \n\t"
764
    "prepend         %[temp3],    %[temp1],    8               \n\t"
765
    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
766
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
767
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
768
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
769
      [temp9]"=&r"(temp9)
770
    : [dst]"r"(dst)
771
    : "memory"
772
  );
773
}
774

775
//------------------------------------------------------------------------------
776
// Chroma
777

778
static void DC8uv(uint8_t* dst) {     // DC
779
  int temp0, temp1, temp2, temp3, temp4;
780
  int temp5, temp6, temp7, temp8, temp9;
781
  __asm__ volatile (
782
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
783
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
784
    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
785
    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
786
    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
787
    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
788
    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
789
    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
790
    "addu         %[temp8],   %[temp8],    %[temp9]      \n\t"
791
    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
792
    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
793
    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
794
    "addu         %[temp0],   %[temp0],    %[temp2]      \n\t"
795
    "addu         %[temp0],   %[temp0],    %[temp6]      \n\t"
796
    "shra_r.w     %[temp0],   %[temp0],    4             \n\t"
797
    "replv.qb     %[temp0],   %[temp0]                   \n\t"
798
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
799
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
800
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
801
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
802
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
803
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
804
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
805
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
806
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
807
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
808
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
809
      [temp9]"=&r"(temp9)
810
    : [dst]"r"(dst)
811
    : "memory"
812
  );
813
}
814

815
static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
816
  int temp0, temp1;
817
  __asm__ volatile (
818
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
819
    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
820
    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
821
    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
822
    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
823
    "replv.qb     %[temp0],   %[temp0]                   \n\t"
824
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
825
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
826
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
827
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
828
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
829
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
830
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
831
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
832
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
833
    : [dst]"r"(dst)
834
    : "memory"
835
  );
836
}
837

838
static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
839
  int temp0, temp1, temp2, temp3, temp4;
840
  int temp5, temp6, temp7, temp8;
841
  __asm__ volatile (
842
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
843
    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
844
    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
845
    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
846
    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
847
    "addu         %[temp8],   %[temp8],    %[temp1]      \n\t"
848
    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
849
    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
850
    "addu         %[temp0],   %[temp6],    %[temp2]      \n\t"
851
    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
852
    "replv.qb     %[temp0],   %[temp0]                   \n\t"
853
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
854
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
855
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
856
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
857
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
858
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
859
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
860
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
861
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
862
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
863
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
864
    : [dst]"r"(dst)
865
    : "memory"
866
  );
867
}
868

869
// The asm-template load/store helper macros are not needed past this point.
#undef LOAD_8_BYTES
#undef STORE_8_BYTES
#undef LOAD_4_BYTES
872

873
// Asm-template fragment: adds the packed 16-bit delta in %[dst_1] to every
// byte held in %[temp0] (and in %[temp1] as well when SIZE == 8), then packs
// the sums back into bytes with saturation.
// Steps: 'preceu.ph.*' widens the bytes to 16-bit lanes (%[temp2]/%[temp3]
// receive the upper halves), 'addu.ph' adds the delta, 'shll_s.ph' by 7
// followed by 'precrqu_s.qb.ph' narrows back to bytes; that pair is intended
// to clamp each sum to [0, 255].
// The '.if/.endif' lines are assembler directives, so SIZE is evaluated when
// the generated assembly is assembled, not by the C preprocessor.
#define CLIPPING(SIZE)                                                         \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
".endif                                                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
".endif                                                  \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
".endif                                                  \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
".endif                                                  \n\t"
896

897

898
// Writes one row of SIZE bytes (4, 8 or 16) to DST:
//   DST[x] = saturate(TOP[x] + DST[-1] - top_1)   for x = 0 .. SIZE - 1
// 'dst_1' holds DST[-1] duplicated in both 16-bit halves of an int; the asm
// first subtracts 'top_1' from it lane-wise ('subu.ph') and CLIPPING() then
// adds the result to each TOP byte and packs with saturation.
// 'top_1' is not a macro parameter: it must be a variable visible at the
// expansion site, holding the reference byte in the same packed layout.
// SIZE < 8 handles 4 bytes; otherwise 8 bytes are handled, plus another 8
// when SIZE == 16 (selected by assembler '.if' directives).
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
  ".if " #SIZE " < 8                                     \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
    CLIPPING(4)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
  ".else                                                 \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
  ".endif                                                \n\t"                 \
    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
928

929
// Fills a SIZE x SIZE block row by row using CLIP_8B_TO_DST().
// 'top' is the row directly above the block as it is on entry (DST - BPS) and
// stays fixed for all rows; 'top_1' is top[-1] duplicated in both 16-bit
// halves of an int, the layout CLIP_8B_TO_DST() expects.
// Side effect: DST must be a modifiable lvalue; it is advanced by BPS per row
// and ends up SIZE rows below its starting position.
#define CLIP_TO_DST(DST, SIZE) do {                                            \
  int y;                                                                       \
  const uint8_t* top = (DST) - BPS;                                            \
  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
938

939
// Defines 'static void TrueMotion<SIZE>(uint8_t* DST)', whose body is
// CLIP_TO_DST(DST, SIZE): a SIZE x SIZE block is filled with
// saturate(top[x] + left - top_left), one row at a time.
#define TRUE_MOTION(DST, SIZE)                                                 \
static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
  CLIP_TO_DST((DST), (SIZE));                                                  \
}

// Instantiates TrueMotion4, TrueMotion8 and TrueMotion16.
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)

// The generator macros are not needed past this point.
#undef TRUE_MOTION
#undef CLIP_TO_DST
#undef CLIP_8B_TO_DST
#undef CLIPPING
952

953
//------------------------------------------------------------------------------
954
// Entry point
955

956
extern void VP8DspInitMIPSdspR2(void);
957

958
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
959
  VP8TransformDC = TransformDC;
960
  VP8TransformAC3 = TransformAC3;
961
  VP8Transform = TransformTwo;
962

963
  VP8VFilter16 = VFilter16;
964
  VP8HFilter16 = HFilter16;
965
  VP8VFilter8 = VFilter8;
966
  VP8HFilter8 = HFilter8;
967
  VP8VFilter16i = VFilter16i;
968
  VP8HFilter16i = HFilter16i;
969
  VP8VFilter8i = VFilter8i;
970
  VP8HFilter8i = HFilter8i;
971
  VP8SimpleVFilter16 = SimpleVFilter16;
972
  VP8SimpleHFilter16 = SimpleHFilter16;
973
  VP8SimpleVFilter16i = SimpleVFilter16i;
974
  VP8SimpleHFilter16i = SimpleHFilter16i;
975

976
  VP8PredLuma4[0] = DC4;
977
  VP8PredLuma4[1] = TrueMotion4;
978
  VP8PredLuma4[2] = VE4;
979
  VP8PredLuma4[4] = RD4;
980
  VP8PredLuma4[6] = LD4;
981

982
  VP8PredChroma8[0] = DC8uv;
983
  VP8PredChroma8[1] = TrueMotion8;
984
  VP8PredChroma8[4] = DC8uvNoTop;
985
  VP8PredChroma8[5] = DC8uvNoLeft;
986

987
  VP8PredLuma16[1] = TrueMotion16;
988
}
989

990
#else  // !WEBP_USE_MIPS_DSP_R2
991

992
WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
993

994
#endif  // WEBP_USE_MIPS_DSP_R2
995

996
Product

Resources

Company