// Source: GitHub repository godotengine/godot,
// path blob/master/thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
1
// Copyright 2014 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// MIPS version of speed-critical encoding functions.
11
//
12
// Author(s): Darko Laus ([email protected])
13
//            Mirko Raus ([email protected])
14

15
#include "src/dsp/dsp.h"
16

17
#if defined(WEBP_USE_MIPS_DSP_R2)
18

19
#include "src/dsp/mips_macro.h"
20
#include "src/enc/cost_enc.h"
21
#include "src/enc/vp8i_enc.h"
22

23
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
24
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
25

26
// Emits four addq.ph/subq.ph instruction pairs, one pair per input couple
// (I0,I1), (I2,I3), (I4,I5), (I6,I7): the even-numbered output receives the
// sum and the following odd-numbered output receives first-minus-second.
// O - output
// I - input (macro doesn't change it)
#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
                          I0, I1, I2, I3, I4, I5, I6, I7)                      \
  "addq.ph          %[" #O0 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
  "subq.ph          %[" #O1 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
  "addq.ph          %[" #O2 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
  "subq.ph          %[" #O3 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
  "addq.ph          %[" #O4 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
  "subq.ph          %[" #O5 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
  "addq.ph          %[" #O6 "],   %[" #I6 "],  %[" #I7 "]     \n\t"            \
  "subq.ph          %[" #O7 "],   %[" #I6 "],  %[" #I7 "]     \n\t"
38

39
// Emits eight absq_s.ph instructions, each one overwriting its operand with
// the result (source and destination register are the same).
// IO - input/output
#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
  "absq_s.ph        %[" #IO0 "],   %[" #IO0 "]                \n\t"            \
  "absq_s.ph        %[" #IO1 "],   %[" #IO1 "]                \n\t"            \
  "absq_s.ph        %[" #IO2 "],   %[" #IO2 "]                \n\t"            \
  "absq_s.ph        %[" #IO3 "],   %[" #IO3 "]                \n\t"            \
  "absq_s.ph        %[" #IO4 "],   %[" #IO4 "]                \n\t"            \
  "absq_s.ph        %[" #IO5 "],   %[" #IO5 "]                \n\t"            \
  "absq_s.ph        %[" #IO6 "],   %[" #IO6 "]                \n\t"            \
  "absq_s.ph        %[" #IO7 "],   %[" #IO7 "]                \n\t"
49

50
// Clears accumulator $ac0 (mult of $zero by $zero), accumulates eight
// dpa.w.ph / dpax.w.ph dot products of the inputs into it, and finally copies
// the low word of $ac0 into O0 (mflo).
// dpa.w.ph $ac0 temp0 ,temp1
//  $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
// dpax.w.ph $ac0 temp0 ,temp1
//  $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
// O - output
// I - input (macro doesn't change it)
#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
                 I8, I9, I10, I11, I12, I13, I14, I15)                         \
    "mult            $ac0,      $zero,     $zero              \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I2 "],  %[" #I0 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I5 "],  %[" #I6 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I8 "],  %[" #I9 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I11 "], %[" #I4 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I12 "], %[" #I7 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I13 "], %[" #I1 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I14 "], %[" #I3 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I15 "], %[" #I10 "]      \n\t"            \
    "mflo            %[" #O0 "],  $ac0                        \n\t"
68

69
// Output-operand list for inline asm: whatever OUTPUT_EARLY_CLOBBER_REGS_10()
// expands to (defined outside this file section), followed by early-clobber
// ("=&r") register outputs for the local variables temp11..temp17.
#define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
  [temp17]"=&r"(temp17)
74

75
// macro for one horizontal pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A - row index: one 32-bit word is loaded at byte offset BPS * A from each of
//     the two pointers stored at 0(args) and 4(args)
// TEMP0..TEMP3 - registers for corresponding tmp elements
// The bytes loaded through the second pointer are subtracted from the bytes
// loaded through the first one (after widening to halfwords).
// temp16..temp19 are used as scratch registers; the c2217 and c5352 operands
// supply the multipliers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
  "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
  "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
  "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
  "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
  "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
  "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
  "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
  "preceu.ph.qbr   %[" #TEMP3 "],   %[" #TEMP3 "]                       \n\t"  \
  "subq.ph         %[" #TEMP0 "],   %[" #TEMP0 "],   %[" #TEMP1 "]      \n\t"  \
  "subq.ph         %[" #TEMP2 "],   %[" #TEMP2 "],   %[" #TEMP3 "]      \n\t"  \
  "rotr            %[" #TEMP0 "],   %[" #TEMP0 "],   16                 \n\t"  \
  "addq.ph         %[" #TEMP1 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
  "subq.ph         %[" #TEMP3 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
  "seh             %[" #TEMP0 "],   %[" #TEMP1 "]                       \n\t"  \
  "sra             %[temp16],     %[" #TEMP1 "],   16                   \n\t"  \
  "seh             %[temp19],     %[" #TEMP3 "]                         \n\t"  \
  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   16                 \n\t"  \
  "subu            %[" #TEMP2 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
  "addu            %[" #TEMP0 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
  "mul             %[temp17],     %[temp19],     %[c2217]               \n\t"  \
  "mul             %[temp18],     %[" #TEMP3 "],   %[c5352]             \n\t"  \
  "mul             %[" #TEMP1 "],   %[temp19],     %[c5352]             \n\t"  \
  "mul             %[temp16],     %[" #TEMP3 "],   %[c2217]             \n\t"  \
  "sll             %[" #TEMP2 "],   %[" #TEMP2 "],   3                  \n\t"  \
  "sll             %[" #TEMP0 "],   %[" #TEMP0 "],   3                  \n\t"  \
  "subu            %[" #TEMP3 "],   %[temp17],     %[temp18]            \n\t"  \
  "addu            %[" #TEMP1 "],   %[temp16],     %[" #TEMP1 "]        \n\t"  \
  "addiu           %[" #TEMP3 "],   %[" #TEMP3 "],   937                \n\t"  \
  "addiu           %[" #TEMP1 "],   %[" #TEMP1 "],   1812               \n\t"  \
  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   9                  \n\t"  \
  "sra             %[" #TEMP1 "],   %[" #TEMP1 "],   9                  \n\t"
111

112
// macro for one vertical pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer (its address is expected in
//        temp20); TEMP0 is stored at A, TEMP12 at B, TEMP4 at C, TEMP8 at D
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
// temp16..temp19 are used as scratch registers. The two addiu on TEMP8
// (30000 and 21000) add 51000 in total before the final shift by 16.
// The closing addiu/movn pair adds 1 to TEMP12 when temp19 is non-zero.
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)                 \
  "addu            %[temp16],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
  "subu            %[temp19],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
  "addu            %[temp17],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
  "subu            %[temp18],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
  "mul             %[" #TEMP8 "],   %[temp19],     %[c2217]         \n\t"      \
  "mul             %[" #TEMP12 "],  %[temp18],     %[c2217]         \n\t"      \
  "mul             %[" #TEMP4 "],   %[temp19],     %[c5352]         \n\t"      \
  "mul             %[temp18],     %[temp18],     %[c5352]           \n\t"      \
  "addiu           %[temp16],     %[temp16],     7                  \n\t"      \
  "addu            %[" #TEMP0 "],   %[temp16],     %[temp17]        \n\t"      \
  "sra             %[" #TEMP0 "],   %[" #TEMP0 "],   4              \n\t"      \
  "addu            %[" #TEMP12 "],  %[" #TEMP12 "],  %[" #TEMP4 "]  \n\t"      \
  "subu            %[" #TEMP4 "],   %[temp16],     %[temp17]        \n\t"      \
  "sra             %[" #TEMP4 "],   %[" #TEMP4 "],   4              \n\t"      \
  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   30000          \n\t"      \
  "addiu           %[" #TEMP12 "],  %[" #TEMP12 "],  12000          \n\t"      \
  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   21000          \n\t"      \
  "subu            %[" #TEMP8 "],   %[" #TEMP8 "],   %[temp18]      \n\t"      \
  "sra             %[" #TEMP12 "],  %[" #TEMP12 "],  16             \n\t"      \
  "sra             %[" #TEMP8 "],   %[" #TEMP8 "],   16             \n\t"      \
  "addiu           %[temp16],     %[" #TEMP12 "],  1                \n\t"      \
  "movn            %[" #TEMP12 "],  %[temp16],     %[temp19]        \n\t"      \
  "sh              %[" #TEMP0 "],   " #A "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP4 "],   " #C "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
143

144
// Forward transform of one block, implemented with MIPS DSP R2 inline asm.
//   src, ref - input sample buffers; rows 0..3 are read, BPS bytes apart
//   out      - receives 16 halfwords (byte offsets 0..30)
// Four HORIZONTAL_PASS expansions process rows 0..3 of src/ref into
// temp0..temp15; then the 'out' pointer is loaded into temp20 and four
// VERTICAL_PASS expansions compute and store the results.
// The three pointers are passed to the asm through the 'args' array, so a
// single register operand gives access to all of them.
static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
                                 const uint8_t* WEBP_RESTRICT ref,
                                 int16_t* WEBP_RESTRICT out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
  const int* const args[3] =
      { (const int*)src, (const int*)ref, (const int*)out };

  __asm__ volatile (
    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
    "lw            %[temp20],     8(%[args])                  \n\t"
    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
    OUTPUT_EARLY_CLOBBER_REGS_18(),
      [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
    : "memory", "hi", "lo"
  );
}

#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
174

175
// Inverse transform of a single block, implemented with MIPS DSP R2 inline
// asm.
//   ref - input samples, read with LOAD_WITH_OFFSET_X4 using BPS as stride
//   in  - coefficients, read at byte offsets 0..30
//   dst - output, written by STORE_SAT_SUM_X2 using BPS as stride
// The arithmetic is delegated to helper macros (LOAD_IN_X2, ADD_SUB_HALVES,
// MUL_SHIFT_SUM, INSERT_HALF_X2, SRA_16, SHIFT_R_SUM_X2, ...) that are defined
// outside this file section; kC1 and kC2 are supplied to them as operands.
static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    "ulw              %[temp1],   0(%[in])                 \n\t"
    "ulw              %[temp2],   16(%[in])                \n\t"
    LOAD_IN_X2(temp5, temp6, 24, 26)
    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    LOAD_IN_X2(temp1, temp2, 8, 10)
    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
                  temp13, temp11, temp14, temp12)
    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    "ulw              %[temp17],  4(%[in])                 \n\t"
    "ulw              %[temp18],  20(%[in])                \n\t"
    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    LOAD_IN_X2(temp17, temp18, 12, 14)
    LOAD_IN_X2(temp9, temp10, 28, 30)
    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
                  temp15, temp4, temp16, temp17)
    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)

    // horizontal
    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    "repl.ph          %[temp2],   0x4                      \n\t"
    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
                  temp6, temp17, temp8, temp18)
    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
                  temp18, temp12, temp17, temp16)
    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
                   temp6)
    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                          temp16, temp11, temp10, temp15, temp14)
    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                            temp11, temp10, temp11, temp14, temp15)
    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18()
    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
    : "memory", "hi", "lo"
  );
}
243

244
// Inverse-transforms one block, or two adjacent blocks when 'do_two' is
// non-zero. The second block uses ref + 4, in + 16 and dst + 4.
static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
                                 const int16_t* WEBP_RESTRICT in,
                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}
252

253
// Weighted distortion between two 4x4 blocks 'a' and 'b' (rows BPS apart),
// using the weights at 'w'.
// The same instruction sequence is run twice: first on 'a', ending with a
// MUL_HALF accumulation into temp17, then on 'b', ending with a MUL_HALF
// accumulation into temp3. Returns abs(temp3 - temp17) >> 5.
static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
                              const uint8_t* WEBP_RESTRICT const b,
                              const uint16_t* WEBP_RESTRICT const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;

  __asm__ volatile (
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
                            temp12, temp1, temp2, temp3, temp4)
    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
                          temp7, temp2, temp4, temp6, temp8)
    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
                        0, 4, 8, 12,
                        1, 1, 1, 1,
                        16)
    MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
                            temp12, temp1, temp2, temp3, temp4)
    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
                          temp7, temp2, temp4, temp6, temp8)
    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
                        0, 4, 8, 12,
                        1, 1, 1, 1,
                        16)
    MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
    OUTPUT_EARLY_CLOBBER_REGS_17()
    : [a]"r"(a), [b]"r"(b), [w]"r"(w)
    : "memory", "hi", "lo"
  );
  return abs(temp3 - temp17) >> 5;
}
320

321
// Distortion of a 16x16 area: the sum of Disto4x4_MIPSdspR2() over its sixteen
// 4x4 sub-blocks. 'y' steps through row offsets in units of 4 * BPS bytes and
// 'x' through column offsets 0, 4, 8, 12; the same weights 'w' are used for
// every sub-block.
static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
                                const uint8_t* WEBP_RESTRICT const b,
                                const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
    }
  }
  return D;
}
333

334
//------------------------------------------------------------------------------
335
// Intra predictions
336

337
// Stores the word held in 'value' into row J of 'dst' (byte offset J * BPS):
// two words (bytes 0..7) always, plus two more (bytes 8..15) when SIZE is 16.
// The SIZE test is done by the assembler through its .if/.endif directives.
#define FILL_PART(J, SIZE)                                            \
    "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
  ".if " #SIZE " == 16                                     \n\t"      \
    "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
  ".endif                                                  \n\t"
344

345
// Fills a SIZE x SIZE area at DST (rows BPS bytes apart) with VALUE.
// VALUE is copied into a local int, spread across the register with replv.qb,
// then written by FILL_PART for rows 0..7, and for rows 8..15 as well when
// SIZE is 16 (selected by the assembler's .if directive).
#define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
  int value = (VALUE);                                              \
  __asm__ volatile (                                                \
    "replv.qb   %[value],  %[value]                      \n\t"      \
    FILL_PART( 0, SIZE)                                             \
    FILL_PART( 1, SIZE)                                             \
    FILL_PART( 2, SIZE)                                             \
    FILL_PART( 3, SIZE)                                             \
    FILL_PART( 4, SIZE)                                             \
    FILL_PART( 5, SIZE)                                             \
    FILL_PART( 6, SIZE)                                             \
    FILL_PART( 7, SIZE)                                             \
  ".if " #SIZE " == 16                                   \n\t"      \
    FILL_PART( 8, 16)                                               \
    FILL_PART( 9, 16)                                               \
    FILL_PART(10, 16)                                               \
    FILL_PART(11, 16)                                               \
    FILL_PART(12, 16)                                               \
    FILL_PART(13, 16)                                               \
    FILL_PART(14, 16)                                               \
    FILL_PART(15, 16)                                               \
  ".endif                                                \n\t"      \
    : [value]"+&r"(value)                                           \
    : [dst]"r"((DST))                                               \
    : "memory"                                                      \
  );                                                                \
} while (0)
372

373
#define VERTICAL_PRED(DST, TOP, SIZE)                                          \
374
static WEBP_INLINE void VerticalPred##SIZE(                                    \
375
    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) {        \
376
  int j;                                                                       \
377
  if ((TOP)) {                                                                 \
378
    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
379
  } else {                                                                     \
380
    FILL_8_OR_16((DST), 127, (SIZE));                                          \
381
  }                                                                            \
382
}
383

384
VERTICAL_PRED(dst, top, 8)
385
VERTICAL_PRED(dst, top, 16)
386

387
#undef VERTICAL_PRED
388

389
#define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
390
static WEBP_INLINE void HorizontalPred##SIZE(                                  \
391
    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) {       \
392
  if (LEFT) {                                                                  \
393
    int j;                                                                     \
394
    for (j = 0; j < (SIZE); ++j) {                                             \
395
      memset((DST) + j * BPS, (LEFT)[j], (SIZE));                              \
396
    }                                                                          \
397
  } else {                                                                     \
398
    FILL_8_OR_16((DST), 129, (SIZE));                                          \
399
  }                                                                            \
400
}
401

402
HORIZONTAL_PRED(dst, left, 8)
403
HORIZONTAL_PRED(dst, left, 16)
404

405
#undef HORIZONTAL_PRED
406

407
// Works in place on the two words in temp0 and temp1, with temp2 and temp3 as
// scratch: the bytes are widened to halfwords (preceu.ph.qbl / qbr), leftY_1
// is added to every halfword pair, each is shifted left by 7 with shll_s.ph,
// and the results are packed back to bytes with precrqu_s.qb.ph.
// NOTE(review): the saturating shift followed by the saturating pack is
// presumably what clamps each result to the 0..255 range - confirm against
// the MIPS DSP instruction reference.
#define CLIPPING()                                                             \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[leftY_1]    \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"
422

423
// Produces one output row at DST: 8 bytes, or 16 when SIZE is 16 (selected by
// the assembler's .if directive).
// Relies on two names from the expansion site: the row index 'y' and
// 'left_1'. leftY_1 starts as LEFT[y] placed in both halfwords, and left_1 is
// subtracted from it (subu.ph) before CLIPPING() combines it with the bytes
// loaded from TOP.
#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do {                              \
  int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y];                            \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
    "replv.ph        %[leftY_1], %[leftY_1]              \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[leftY_1], %[leftY_1], %[left_1]   \n\t"                 \
    CLIPPING()                                                                 \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING()                                                                 \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
    : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST))                    \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
447

448
// Runs CLIP_8B_TO_DST for rows y = 0 .. SIZE-1, advancing DST by BPS after
// each row (DST is modified by this macro).
// left_1 holds LEFT[-1] placed in both halfwords of an int; it and 'y' are
// the names CLIP_8B_TO_DST picks up from this scope.
#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do {                                 \
  int y;                                                                       \
  const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1];                     \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE));                              \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
456

457
#define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
458
static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST),         \
459
                                         const uint8_t* WEBP_RESTRICT (LEFT),  \
460
                                         const uint8_t* WEBP_RESTRICT (TOP)) { \
461
  if ((LEFT) != NULL) {                                                        \
462
    if ((TOP) != NULL) {                                                       \
463
      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
464
    } else {                                                                   \
465
      HorizontalPred##SIZE((DST), (LEFT));                                     \
466
    }                                                                          \
467
  } else {                                                                     \
468
    /* true motion without left samples (hence: with default 129 value)    */  \
469
    /* is equivalent to VE prediction where you just copy the top samples. */  \
470
    /* Note that if top samples are not available, the default value is    */  \
471
    /* then 129, and not 127 as in the VerticalPred case.                  */  \
472
    if ((TOP) != NULL) {                                                       \
473
      VerticalPred##SIZE((DST), (TOP));                                        \
474
    } else {                                                                   \
475
      FILL_8_OR_16((DST), 129, (SIZE));                                        \
476
    }                                                                          \
477
  }                                                                            \
478
}
479

480
TRUE_MOTION(dst, left, top, 8)
481
TRUE_MOTION(dst, left, top, 16)
482

483
#undef TRUE_MOTION
484
#undef CLIP_TO_DST
485
#undef CLIP_8B_TO_DST
486
#undef CLIPPING
487

488
// DC prediction for a 16x16 area: computes one value DC in inline asm and
// fills 'dst' with it through FILL_8_OR_16(dst, DC, 16).
//   top and left non-NULL : DC = sum(top) + sum(left), then shra_r.w by 5
//   only one of them      : DC = 2 * sum of the available one, then shra_r.w
//                           by 5
//   both NULL             : DC = 0x80
// Each sum is taken with raddu.w.qb over the four words that
// LOAD_WITH_OFFSET_X4 loads with offsets 0, 4, 8, 12.
// Local labels: 1 = merge the two sums, 2 = 'top' is NULL, 3 = final shift,
// 4 = neither available, 5 = exit.
static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT left,
                                 const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

  __asm__ volatile(
    "beqz        %[top],   2f                  \n\t"
    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
    "move        %[DC1],   %[DC]               \n\t"
    "beqz        %[left],  1f                  \n\t"
    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    "addu        %[DC1],   %[temp0], %[temp2]  \n\t"
  "1:                                          \n\t"
    "addu        %[DC],   %[DC],     %[DC1]    \n\t"
    "j           3f                            \n\t"
  "2:                                          \n\t"
    "beqz        %[left],  4f                  \n\t"
    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
    "addu        %[DC],    %[DC],    %[DC]     \n\t"
  "3:                                          \n\t"
    "shra_r.w    %[DC],    %[DC],    5         \n\t"
    "j           5f                            \n\t"
  "4:                                          \n\t"
    "li          %[DC],    0x80                \n\t"
  "5:                                          \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    : [left]"r"(left), [top]"r"(top)
    : "memory"
  );

  FILL_8_OR_16(dst, DC, 16);
}
551

552
// Computes the DC (average) predictor of an 8x8 block and fills 'dst' with it
// through FILL_8_OR_16(dst, DC, 8).
//   top != NULL, left != NULL: DC = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4
//   only one edge available  : that edge's 8-sample sum is counted twice
//   top == NULL, left == NULL: DC = 0x80
// 'top' and 'left' are read with unaligned word loads (ulw), 8 bytes each.
static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
                                const uint8_t* WEBP_RESTRICT left,
                                const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

  __asm__ volatile(
    "beqz        %[top],   2f                  \n\t"
    // top is available: raddu.w.qb sums the four bytes of each loaded word.
    "ulw         %[temp0], 0(%[top])           \n\t"
    "ulw         %[temp1], 4(%[top])           \n\t"
    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    "addu        %[DC],    %[temp0], %[temp1]  \n\t"
    // Default DC1 to the top sum so that a missing left edge doubles it.
    "move        %[DC1],   %[DC]               \n\t"
    "beqz        %[left],  1f                  \n\t"
    "ulw         %[temp2], 0(%[left])          \n\t"
    "ulw         %[temp3], 4(%[left])          \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[DC1],   %[temp2], %[temp3]  \n\t"
  "1:                                          \n\t"
    "addu        %[DC],    %[DC],    %[DC1]    \n\t"
    "j           3f                            \n\t"
  "2:                                          \n\t"
    // top is missing: use the left edge alone (doubled), or 0x80 if absent.
    "beqz        %[left],  4f                  \n\t"
    "ulw         %[temp2], 0(%[left])          \n\t"
    "ulw         %[temp3], 4(%[left])          \n\t"
    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    "addu        %[DC],    %[temp2], %[temp3]  \n\t"
    "addu        %[DC],    %[DC],    %[DC]     \n\t"
  "3:                                          \n\t"
    // Rounding shift: (DC + 8) >> 4.
    "shra_r.w    %[DC], %[DC], 4               \n\t"
    "j           5f                            \n\t"
  "4:                                          \n\t"
    "li          %[DC], 0x80                   \n\t"
  "5:                                          \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    : [left]"r"(left), [top]"r"(top)
    : "memory"
  );

  FILL_8_OR_16(dst, DC, 8);
}
597

598
// 4x4 DC predictor. Sums the four bytes at top[0..3] and the four bytes at
// top[-5..-2], computes (sum + 4) >> 3, replicates that byte across a 32-bit
// word and stores the word to rows 0..3 of 'dst' (row stride BPS).
static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1;
  __asm__ volatile(
    "ulw          %[temp0],   0(%[top])               \n\t"
    "ulw          %[temp1],   -5(%[top])              \n\t"
    // raddu.w.qb: horizontal sum of the four unsigned bytes of a word.
    "raddu.w.qb   %[temp0],   %[temp0]                \n\t"
    "raddu.w.qb   %[temp1],   %[temp1]                \n\t"
    "addu         %[temp0],   %[temp0],    %[temp1]   \n\t"
    "addiu        %[temp0],   %[temp0],    4          \n\t"
    "srl          %[temp0],   %[temp0],    3          \n\t"
    // Broadcast the low byte to all four byte lanes.
    "replv.qb     %[temp0],   %[temp0]                \n\t"
    "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
618

619
// 4x4 predictor combining the top row, the four bytes at top[-5..-2] and the
// byte at top[-1]. Reads top[0..3] bytewise, the word at top[-5] and the byte
// at top[-1]; the latter is subtracted from each byte of the word, the top
// samples are added as packed halfword pairs, and the sums are narrowed back
// to bytes with saturation before being stored to rows 0..3 of 'dst'.
// NOTE(review): presumably the VP8 "TrueMotion" mode
// (top[x] + left[y] - top_left, clamped) -- confirm against the C reference.
static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
  const int c35 = 0xff00ff;  // keeps the low byte of each halfword
  __asm__ volatile (
    "lbu              %[temp1],  0(%[top])                     \n\t"
    "lbu              %[a10],    1(%[top])                     \n\t"
    "lbu              %[temp2],  2(%[top])                     \n\t"
    "lbu              %[a32],    3(%[top])                     \n\t"
    "ulw              %[temp0],  -5(%[top])                    \n\t"
    "lbu              %[temp4],  -1(%[top])                    \n\t"
    // a10 = {top[1], top[0]} and a32 = {top[3], top[2]} as halfword pairs.
    "append           %[a10],    %[temp1],   16                \n\t"
    "append           %[a32],    %[temp2],   16                \n\t"
    "replv.ph         %[temp4],  %[temp4]                      \n\t"
    // Split the four bytes of the word into two halfword pairs (odd / even
    // byte lanes), then subtract top[-1] from each.
    "shrl.ph          %[temp1],  %[temp0],   8                 \n\t"
    "and              %[temp0],  %[temp0],   %[c35]            \n\t"
    "subu.ph          %[temp1],  %[temp1],   %[temp4]          \n\t"
    "subu.ph          %[temp0],  %[temp0],   %[temp4]          \n\t"
    "srl              %[temp2],  %[temp1],   16                \n\t"
    "srl              %[temp3],  %[temp0],   16                \n\t"
    // Broadcast each per-row delta to both halfwords.
    "replv.ph         %[temp2],  %[temp2]                      \n\t"
    "replv.ph         %[temp3],  %[temp3]                      \n\t"
    "replv.ph         %[temp4],  %[temp1]                      \n\t"
    "replv.ph         %[temp5],  %[temp0]                      \n\t"
    "addu.ph          %[temp0],  %[temp3],   %[a10]            \n\t"
    "addu.ph          %[temp1],  %[temp3],   %[a32]            \n\t"
    "addu.ph          %[temp3],  %[temp2],   %[a10]            \n\t"
    "addu.ph          %[temp2],  %[temp2],   %[a32]            \n\t"
    // Saturating shift + saturating pack narrows the halfword sums to bytes.
    "shll_s.ph        %[temp0],  %[temp0],   7                 \n\t"
    "shll_s.ph        %[temp1],  %[temp1],   7                 \n\t"
    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    "precrqu_s.qb.ph  %[temp0],  %[temp1],   %[temp0]          \n\t"
    "precrqu_s.qb.ph  %[temp1],  %[temp2],   %[temp3]          \n\t"
    "addu.ph          %[temp2],  %[temp5],   %[a10]            \n\t"
    "addu.ph          %[temp3],  %[temp5],   %[a32]            \n\t"
    "addu.ph          %[temp5],  %[temp4],   %[a10]            \n\t"
    "addu.ph          %[temp4],  %[temp4],   %[a32]            \n\t"
    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    "shll_s.ph        %[temp4],  %[temp4],   7                 \n\t"
    "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
    "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
    "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
    "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
    "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
    "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
    "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [a10]"=&r"(a10), [a32]"=&r"(a32)
    : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
673

674
// 4x4 predictor that writes one filtered copy of the top row to all four rows
// of 'dst'. Reads the six bytes top[-1..4]; each output byte is a 3-tap
// filter (a + 2 * b + c + 2) >> 2 over three consecutive input bytes.
// NOTE(review): the lane bookkeeping below maps output column x to the inputs
// top[x-1], top[x], top[x+1] when the loads are little-endian -- confirm the
// target byte order.
static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw             %[temp0],   -1(%[top])              \n\t"
    "ulh             %[temp1],   3(%[top])               \n\t"
    // Zero-extend the bytes to 16-bit lanes.
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
    // Build the "centre" taps and double them.
    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    // Add both neighbours, then round and shift by 2.
    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
    // Pack the four halfword results back into one word of bytes.
    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
    "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
704

705
// 4x4 predictor in which every row of 'dst' is filled with a single byte.
// Reads the five bytes top[-5..-1] (word at top[-4] plus the byte at top[-5]);
// each row value is a 3-tap filter (a + 2 * b + c + 2) >> 2 over those bytes,
// with the byte at top[-5] replicated to serve as the outermost tap.
static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw             %[temp0],   -4(%[top])              \n\t"
    "lbu             %[temp1],   -5(%[top])              \n\t"
    // Zero-extend the bytes to 16-bit lanes; duplicate top[-5] in both lanes.
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    "replv.ph        %[temp4],   %[temp1]                \n\t"
    // Centre taps, doubled.
    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    "packrl.ph       %[temp6],   %[temp2],    %[temp4]   \n\t"
    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    "addq.ph         %[temp3],   %[temp3],    %[temp5]   \n\t"
    "addq.ph         %[temp3],   %[temp3],    %[temp2]   \n\t"
    "addq.ph         %[temp2],   %[temp2],    %[temp6]   \n\t"
    "addq.ph         %[temp2],   %[temp2],    %[temp4]   \n\t"
    "shra_r.ph       %[temp3],   %[temp3],    2          \n\t"
    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    // Broadcast each of the four results across a full word (one per row).
    "replv.qb        %[temp0],   %[temp3]                \n\t"
    "replv.qb        %[temp1],   %[temp2]                \n\t"
    "srl             %[temp3],   %[temp3],    16         \n\t"
    "srl             %[temp2],   %[temp2],    16         \n\t"
    "replv.qb        %[temp3],   %[temp3]                \n\t"
    "replv.qb        %[temp2],   %[temp2]                \n\t"
    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
740

741
// 4x4 diagonal predictor built from the nine bytes top[-5..3]. Every output
// byte is a 3-tap filter (a + 2 * b + c + 2) >> 2 of consecutive edge bytes.
// Rows 3 and 1 are packed directly from the filtered halfwords; rows 2 and 0
// are derived from them by shifting one additional filtered byte in with
// 'prepend ..., 8'.
// NOTE(review): presumably the VP8 "down-right" mode (per the RD4 name) --
// confirm against the C reference.
static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
    "ulw             %[temp0],    -5(%[top])               \n\t"
    "ulw             %[temp1],    -1(%[top])               \n\t"
    // Zero-extend the eight loaded bytes to halfword lanes.
    "preceu.ph.qbl   %[temp2],    %[temp0]                 \n\t"
    "preceu.ph.qbr   %[temp3],    %[temp0]                 \n\t"
    "preceu.ph.qbr   %[temp4],    %[temp1]                 \n\t"
    "preceu.ph.qbl   %[temp5],    %[temp1]                 \n\t"
    // Centre taps (temp6..temp8), doubled by the shll.ph below.
    "packrl.ph       %[temp6],    %[temp2],    %[temp3]    \n\t"
    "packrl.ph       %[temp7],    %[temp4],    %[temp2]    \n\t"
    "packrl.ph       %[temp8],    %[temp5],    %[temp4]    \n\t"
    "shll.ph         %[temp6],    %[temp6],    1           \n\t"
    "addq.ph         %[temp9],    %[temp2],    %[temp6]    \n\t"
    "shll.ph         %[temp7],    %[temp7],    1           \n\t"
    "addq.ph         %[temp9],    %[temp9],    %[temp3]    \n\t"
    "shll.ph         %[temp8],    %[temp8],    1           \n\t"
    "shra_r.ph       %[temp9],    %[temp9],    2           \n\t"
    "addq.ph         %[temp10],   %[temp4],    %[temp7]    \n\t"
    "addq.ph         %[temp11],   %[temp5],    %[temp8]    \n\t"
    "addq.ph         %[temp10],   %[temp10],   %[temp2]    \n\t"
    "addq.ph         %[temp11],   %[temp11],   %[temp4]    \n\t"
    "shra_r.ph       %[temp10],   %[temp10],   2           \n\t"
    "shra_r.ph       %[temp11],   %[temp11],   2           \n\t"
    // Scalar tap: (top[1] + 2 * top[2] + top[3] + 2) >> 2.
    "lbu             %[temp0],    3(%[top])                \n\t"
    "lbu             %[temp1],    2(%[top])                \n\t"
    "lbu             %[temp2],    1(%[top])                \n\t"
    "sll             %[temp1],    %[temp1],    1           \n\t"
    "addu            %[temp0],    %[temp0],    %[temp1]    \n\t"
    "addu            %[temp0],    %[temp0],    %[temp2]    \n\t"
    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
    "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
    "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
    "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
    // Slide each stored row by one byte to obtain the neighbouring row.
    "prepend         %[temp9],    %[temp11],   8           \n\t"
    "prepend         %[temp10],   %[temp0],    8           \n\t"
    "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
    "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
789

790
// 4x4 predictor built from the eight bytes top[-4..3]. Mixes two kinds of
// filtered samples: 2-tap rounded averages (addqh_r.ph, (a + b + 1) >> 1) and
// 3-tap values (a + 2 * b + c + 2) >> 2 (shll.ph / addu.ph / shra_r.ph).
// Rows 0 and 1 are stored first; rows 2 and 3 reuse those words with one more
// filtered byte shifted in through 'append ..., 8'.
// NOTE(review): presumably the VP8 "vertical-right" mode (per the VR4 name)
// -- confirm against the C reference.
static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    "ulw              %[temp0],   -4(%[top])              \n\t"
    "ulw              %[temp1],   0(%[top])               \n\t"
    // Zero-extend bytes to halfword lanes (the 'a' forms pick alternate
    // bytes instead of adjacent ones).
    "preceu.ph.qbl    %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbr    %[temp0],   %[temp0]                \n\t"
    "preceu.ph.qbla   %[temp3],   %[temp1]                \n\t"
    "preceu.ph.qbra   %[temp1],   %[temp1]                \n\t"
    "packrl.ph        %[temp7],   %[temp3],    %[temp2]   \n\t"
    // 2-tap rounded averages (temp4, temp5).
    "addqh_r.ph       %[temp4],   %[temp1],    %[temp3]   \n\t"
    "move             %[temp6],   %[temp1]                \n\t"
    "append           %[temp1],   %[temp2],    16         \n\t"
    "shll.ph          %[temp9],   %[temp6],    1          \n\t"
    "addqh_r.ph       %[temp5],   %[temp7],    %[temp6]   \n\t"
    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
    // 3-tap sums (temp3, temp1, temp6), rounded and shifted by 2 below.
    "addu.ph          %[temp3],   %[temp7],    %[temp3]   \n\t"
    "addu.ph          %[temp1],   %[temp1],    %[temp6]   \n\t"
    "packrl.ph        %[temp7],   %[temp2],    %[temp0]   \n\t"
    "addu.ph          %[temp6],   %[temp0],    %[temp2]   \n\t"
    "addu.ph          %[temp3],   %[temp3],    %[temp9]   \n\t"
    "addu.ph          %[temp1],   %[temp1],    %[temp8]   \n\t"
    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    "shra_r.ph        %[temp3],   %[temp3],    2          \n\t"
    "shra_r.ph        %[temp1],   %[temp1],    2          \n\t"
    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Interleave the halfword results and narrow them to bytes.
    "precrq.ph.w      %[temp8],   %[temp4],    %[temp5]   \n\t"
    "append           %[temp4],   %[temp5],    16         \n\t"
    "precrq.ph.w      %[temp2],   %[temp3],    %[temp1]   \n\t"
    "append           %[temp3],   %[temp1],    16         \n\t"
    "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
    "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 3 and 2: previous rows shifted by one byte with a new byte added.
    "append           %[temp3],   %[temp6],    8          \n\t"
    "srl              %[temp6],   %[temp6],    16         \n\t"
    "append           %[temp8],   %[temp6],    8          \n\t"
    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
839

840
// 4x4 diagonal predictor built from the eight bytes top[0..7]. Every output
// byte is a 3-tap filter (a + 2 * b + c + 2) >> 2 of consecutive top bytes.
// Rows 0 and 2 are packed directly from the filtered halfwords; rows 1 and 3
// are derived from them by shifting one additional filtered byte in with
// 'prepend ..., 8'.
// NOTE(review): presumably the VP8 "down-left" mode (per the LD4 name) --
// confirm against the C reference.
static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
    "ulw             %[temp0],    0(%[top])               \n\t"
    "ulw             %[temp1],    4(%[top])               \n\t"
    // Zero-extend the eight loaded bytes to halfword lanes.
    "preceu.ph.qbl   %[temp2],    %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp3],    %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp4],    %[temp1]                \n\t"
    "preceu.ph.qbl   %[temp5],    %[temp1]                \n\t"
    // Centre taps (temp6..temp8), doubled by the shll.ph below.
    "packrl.ph       %[temp6],    %[temp2],    %[temp3]   \n\t"
    "packrl.ph       %[temp7],    %[temp4],    %[temp2]   \n\t"
    "packrl.ph       %[temp8],    %[temp5],    %[temp4]   \n\t"
    "shll.ph         %[temp6],    %[temp6],    1          \n\t"
    "addq.ph         %[temp9],    %[temp2],    %[temp6]   \n\t"
    "shll.ph         %[temp7],    %[temp7],    1          \n\t"
    "addq.ph         %[temp9],    %[temp9],    %[temp3]   \n\t"
    "shll.ph         %[temp8],    %[temp8],    1          \n\t"
    "shra_r.ph       %[temp9],    %[temp9],    2          \n\t"
    "addq.ph         %[temp10],   %[temp4],    %[temp7]   \n\t"
    "addq.ph         %[temp11],   %[temp5],    %[temp8]   \n\t"
    "addq.ph         %[temp10],   %[temp10],   %[temp2]   \n\t"
    "addq.ph         %[temp11],   %[temp11],   %[temp4]   \n\t"
    "shra_r.ph       %[temp10],   %[temp10],   2          \n\t"
    "shra_r.ph       %[temp11],   %[temp11],   2          \n\t"
    // Scalar tap for the last pixel: twice the top byte of the second word
    // plus the sum of the two bytes held in temp5, rounded and shifted by 2.
    "srl             %[temp1],    %[temp1],    24         \n\t"
    "sll             %[temp1],    %[temp1],    1          \n\t"
    "raddu.w.qb      %[temp5],    %[temp5]                \n\t"
    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]   \n\t"
    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
    "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
    "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
    "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
    // Slide each stored row by one byte to obtain the following row.
    "prepend         %[temp9],    %[temp11],   8          \n\t"
    "prepend         %[temp10],   %[temp1],    8          \n\t"
    "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
886

887
// 4x4 predictor built from the eight bytes top[0..7]. Mixes 2-tap rounded
// averages (addqh_r.ph, (a + b + 1) >> 1) with 3-tap values
// (a + 2 * b + c + 2) >> 2 (shll.ph / addu.ph / shra_r.ph).
// Rows 0 and 1 are stored first; rows 2 and 3 reuse those words with one more
// filtered byte shifted in through 'prepend ..., 8'.
// NOTE(review): presumably the VP8 "vertical-left" mode (per the VL4 name)
// -- confirm against the C reference.
static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    "ulw              %[temp0],   0(%[top])               \n\t"
    "ulw              %[temp1],   4(%[top])               \n\t"
    // Zero-extend bytes to halfword lanes (the 'a' forms pick alternate
    // bytes instead of adjacent ones).
    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
    // 2-tap rounded averages (temp4, temp5).
    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
    // 3-tap sums (temp2, temp0, temp6), rounded and shifted by 2 below.
    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Interleave the halfword results and narrow them to bytes.
    "precrq.ph.w      %[temp8],   %[temp5],    %[temp4]   \n\t"
    "append           %[temp5],   %[temp4],    16         \n\t"
    "precrq.ph.w      %[temp3],   %[temp2],    %[temp0]   \n\t"
    "append           %[temp2],   %[temp0],    16         \n\t"
    "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
    "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 2 and 3: rows 0 and 1 shifted by one byte with a new byte added.
    "prepend          %[temp8],   %[temp6],    8          \n\t"
    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    "srl              %[temp6],   %[temp6],    16         \n\t"
    "prepend          %[temp3],   %[temp6],    8          \n\t"
    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
935

936
// 4x4 predictor built from the eight bytes top[-5..2]. Mixes 2-tap rounded
// averages (addqh_r.ph, (a + b + 1) >> 1) with 3-tap values
// (a + 2 * b + c + 2) >> 2 (shll.ph / addu.ph / shra_r.ph). The filtered
// halfwords are then interleaved and narrowed to bytes, two rows at a time.
// NOTE(review): presumably the VP8 "horizontal-down" mode (per the HD4 name)
// -- confirm against the C reference.
static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    "ulw              %[temp0],   -5(%[top])              \n\t"
    "ulw              %[temp1],   -1(%[top])              \n\t"
    // Zero-extend bytes to halfword lanes (the 'a' forms pick alternate
    // bytes instead of adjacent ones).
    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
    // 2-tap rounded averages (temp4, temp5).
    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
    // 3-tap sums (temp2, temp0, temp6), rounded and shifted by 2 below.
    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Rows 0 and 1: upper halfwords of the filtered pairs, packed to bytes.
    "precrq.ph.w      %[temp1],   %[temp2],    %[temp5]   \n\t"
    "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
    "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
    "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
    "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 2 and 3: lower halfwords of the filtered pairs, packed to bytes.
    "append           %[temp2],   %[temp5],    16         \n\t"
    "append           %[temp0],   %[temp4],    16         \n\t"
    "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
    "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
983

984
// 4x4 predictor built only from the four bytes at top[-5..-2]. Mixes 2-tap
// rounded averages (addqh_r.ph, (a + b + 1) >> 1) with 3-tap values
// (a + 2 * b + c + 2) >> 2 (shll.ph / addu.ph / shra_r.ph). Row 3 is a single
// input byte replicated across the whole row (replv.qb).
// NOTE(review): presumably the VP8 "horizontal-up" mode (per the HU4 name)
// -- confirm against the C reference.
static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile (
    "ulw             %[temp0],   -5(%[top])              \n\t"
    // Zero-extend the four loaded bytes to halfword lanes.
    "preceu.ph.qbl   %[temp1],   %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    "packrl.ph       %[temp3],   %[temp1],    %[temp2]   \n\t"
    // Row 3: broadcast the low byte of temp2 to all four byte lanes.
    "replv.qb        %[temp7],   %[temp2]                \n\t"
    // 2-tap rounded averages (temp4, temp5).
    "addqh_r.ph      %[temp4],   %[temp1],    %[temp3]   \n\t"
    "addqh_r.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    // 3-tap sums (temp6, temp0), rounded and shifted by 2 below.
    "shll.ph         %[temp6],   %[temp3],    1          \n\t"
    "addu.ph         %[temp3],   %[temp2],    %[temp3]   \n\t"
    "addu.ph         %[temp6],   %[temp1],    %[temp6]   \n\t"
    "shll.ph         %[temp0],   %[temp2],    1          \n\t"
    "addu.ph         %[temp6],   %[temp6],    %[temp2]   \n\t"
    "addu.ph         %[temp0],   %[temp3],    %[temp0]   \n\t"
    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
    "shra_r.ph       %[temp0],   %[temp0],    2          \n\t"
    // Interleave the filtered halfwords and narrow them to bytes per row.
    "packrl.ph       %[temp3],   %[temp6],    %[temp5]   \n\t"
    "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
    "append          %[temp0],   %[temp5],    16         \n\t"
    "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
    "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
    "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
    // Row 1 combines halves of the row-2 and row-0 words.
    "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
    "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
    "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
1019

1020
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
1022

1023
// Generates the four 8x8 chroma predictions (DC, vertical, horizontal,
// TrueMotion) for two blocks. Each prediction is written at its own offset
// inside 'dst' (C8DC8, C8VE8, C8HE8, C8TM8).
//   - First pass ("U block") uses 'left' and 'top' as given.
//   - Second pass ("V block") writes 8 bytes further into 'dst' and reads
//     from top + 8 and left + 16.
// 'top' and/or 'left' may be NULL; they are only advanced when non-NULL and
// are passed through unchanged to the helpers otherwise.
static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                       const uint8_t* WEBP_RESTRICT left,
                                       const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);
  HorizontalPred8(C8HE8 + dst, left);
  TrueMotion8(C8TM8 + dst, left, top);
  // V block
  dst += 8;
  if (top) top += 8;
  if (left) left += 16;
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);
  HorizontalPred8(C8HE8 + dst, left);
  TrueMotion8(C8TM8 + dst, left, top);
}
1040

1041
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
1043

1044
// Generates the four 16x16 luma predictions (DC, vertical, horizontal,
// TrueMotion). Each prediction is written at its own offset inside 'dst'
// (I16DC16, I16VE16, I16HE16, I16TM16). 'left' and 'top' are forwarded
// unchanged to the helpers; DCMode16 itself tests both for NULL.
static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top) {
  DCMode16(I16DC16 + dst, left, top);
  VerticalPred16(I16VE16 + dst, top);
  HorizontalPred16(I16HE16 + dst, left);
  TrueMotion16(I16TM16 + dst, left, top);
}
1052

1053
// Left samples are top[-5 .. -2], top_left is top[-1], top are
1054
// located at top[0..3], and top right is top[4..7]
1055
static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
1056
                                  const uint8_t* WEBP_RESTRICT top) {
1057
  DC4(I4DC4 + dst, top);
1058
  TM4(I4TM4 + dst, top);
1059
  VE4(I4VE4 + dst, top);
1060
  HE4(I4HE4 + dst, top);
1061
  RD4(I4RD4 + dst, top);
1062
  VR4(I4VR4 + dst, top);
1063
  LD4(I4LD4 + dst, top);
1064
  VL4(I4VL4 + dst, top);
1065
  HD4(I4HD4 + dst, top);
1066
  HU4(I4HU4 + dst, top);
1067
}
1068

1069
//------------------------------------------------------------------------------
// Metric
1071

1072
#if !defined(WORK_AROUND_GCC)
1073

1074
// Asm fragment: accumulates into $ac0 the sum of squared differences of the
// four bytes at offset A of 'a' and of 'b'.
//   - A is stringified into the load offset; it must be a constant expression
//     the assembler can evaluate.
//   - Uses plain 'lw' (not 'ulw'); presumably a + A and b + A are expected to
//     be 4-byte aligned -- TODO confirm with callers.
//   - Requires the enclosing asm statement to provide the operands temp0,
//     temp1, temp2, temp3, a and b, and to initialise/read $ac0 itself.
#define GET_SSE_INNER(A)                                                  \
  "lw               %[temp0],    " #A "(%[a])                  \n\t"      \
  "lw               %[temp1],    " #A "(%[b])                  \n\t"      \
  "preceu.ph.qbr    %[temp2],    %[temp0]                      \n\t"      \
  "preceu.ph.qbl    %[temp0],    %[temp0]                      \n\t"      \
  "preceu.ph.qbr    %[temp3],    %[temp1]                      \n\t"      \
  "preceu.ph.qbl    %[temp1],    %[temp1]                      \n\t"      \
  "subq.ph          %[temp2],    %[temp2],    %[temp3]         \n\t"      \
  "subq.ph          %[temp0],    %[temp0],    %[temp1]         \n\t"      \
  "dpa.w.ph         $ac0,        %[temp2],    %[temp2]         \n\t"      \
  "dpa.w.ph         $ac0,        %[temp0],    %[temp0]         \n\t"
1085

1086
// Asm fragment: applies GET_SSE_INNER to four byte offsets (A, B, C, D), i.e.
// accumulates the squared differences of 4 x 4 = 16 bytes into $ac0.
#define GET_SSE(A, B, C, D)               \
  GET_SSE_INNER(A)                        \
  GET_SSE_INNER(B)                        \
  GET_SSE_INNER(C)                        \
  GET_SSE_INNER(D)
1091

1092
// Returns the sum of squared differences between the 16x16 sample blocks at
// 'a' and 'b'. Consecutive rows are BPS bytes apart; each GET_SSE() line below
// handles one 16-sample row (four words at column offsets 0, 4, 8 and 12).
static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
    // 0 * 0: clears hi/lo, i.e. the $ac0 accumulator used by GET_SSE.
    "mult   $zero,    $zero                            \n\t"
    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
    GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
    GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
    GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
    GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
    GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
    GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
    GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
    // The result is the low word of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return count;
}
1122

1123
// Returns the sum of squared differences between the 16-wide, 8-row sample
// blocks at 'a' and 'b'. Consecutive rows are BPS bytes apart; each GET_SSE()
// line below handles one 16-sample row.
static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                             const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
    // 0 * 0: clears hi/lo, i.e. the $ac0 accumulator used by GET_SSE.
    "mult   $zero,    $zero                            \n\t"
    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    // The result is the low word of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return count;
}
1145

1146
// Returns the sum of squared differences between the 8x8 sample blocks at 'a'
// and 'b'. Consecutive rows are BPS bytes apart; each GET_SSE() line below
// handles two 8-sample rows (two words per row, at column offsets 0 and 4).
static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
    // 0 * 0: clears hi/lo, i.e. the $ac0 accumulator used by GET_SSE.
    "mult   $zero,    $zero                            \n\t"
    GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
    GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
    GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
    GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
    // The result is the low word of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return count;
}
1164

1165
// Returns the sum of squared differences between the 4x4 sample blocks at 'a'
// and 'b'. Consecutive rows are BPS bytes apart; the single GET_SSE() covers
// the four rows, one word (4 samples) per row.
static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
    // 0 * 0: clears hi/lo, i.e. the $ac0 accumulator used by GET_SSE.
    "mult   $zero,    $zero                            \n\t"
    GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
    // The result is the low word of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [count]"=&r"(count)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return count;
}
1180

1181
#undef GET_SSE
1182
#undef GET_SSE_INNER
1183

1184
#endif  // !WORK_AROUND_GCC
1185

1186
#undef FILL_8_OR_16
1187
#undef FILL_PART
1188
#undef OUTPUT_EARLY_CLOBBER_REGS_17
1189
#undef MUL_HALF
1190
#undef ABS_X8
1191
#undef ADD_SUB_HALVES_X4
1192

1193
//------------------------------------------------------------------------------
1194
// Quantization
1195
//
1196

1197
// Macro quantizing two adjacent coefficients in[j] and in[j + 1] at a time
// (one unrolled step of the QuantizeBlock loop, QUANTDIV macro inlined).
// J  - byte offset of the pair in the 16-bit arrays (in, sharpen, iq, q):
//      j * 2
// K  - byte offset of the pair in the 32-bit arrays (zthresh, bias): j * 4
// N  - byte offset in 'out' where the level of in[j] is stored
// N1 - byte offset in 'out' where the level of in[j + 1] is stored
// Both halves of the pair are compared against their zthresh at once
// (cmp.lt.ph / pick.ph), then one of four paths is taken:
//   fall-through: only in[j] is above its threshold, in[j + 1] is zeroed
//   2:            only in[j + 1] is above its threshold, in[j] is zeroed
//   1:            both are above their threshold (packed halfword arithmetic)
//   0:            neither is, both are zeroed
// In every non-zero path the level is clamped to max_level, given the sign of
// the input, OR-ed into 'ret', stored to 'out', and in[] is overwritten with
// level * q. In path 1 the clamp must pick from max_level1 (max_level
// replicated in both halfwords): picking from max_level, whose upper halfword
// is 0, would turn an over-range upper level into 0 instead of max_level.
#define QUANTIZE_ONE(J, K, N, N1)                                         \
  "ulw         %[temp1],     " #J "(%[ppin])                 \n\t"        \
  "ulw         %[temp2],     " #J "(%[ppsharpen])            \n\t"        \
  "lhu         %[temp3],     " #K "(%[ppzthresh])            \n\t"        \
  "lhu         %[temp6],     " #K "+4(%[ppzthresh])          \n\t"        \
  "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
  "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
  "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
  "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
  "li          %[level],     0x10001                         \n\t"        \
  "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
  "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
  "beqz        %[temp5],     0f                              \n\t"        \
  "lhu         %[temp3],     " #J "(%[ppq])                  \n\t"        \
  "beq         %[temp5],     %[level],         1f            \n\t"        \
  "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
  "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
  "beqz        %[temp5],     2f                              \n\t"        \
  "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
  "sh          $0,           " #J "+2(%[ppin])               \n\t"        \
  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
  "sh          %[temp5],     " #J "(%[ppin])                 \n\t"        \
  "j           3f                                            \n\t"        \
"2:                                                          \n\t"        \
  "lhu         %[temp1],     " #J "+2(%[ppiq])               \n\t"        \
  "srl         %[temp5],     %[coeff],         16            \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "lw          %[temp2],     " #K "+4(%[ppbias])             \n\t"        \
  "lhu         %[temp3],     " #J "+2(%[ppq])                \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "srl         %[temp6],     %[sign],          16            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "sh          $0,           " #J "(%[ppin])                 \n\t"        \
  "sh          $0,           " #N "(%[pout])                 \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[temp5],     " #J "+2(%[ppin])               \n\t"        \
  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
  "j           3f                                            \n\t"        \
"1:                                                          \n\t"        \
  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
  "ulw         %[temp3],     " #J "(%[ppq])                  \n\t"        \
  "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
  "srl         %[temp0],     %[coeff],         16            \n\t"        \
  "lhu         %[temp6],     " #J "+2(%[ppiq])               \n\t"        \
  "lw          %[coeff],     " #K "+4(%[ppbias])             \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
  "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
  "shra.ph     %[level],     %[level],         1             \n\t"        \
  "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
  "pick.ph     %[level],     %[max_level1],    %[level]      \n\t"        \
  "xor         %[level],     %[level],         %[sign]       \n\t"        \
  "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
  "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
  "srl         %[level],     %[level],         16            \n\t"        \
  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
  "usw         %[temp3],     " #J "(%[ppin])                 \n\t"        \
  "j           3f                                            \n\t"        \
"0:                                                          \n\t"        \
  "sh          $0,           " #N "(%[pout])                 \n\t"        \
  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
"3:                                                          \n\t"
1288

1289
static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
1290
                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
1291
  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
1292
  int sign, coeff, level;
1293
  int max_level = MAX_LEVEL;
1294
  int max_level1 = max_level << 16 | max_level;
1295
  int ret = 0;
1296

1297
  int16_t* ppin             = &in[0];
1298
  int16_t* pout             = &out[0];
1299
  const uint16_t* ppsharpen = &mtx->sharpen_[0];
1300
  const uint32_t* ppzthresh = &mtx->zthresh_[0];
1301
  const uint16_t* ppq       = &mtx->q_[0];
1302
  const uint16_t* ppiq      = &mtx->iq_[0];
1303
  const uint32_t* ppbias    = &mtx->bias_[0];
1304

1305
  __asm__ volatile (
1306
    QUANTIZE_ONE( 0,  0,  0,  2)
1307
    QUANTIZE_ONE( 4,  8, 10, 12)
1308
    QUANTIZE_ONE( 8, 16,  4,  8)
1309
    QUANTIZE_ONE(12, 24, 14, 24)
1310
    QUANTIZE_ONE(16, 32,  6, 16)
1311
    QUANTIZE_ONE(20, 40, 22, 26)
1312
    QUANTIZE_ONE(24, 48, 18, 20)
1313
    QUANTIZE_ONE(28, 56, 28, 30)
1314

1315
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
1316
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
1317
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
1318
      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
1319
      [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
1320
    : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
1321
      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
1322
      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
1323
      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
1324
    : "memory", "hi", "lo"
1325
  );
1326

1327
  return (ret != 0);
1328
}
1329

1330
static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
1331
                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
1332
  int nz;
1333
  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
1334
  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
1335
  return nz;
1336
}
1337

1338
#undef QUANTIZE_ONE
1339

1340
// macro for one horizontal pass in FTransformWHT
// temp0..temp7 holds tmp[0]..tmp[15]
// A, B, C, D - offset in bytes to load from in buffer
// TEMP0, TEMP1 - registers for corresponding tmp elements
// With i0..i3 the values loaded from A..D and
//   a0 = i0 + i2, a1 = i1 + i3, a2 = i1 - i3, a3 = i0 - i2,
// on exit TEMP0 holds a0 + a1 in its low halfword and a3 + a2 in its high
// halfword, TEMP1 holds a3 - a2 in its low halfword and a0 - a1 in its high
// halfword. temp8 and temp9 are used as scratch.
#define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1)                          \
  "lh              %[" #TEMP0 "],  " #A "(%[in])            \n\t"              \
  "lh              %[" #TEMP1 "],  " #B "(%[in])            \n\t"              \
  "lh              %[temp8],     " #C "(%[in])              \n\t"              \
  "lh              %[temp9],     " #D "(%[in])              \n\t"              \
  "ins             %[" #TEMP1 "],  %[" #TEMP0 "],  16,  16  \n\t"              \
  "ins             %[temp9],     %[temp8],     16,  16      \n\t"              \
  "subq.ph         %[temp8],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
  "addq.ph         %[temp9],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
  "precrq.ph.w     %[" #TEMP0 "],  %[temp8],     %[temp9]   \n\t"              \
  "append          %[temp8],     %[temp9],     16           \n\t"              \
  "subq.ph         %[" #TEMP1 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
  "addq.ph         %[" #TEMP0 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
  "rotr            %[" #TEMP1 "],  %[" #TEMP1 "],  16       \n\t"
1358

1359
// macro for one vertical pass in FTransformWHT
// temp0..temp7 holds tmp[0]..tmp[15]
// A, B, C, D - offsets in bytes to store to out buffer
// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for corresponding tmp elements
// Each register carries two 16-bit columns, so one expansion handles two
// columns at once. With
//   a0 = TEMP0 + TEMP4, a1 = TEMP2 + TEMP6,
//   a2 = TEMP2 - TEMP6, a3 = TEMP0 - TEMP4,
// the halving instructions (addqh.ph / subqh.ph) store (a0 + a1) >> 1 at A,
// (a3 + a2) >> 1 at B, (a3 - a2) >> 1 at C and (a0 - a1) >> 1 at D.
// temp8 and temp9 are used as scratch; the four TEMPx registers are clobbered.
#define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6)              \
  "addq.ph         %[temp8],     %[" #TEMP0 "],  %[" #TEMP4 "]    \n\t"        \
  "addq.ph         %[temp9],     %[" #TEMP2 "],  %[" #TEMP6 "]    \n\t"        \
  "subq.ph         %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
  "subq.ph         %[" #TEMP6 "],  %[" #TEMP0 "],  %[" #TEMP4 "]  \n\t"        \
  "addqh.ph        %[" #TEMP0 "],  %[temp8],     %[temp9]         \n\t"        \
  "subqh.ph        %[" #TEMP4 "],  %[" #TEMP6 "],  %[" #TEMP2 "]  \n\t"        \
  "addqh.ph        %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
  "subqh.ph        %[" #TEMP6 "],  %[temp8],     %[temp9]         \n\t"        \
  "usw             %[" #TEMP0 "],  " #A "(%[out])                 \n\t"        \
  "usw             %[" #TEMP2 "],  " #B "(%[out])                 \n\t"        \
  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
1376

1377
// Forward Walsh-Hadamard transform of 16 values.
// The inputs are read from 'in' with a stride of 32 bytes (every 16th int16_t:
// in[0], in[16], ..., in[240]); the 16 results are written contiguously to
// 'out' (32 bytes).
static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
                                    int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;

  __asm__ volatile (
    // Horizontal pass: one row of four inputs per expansion, results kept
    // packed two per register in temp0..temp7.
    HORIZONTAL_PASS_WHT(  0,  32,  64,  96, temp0, temp1)
    HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
    HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
    HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
    // Vertical pass: two columns per expansion, stored directly to 'out'.
    VERTICAL_PASS_WHT(0,  8, 16, 24, temp0, temp2, temp4, temp6)
    VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [in]"r"(in), [out]"r"(out)
    : "memory"
  );
}
1397

1398
#undef VERTICAL_PASS_WHT
1399
#undef HORIZONTAL_PASS_WHT
1400

1401
// macro for converting coefficients to bin
// convert 8 coeffs at time
// A, B, C, D - offsets in bytes to load from out buffer
// For each 16-bit coefficient v the bin index is min(abs(v) >> 3, 31):
//  - absq_s.ph / shra.ph 3 compute abs(v) >> 3,
//  - the saturating shll_s.ph 10 followed by shrl.ph 10 clamps the value to
//    0x7fff >> 10 = 31 without a compare,
//  - shll.ph 2 scales the index to a byte offset into the 32-bit counters.
// The counter at dist + offset is then incremented with a load/add/store.
// Clobbers temp0..temp8.
#define CONVERT_COEFFS_TO_BIN(A, B, C, D)                                      \
  "ulw        %[temp0],  " #A "(%[out])                \n\t"                   \
  "ulw        %[temp1],  " #B "(%[out])                \n\t"                   \
  "ulw        %[temp2],  " #C "(%[out])                \n\t"                   \
  "ulw        %[temp3],  " #D "(%[out])                \n\t"                   \
  "absq_s.ph  %[temp0],  %[temp0]                      \n\t"                   \
  "absq_s.ph  %[temp1],  %[temp1]                      \n\t"                   \
  "absq_s.ph  %[temp2],  %[temp2]                      \n\t"                   \
  "absq_s.ph  %[temp3],  %[temp3]                      \n\t"                   \
  "shra.ph    %[temp0],  %[temp0],    3                \n\t"                   \
  "shra.ph    %[temp1],  %[temp1],    3                \n\t"                   \
  "shra.ph    %[temp2],  %[temp2],    3                \n\t"                   \
  "shra.ph    %[temp3],  %[temp3],    3                \n\t"                   \
  "shll_s.ph  %[temp0],  %[temp0],    10               \n\t"                   \
  "shll_s.ph  %[temp1],  %[temp1],    10               \n\t"                   \
  "shll_s.ph  %[temp2],  %[temp2],    10               \n\t"                   \
  "shll_s.ph  %[temp3],  %[temp3],    10               \n\t"                   \
  "shrl.ph    %[temp0],  %[temp0],    10               \n\t"                   \
  "shrl.ph    %[temp1],  %[temp1],    10               \n\t"                   \
  "shrl.ph    %[temp2],  %[temp2],    10               \n\t"                   \
  "shrl.ph    %[temp3],  %[temp3],    10               \n\t"                   \
  "shll.ph    %[temp0],  %[temp0],    2                \n\t"                   \
  "shll.ph    %[temp1],  %[temp1],    2                \n\t"                   \
  "shll.ph    %[temp2],  %[temp2],    2                \n\t"                   \
  "shll.ph    %[temp3],  %[temp3],    2                \n\t"                   \
  "ext        %[temp4],  %[temp0],    0,       16      \n\t"                   \
  "ext        %[temp0],  %[temp0],    16,      16      \n\t"                   \
  "addu       %[temp4],  %[temp4],    %[dist]          \n\t"                   \
  "addu       %[temp0],  %[temp0],    %[dist]          \n\t"                   \
  "ext        %[temp5],  %[temp1],    0,       16      \n\t"                   \
  "lw         %[temp8],  0(%[temp4])                   \n\t"                   \
  "ext        %[temp1],  %[temp1],    16,      16      \n\t"                   \
  "addu       %[temp5],  %[temp5],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp4])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp0])                   \n\t"                   \
  "addu       %[temp1],  %[temp1],    %[dist]          \n\t"                   \
  "ext        %[temp6],  %[temp2],    0,       16      \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp0])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp5])                   \n\t"                   \
  "ext        %[temp2],  %[temp2],    16,      16      \n\t"                   \
  "addu       %[temp6],  %[temp6],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp5])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp1])                   \n\t"                   \
  "addu       %[temp2],  %[temp2],    %[dist]          \n\t"                   \
  "ext        %[temp7],  %[temp3],    0,       16      \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp1])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp6])                   \n\t"                   \
  "ext        %[temp3],  %[temp3],    16,      16      \n\t"                   \
  "addu       %[temp7],  %[temp7],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp6])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp2])                   \n\t"                   \
  "addu       %[temp3],  %[temp3],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp2])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp7])                   \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp7])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp3])                   \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp3])                   \n\t"
1469

1470
static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
1471
                                       int start_block, int end_block,
1472
                                       VP8Histogram* const histo) {
1473
  int j;
1474
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
1475
  const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
1476
  for (j = start_block; j < end_block; ++j) {
1477
    int16_t out[16];
1478
    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
1479

1480
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
1481

1482
    // Convert coefficients to bin.
1483
    __asm__ volatile (
1484
      CONVERT_COEFFS_TO_BIN( 0,  4,  8, 12)
1485
      CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
1486
      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
1487
        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
1488
        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
1489
      : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
1490
      : "memory"
1491
    );
1492
  }
1493
  VP8SetHistogramData(distribution, histo);
1494
}
1495

1496
#undef CONVERT_COEFFS_TO_BIN
1497

1498
//------------------------------------------------------------------------------
1499
// Entry point
1500

1501
extern void VP8EncDspInitMIPSdspR2(void);
1502

1503
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
1504
  VP8FTransform = FTransform_MIPSdspR2;
1505
  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
1506
  VP8ITransform = ITransform_MIPSdspR2;
1507

1508
  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
1509
  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
1510

1511
  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
1512
  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
1513
  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
1514

1515
#if !defined(WORK_AROUND_GCC)
1516
  VP8SSE16x16 = SSE16x16_MIPSdspR2;
1517
  VP8SSE8x8 = SSE8x8_MIPSdspR2;
1518
  VP8SSE16x8 = SSE16x8_MIPSdspR2;
1519
  VP8SSE4x4 = SSE4x4_MIPSdspR2;
1520
#endif
1521

1522
  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
1523
  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
1524

1525
  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
1526
}
1527

1528
#else  // !WEBP_USE_MIPS_DSP_R2
1529

1530
// Fallback used when WEBP_USE_MIPS_DSP_R2 is not defined.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; presumably it
// emits an empty VP8EncDspInitMIPSdspR2() so the symbol still exists --
// confirm against its definition.
WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
1531

1532
#endif  // WEBP_USE_MIPS_DSP_R2
1533

1534
Product

Resources

Company