Path: blob/21.2-virgl/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
4574 views
/*1* Copyright 2013 Ilia Mirkin2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*/2122#include "nv50/nv84_video.h"2324#include "util/u_sse.h"2526struct h264_iparm1 {27uint8_t scaling_lists_4x4[6][16]; // 0028uint8_t scaling_lists_8x8[2][64]; // 6029uint32_t width; // e030uint32_t height; // e431uint64_t ref1_addrs[16]; // e832uint64_t ref2_addrs[16]; // 16833uint32_t unk1e8;34uint32_t unk1ec;35uint32_t w1; // 1f036uint32_t w2; // 1f437uint32_t w3; // 1f838uint32_t h1; // 1fc39uint32_t h2; // 20040uint32_t h3; // 20441uint32_t mb_adaptive_frame_field_flag; // 20842uint32_t field_pic_flag; // 20c43uint32_t format; // 21044uint32_t unk214; // 21445};4647struct h264_iparm2 {48uint32_t width; // 0049uint32_t height; // 0450uint32_t mbs; // 0851uint32_t w1; // 0c52uint32_t w2; // 1053uint32_t w3; // 1454uint32_t h1; // 1855uint32_t h2; // 1c56uint32_t h3; // 2057uint32_t unk24;58uint32_t mb_adaptive_frame_field_flag; // 2859uint32_t top; // 2c60uint32_t bottom; // 3061uint32_t is_reference; // 3462};6364void65nv84_decoder_vp_h264(struct nv84_decoder *dec,66struct pipe_h264_picture_desc *desc,67struct nv84_video_buffer *dest)68{69struct h264_iparm1 param1;70struct h264_iparm2 param2;71int i, width = align(dest->base.width, 16),72height = align(dest->base.height, 16);7374struct nouveau_pushbuf *push = dec->vp_pushbuf;75struct nouveau_pushbuf_refn bo_refs[] = {76{ dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },77{ dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },78{ dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },79{ dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },80{ dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },81{ dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },82};83int num_refs = ARRAY_SIZE(bo_refs);84bool is_ref = desc->is_reference;8586STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);87STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);8889memset(¶m1, 0, sizeof(param1));90memset(¶m2, 0, sizeof(param2));9192memcpy(¶m1.scaling_lists_4x4, desc->pps->ScalingList4x4,93sizeof(param1.scaling_lists_4x4));94memcpy(¶m1.scaling_lists_8x8, desc->pps->ScalingList8x8,95sizeof(param1.scaling_lists_8x8));9697param1.width = width;98param1.w1 = param1.w2 = param1.w3 = align(width, 64);99param1.height = param1.h2 = height;100param1.h1 = param1.h3 = align(height, 32);101param1.format = 0x3231564e; /* 'NV12' */102param1.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag;103param1.field_pic_flag = desc->field_pic_flag;104105param2.width = width;106param2.w1 = param2.w2 = param2.w3 = param1.w1;107if (desc->field_pic_flag)108param2.height = align(height, 32) / 2;109else110param2.height = height;111param2.h1 = param2.h2 = align(height, 32);112param2.h3 = height;113param2.mbs = width * height >> 8;114if (desc->field_pic_flag) {115param2.top = desc->bottom_field_flag ? 2 : 1;116param2.bottom = desc->bottom_field_flag;117}118param2.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag;119param2.is_reference = desc->is_reference;120121PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);122123struct nouveau_bo *ref2_default = dest->full;124125for (i = 0; i < 16; i++) {126struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];127struct nouveau_bo *bo1, *bo2;128if (buf) {129bo1 = buf->interlaced;130bo2 = buf->full;131if (i == 0)132ref2_default = buf->full;133} else {134bo1 = dest->interlaced;135bo2 = ref2_default;136}137param1.ref1_addrs[i] = bo1->offset;138param1.ref2_addrs[i] = bo2->offset;139struct nouveau_pushbuf_refn bo_refs[] = {140{ bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },141{ bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },142};143nouveau_pushbuf_refn(push, bo_refs, ARRAY_SIZE(bo_refs));144}145146memcpy(dec->vp_params->map, ¶m1, sizeof(param1));147memcpy(dec->vp_params->map + 0x400, ¶m2, sizeof(param2));148149nouveau_pushbuf_refn(push, bo_refs, num_refs);150151/* Wait for BSP to have completed */152BEGIN_NV04(push, SUBC_VP(0x10), 4);153PUSH_DATAh(push, dec->fence->offset);154PUSH_DATA (push, dec->fence->offset);155PUSH_DATA (push, 2);156PUSH_DATA (push, 1); /* wait for sem == 2 */157158/* VP step 1 */159BEGIN_NV04(push, SUBC_VP(0x400), 15);160PUSH_DATA (push, 1);161PUSH_DATA (push, param2.mbs);162PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */163PUSH_DATA (push, 0x55001); /* constant */164PUSH_DATA (push, dec->vp_params->offset >> 8);165PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);166PUSH_DATA (push, dec->vpring_ctrl);167PUSH_DATA (push, dec->vpring->offset >> 8);168PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);169PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);170PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +171dec->vpring_residual + dec->vpring_deblock) >> 8);172PUSH_DATA (push, 0);173PUSH_DATA (push, 0x100008);174PUSH_DATA (push, dest->interlaced->offset >> 8);175PUSH_DATA (push, 0);176177BEGIN_NV04(push, SUBC_VP(0x620), 2);178PUSH_DATA (push, 0);179PUSH_DATA (push, 0);180181BEGIN_NV04(push, SUBC_VP(0x300), 1);182PUSH_DATA (push, 0);183184/* VP step 2 */185BEGIN_NV04(push, SUBC_VP(0x400), 5);186PUSH_DATA (push, 0x54530201);187PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);188PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +189dec->vpring_residual) >> 8);190PUSH_DATA (push, dest->interlaced->offset >> 8);191PUSH_DATA (push, dest->interlaced->offset >> 8);192193if (is_ref) {194BEGIN_NV04(push, SUBC_VP(0x414), 1);195PUSH_DATA (push, dest->full->offset >> 8);196}197198BEGIN_NV04(push, SUBC_VP(0x620), 2);199PUSH_DATAh(push, dec->vp_fw2_offset);200PUSH_DATA (push, dec->vp_fw2_offset);201202BEGIN_NV04(push, SUBC_VP(0x300), 1);203PUSH_DATA (push, 0);204205/* Set the semaphore back to 1 */206BEGIN_NV04(push, SUBC_VP(0x610), 3);207PUSH_DATAh(push, dec->fence->offset);208PUSH_DATA (push, dec->fence->offset);209PUSH_DATA (push, 1);210211/* Write to the semaphore location, intr */212BEGIN_NV04(push, SUBC_VP(0x304), 1);213PUSH_DATA (push, 0x101);214215for (i = 0; i < 2; i++) {216struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);217mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;218}219220PUSH_KICK (push);221}222223static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {224int16_t ret = val * quant / 16;225if (mpeg1 && ret) {226if (ret > 0)227ret = (ret - 1) | 1;228else229ret = (ret + 1) | 1;230}231if (ret < -2048)232ret = -2048;233else if (ret > 2047)234ret = 2047;235return ret;236}237238struct mpeg12_mb_info {239uint32_t index;240uint8_t unk4;241uint8_t unk5;242uint16_t coded_block_pattern;243uint8_t block_counts[6];244uint16_t PMV[8];245uint16_t skipped;246};247248void249nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,250struct pipe_mpeg12_picture_desc *desc,251const struct pipe_mpeg12_macroblock *macrob)252{253STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);254255struct mpeg12_mb_info info = {0};256int i, sum = 0, mask, block_index, count;257const int16_t *blocks;258int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;259int motion = macrob->macroblock_type &260(PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);261const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :262dec->mpeg12_non_intra_matrix;263int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;264265info.index = macrob->y * mb(dec->base.width) + macrob->x;266info.unk4 = motion;267if (intra)268info.unk4 |= 1;269if (macrob->macroblock_modes.bits.dct_type)270info.unk4 |= 0x20;271info.unk5 = (macrob->motion_vertical_field_select << 4) |272(macrob->macroblock_modes.value & 0xf);273info.coded_block_pattern = macrob->coded_block_pattern;274if (motion) {275memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));276}277blocks = macrob->blocks;278for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {279if ((macrob->coded_block_pattern & mask) == 0)280continue;281282count = 0;283284/*285* The observation here is that there are a lot of 0's, and things go286* a lot faster if one skips over them.287*/288289#if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)290/* Note that the SSE implementation is much more tuned to X86_64. As it's not291* benchmarked on X86_32, disable it there. I suspect that the code needs to292* be reorganized in terms of 32-bit wide data in order to be more293* efficient. NV84+ were released well into the 64-bit CPU era, so it should294* be a minority case.295*/296297/* This returns a 16-bit bit-mask, each 2 bits are both 1 or both 0, depending298* on whether the corresponding (16-bit) word in blocks is zero or non-zero. */299#define wordmask(blocks, zero) \300(uint64_t)(_mm_movemask_epi8( \301_mm_cmpeq_epi16( \302zero, _mm_load_si128((__m128i *)(blocks)))))303304__m128i zero = _mm_setzero_si128();305306/* TODO: Look into doing the inverse quantization in terms of SSE307* operations unconditionally, when necessary. */308uint64_t bmask0 = wordmask(blocks, zero);309bmask0 |= wordmask(blocks + 8, zero) << 16;310bmask0 |= wordmask(blocks + 16, zero) << 32;311bmask0 |= wordmask(blocks + 24, zero) << 48;312uint64_t bmask1 = wordmask(blocks + 32, zero);313bmask1 |= wordmask(blocks + 40, zero) << 16;314bmask1 |= wordmask(blocks + 48, zero) << 32;315bmask1 |= wordmask(blocks + 56, zero) << 48;316317/* The wordmask macro returns the inverse of what we want, since it318* returns a 1 for equal-to-zero. Invert. */319bmask0 = ~bmask0;320bmask1 = ~bmask1;321322/* Note that the bitmask is actually sequences of 2 bits for each block323* index. This is because there is no movemask_epi16. That means that324* (a) ffs will never return 64, since the prev bit will always be set325* in that case, and (b) we need to do an extra bit shift. Or'ing the326* bitmasks together is faster than having a loop that computes them one327* at a time and processes them, on a Core i7-920. Trying to put bmask328* into an array and then looping also slows things down.329*/330331/* shift needs to be the same width as i, and unsigned so that / 2332* becomes a rshift operation */333uint32_t shift;334i = 0;335336if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {337int16_t tmp;338while ((shift = __builtin_ffsll(bmask0))) {339i += (shift - 1) / 2;340bmask0 >>= shift - 1;341*dec->mpeg12_data++ = dec->zscan[i] * 2;342tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);343*dec->mpeg12_data++ = tmp;344sum += tmp;345count++;346i++;347bmask0 >>= 2;348}349i = 32;350while ((shift = __builtin_ffsll(bmask1))) {351i += (shift - 1) / 2;352bmask1 >>= shift - 1;353*dec->mpeg12_data++ = dec->zscan[i] * 2;354tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);355*dec->mpeg12_data++ = tmp;356sum += tmp;357count++;358i++;359bmask1 >>= 2;360}361} else {362while ((shift = __builtin_ffsll(bmask0))) {363i += (shift - 1) / 2;364bmask0 >>= shift - 1;365*dec->mpeg12_data++ = i * 2;366*dec->mpeg12_data++ = blocks[i];367count++;368i++;369bmask0 >>= 2;370}371i = 32;372while ((shift = __builtin_ffsll(bmask1))) {373i += (shift - 1) / 2;374bmask1 >>= shift - 1;375*dec->mpeg12_data++ = i * 2;376*dec->mpeg12_data++ = blocks[i];377count++;378i++;379bmask1 >>= 2;380}381}382#undef wordmask383#else384385/*386* This loop looks ridiculously written... and it is. I tried a lot of387* different ways of achieving this scan, and this was the fastest, at388* least on a Core i7-920. Note that it's not necessary to skip the 0's,389* the firmware will deal with those just fine. But it's faster to skip390* them. Note to people trying benchmarks: make sure to use realistic391* mpeg data, which can often be a single data point first followed by392* 63 0's, or <data> 7x <0> <data> 7x <0> etc.393*/394i = 0;395if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {396while (true) {397int16_t tmp;398while (likely(i < 64 && !(tmp = blocks[i]))) i++;399if (i >= 64) break;400*dec->mpeg12_data++ = dec->zscan[i] * 2;401tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);402*dec->mpeg12_data++ = tmp;403sum += tmp;404count++;405i++;406}407} else {408while (true) {409int16_t tmp;410while (likely(i < 64 && !(tmp = blocks[i]))) i++;411if (i >= 64) break;412*dec->mpeg12_data++ = i * 2;413*dec->mpeg12_data++ = tmp;414count++;415i++;416}417}418419#endif420421if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {422if (!mpeg1 && (sum & 1) == 0) {423if (count && *(dec->mpeg12_data - 2) == 63 * 2) {424uint16_t *val = dec->mpeg12_data - 1;425if (*val & 1) *val -= 1;426else *val += 1;427} else {428*dec->mpeg12_data++ = 63 * 2;429*dec->mpeg12_data++ = 1;430count++;431}432}433}434435if (count) {436*(dec->mpeg12_data - 2) |= 1;437} else {438*dec->mpeg12_data++ = 1;439*dec->mpeg12_data++ = 0;440count = 1;441}442info.block_counts[block_index] = count;443blocks += 64;444}445446memcpy(dec->mpeg12_mb_info, &info, sizeof(info));447dec->mpeg12_mb_info += sizeof(info);448449if (macrob->num_skipped_macroblocks) {450info.index++;451info.coded_block_pattern = 0;452info.skipped = macrob->num_skipped_macroblocks - 1;453memset(info.block_counts, 0, sizeof(info.block_counts));454memcpy(dec->mpeg12_mb_info, &info, sizeof(info));455dec->mpeg12_mb_info += sizeof(info);456}457}458459struct mpeg12_header {460uint32_t luma_top_size; // 00461uint32_t luma_bottom_size; // 04462uint32_t chroma_top_size; // 08463uint32_t mbs; // 0c464uint32_t mb_info_size; // 10465uint32_t mb_width_minus1; // 14466uint32_t mb_height_minus1; // 18467uint32_t width; // 1c468uint32_t height; // 20469uint8_t progressive; // 24470uint8_t mocomp_only; // 25471uint8_t frames; // 26472uint8_t picture_structure; // 27473uint32_t unk28; // 28 -- 0x50100474uint32_t unk2c; // 2c475uint32_t pad[4 * 13];476};477478void479nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,480struct pipe_mpeg12_picture_desc *desc,481struct nv84_video_buffer *dest)482{483struct nouveau_pushbuf *push = dec->vp_pushbuf;484struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];485struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];486struct nouveau_pushbuf_refn bo_refs[] = {487{ dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },488{ NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },489{ NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },490{ dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },491};492int i, num_refs = ARRAY_SIZE(bo_refs);493struct mpeg12_header header = {0};494struct nv50_miptree *y = nv50_miptree(dest->resources[0]);495struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);496497STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);498499if (!ref1)500ref1 = dest;501if (!ref2)502ref2 = dest;503bo_refs[1].bo = ref1->interlaced;504bo_refs[2].bo = ref2->interlaced;505506header.luma_top_size = y->layer_stride;507header.luma_bottom_size = y->layer_stride;508header.chroma_top_size = uv->layer_stride;509header.mbs = mb(dec->base.width) * mb(dec->base.height);510header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;511header.mb_width_minus1 = mb(dec->base.width) - 1;512header.mb_height_minus1 = mb(dec->base.height) - 1;513header.width = align(dec->base.width, 16);514header.height = align(dec->base.height, 16);515header.progressive = desc->frame_pred_frame_dct;516header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);517header.picture_structure = desc->picture_structure;518header.unk28 = 0x50100;519520memcpy(dec->mpeg12_bo->map, &header, sizeof(header));521522PUSH_SPACE(push, 10 + 3 + 2);523524nouveau_pushbuf_refn(push, bo_refs, num_refs);525526BEGIN_NV04(push, SUBC_VP(0x400), 9);527PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */528PUSH_DATA (push, 0x555001); /* constant */529PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);530PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);531PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +532align(0x20 * mb(dec->base.width) *533mb(dec->base.height), 0x100)) >> 8);534PUSH_DATA (push, dest->interlaced->offset >> 8);535PUSH_DATA (push, ref1->interlaced->offset >> 8);536PUSH_DATA (push, ref2->interlaced->offset >> 8);537PUSH_DATA (push, 6 * 64 * 8 * header.mbs);538539BEGIN_NV04(push, SUBC_VP(0x620), 2);540PUSH_DATA (push, 0);541PUSH_DATA (push, 0);542543BEGIN_NV04(push, SUBC_VP(0x300), 1);544PUSH_DATA (push, 0);545546for (i = 0; i < 2; i++) {547struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);548mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;549}550PUSH_KICK (push);551}552553554