Path: blob/21.2-virgl/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
4565 views
/**************************************************************************1*2* Copyright 2013 VMware, Inc.3* All Rights Reserved.4*5* Permission is hereby granted, free of charge, to any person obtaining a6* copy of this software and associated documentation files (the7* "Software"), to deal in the Software without restriction, including8* without limitation the rights to use, copy, modify, merge, publish,9* distribute, sub license, and/or sell copies of the Software, and to10* permit persons to whom the Software is furnished to do so, subject to11* the following conditions:12*13* The above copyright notice and this permission notice (including the14* next paragraph) shall be included in all copies or substantial portions15* of the Software.16*17* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS18* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF19* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.20* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR21* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,22* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE23* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.24*25**************************************************************************/262728/**29* @file30* Format conversion code for srgb formats.31*32* Functions for converting from srgb to linear and vice versa.33* From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:34*35* srgb->linear:36* cl = cs / 12.92, cs <= 0.0404537* cl = ((cs + 0.055)/1.055)^2.4, cs > 0.0404538*39* linear->srgb:40* if (isnan(cl)) {41* Map IEEE-754 Not-a-number to zero.42* cs = 0.0;43* } else if (cl > 1.0) {44* cs = 1.0;45* } else if (cl < 0.0) {46* cs = 0.0;47* } else if (cl < 0.0031308) {48* cs = 12.92 * cl;49* } else {50* cs = 1.055 * pow(cl, 0.41666) - 0.055;51* }52*53* This does not need to be accurate, however at least for d3d1054* (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):55* 1) For srgb->linear, it is required that the error on the srgb side is56* not larger than 0.5f, which I interpret that if you map the value back57* to srgb from linear using the ideal conversion, it would not be off by58* more than 0.5f (that is, it would map to the same 8-bit integer value59* as it was before conversion to linear).60* 2) linear->srgb is permitted 0.6f which luckily looks like quite a large61* error is allowed.62* 3) Additionally, all srgb values converted to linear and back must result63* in the same value as they were originally.64*65* @author Roland Scheidegger <[email protected]>66*/676869#include "util/u_debug.h"70#include "util/u_math.h"7172#include "lp_bld_type.h"73#include "lp_bld_const.h"74#include "lp_bld_arit.h"75#include "lp_bld_bitarit.h"76#include "lp_bld_logic.h"77#include "lp_bld_format.h"78798081/**82* Convert srgb int values to linear float values.83* Several possibilities how to do this, e.g.84* - table85* - doing the pow() with int-to-float and float-to-int tricks86* (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)87* - just using standard polynomial approximation88* (3rd order polynomial is required for crappy but just sufficient accuracy)89*90* @param src integer (vector) value(s) to convert91* (chan_bits bit values unpacked to 32 bit already).92*/93LLVMValueRef94lp_build_srgb_to_linear(struct gallivm_state *gallivm,95struct lp_type src_type,96unsigned chan_bits,97LLVMValueRef src)98{99struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);100struct lp_build_context f32_bld;101LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;102double coeffs[4] = {0.0023f,1030.0030f / 255.0f,1040.6935f / (255.0f * 255.0f),1050.3012f / (255.0f * 255.0f * 255.0f)106};107108assert(src_type.width == 32);109/* Technically this would work with more bits too but would be inaccurate. */110assert(chan_bits <= 8);111112lp_build_context_init(&f32_bld, gallivm, f32_type);113114/*115* using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)116* ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)117* (found with octave polyfit and some magic as I couldn't get the error118* function right). Using the above mentioned error function, the values stay119* within +-0.35, except for the lowest values - hence tweaking linear segment120* to cover the first 16 instead of the first 11 values (the error stays121* just about acceptable there too).122* Hence: lin = src > 15 ? poly : src / 12.6123* This function really only makes sense for vectors, should use LUT otherwise.124* All in all (including float conversion) 11 instructions (with sse4.1),125* 6 constants (polynomial could be done with 1 instruction less at the cost126* of slightly worse dependency chain, fma should also help).127*/128/* doing the 1/255 mul as part of the approximation */129srcf = lp_build_int_to_float(&f32_bld, src);130if (chan_bits != 8) {131/* could adjust all the constants instead */132LLVMValueRef rescale_const = lp_build_const_vec(gallivm, f32_type,133255.0f / ((1 << chan_bits) - 1));134srcf = lp_build_mul(&f32_bld, srcf, rescale_const);135}136lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));137part_lin = lp_build_mul(&f32_bld, srcf, lin_const);138139part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);140141lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);142is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);143return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);144}145146147/**148* Convert linear float values to srgb int values.149* Several possibilities how to do this, e.g.150* - use table (based on exponent/highest order mantissa bits) and do151* linear interpolation (https://gist.github.com/rygorous/2203834)152* - Chebyshev polynomial153* - Approximation using reciprocals154* - using int-to-float and float-to-int tricks for pow()155* (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)156*157* @param src float (vector) value(s) to convert.158*/159static LLVMValueRef160lp_build_linear_to_srgb(struct gallivm_state *gallivm,161struct lp_type src_type,162unsigned chan_bits,163LLVMValueRef src)164{165LLVMBuilderRef builder = gallivm->builder;166struct lp_build_context f32_bld;167LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;168169lp_build_context_init(&f32_bld, gallivm, src_type);170171src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);172173if (0) {174/*175* using int-to-float and float-to-int trick for pow().176* This is much more accurate than necessary thanks to the correction,177* but it most certainly makes no sense without rsqrt available.178* Bonus points if you understand how this works...179* All in all (including min/max clamp, conversion) 19 instructions.180*/181182float exp_f = 2.0f / 3.0f;183/* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */184float exp2f_c = 1.30438178253e+19f;185float coeff_f = 0.62996f;186LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;187struct lp_type int_type = lp_int_type(src_type);188189/*190* First calculate approx x^8/12191*/192exponent = lp_build_const_vec(gallivm, src_type, exp_f);193coeff = lp_build_const_vec(gallivm, src_type,194exp2f_c * powf(coeff_f, 1.0f / exp_f));195196/* premultiply src */197tmp = lp_build_mul(&f32_bld, coeff, src);198/* "log2" */199tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");200tmp = lp_build_int_to_float(&f32_bld, tmp);201/* multiply for pow */202tmp = lp_build_mul(&f32_bld, tmp, exponent);203/* "exp2" */204pow_approx = lp_build_itrunc(&f32_bld, tmp);205pow_approx = LLVMBuildBitCast(builder, pow_approx,206lp_build_vec_type(gallivm, src_type), "");207208/*209* Since that pow was inaccurate (like 3 bits, though each sqrt step would210* give another bit), compensate the error (which is why we chose another211* exponent in the first place).212*/213/* x * x^(8/12) = x^(20/12) */214pow_1 = lp_build_mul(&f32_bld, pow_approx, src);215216/* x * x * x^(-4/12) = x^(20/12) */217/* Should avoid using rsqrt if it's not available, but218* using x * x^(4/12) * x^(4/12) instead will change error weight */219tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);220x2 = lp_build_mul(&f32_bld, src, src);221pow_2 = lp_build_mul(&f32_bld, x2, tmp);222223/* average the values so the errors cancel out, compensate bias,224* we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul225* for conversion to int in here */226tmp = lp_build_add(&f32_bld, pow_1, pow_2);227coeff = lp_build_const_vec(gallivm, src_type,2281.0f / (3.0f * coeff_f) * 0.999852f *229powf(1.055f * 255.0f, 4.0f));230pow_final = lp_build_mul(&f32_bld, tmp, coeff);231232/* x^(5/12) = rsqrt(rsqrt(x^20/12)) */233if (lp_build_fast_rsqrt_available(src_type)) {234pow_final = lp_build_fast_rsqrt(&f32_bld,235lp_build_fast_rsqrt(&f32_bld, pow_final));236}237else {238pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));239}240pow_final = lp_build_add(&f32_bld, pow_final,241lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));242}243244else {245/*246* using "rational polynomial" approximation here.247* Essentially y = a*x^0.375 + b*x^0.5 + c, with also248* factoring in the 255.0 mul and the scaling mul.249* (a is closer to actual value so has higher weight than b.)250* Note: the constants are magic values. They were found empirically,251* possibly could be improved but good enough (be VERY careful with252* error metric if you'd want to tweak them, they also MUST fit with253* the crappy polynomial above for srgb->linear since it is required254* that each srgb value maps back to the same value).255* This function has an error of max +-0.17. Not sure this is actually256* enough, we require +-0.6 but that may include the +-0.5 from integer257* conversion. Seems to pass all relevant tests though...258* For the approximated srgb->linear values the error is naturally larger259* (+-0.42) but still accurate enough (required +-0.5 essentially).260* All in all (including min/max clamp, conversion) 15 instructions.261* FMA would help (minus 2 instructions).262*/263264LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;265266if (lp_build_fast_rsqrt_available(src_type)) {267tmp = lp_build_fast_rsqrt(&f32_bld, src);268x05 = lp_build_mul(&f32_bld, src, tmp);269}270else {271/*272* I don't really expect this to be practical without rsqrt273* but there's no reason for triple punishment so at least274* save the otherwise resulting division and unnecessary mul...275*/276x05 = lp_build_sqrt(&f32_bld, src);277}278279tmp = lp_build_mul(&f32_bld, x05, src);280if (lp_build_fast_rsqrt_available(src_type)) {281x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));282}283else {284x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));285}286287a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);288b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);289c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);290291tmp = lp_build_mul(&f32_bld, a_const, x0375);292tmp2 = lp_build_mad(&f32_bld, b_const, x05, c_const);293pow_final = lp_build_add(&f32_bld, tmp, tmp2);294}295296/* linear part is easy */297lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);298lin = lp_build_mul(&f32_bld, src, lin_const);299300lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);301is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);302tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);303304if (chan_bits != 8) {305/* could adjust all the constants instead */306LLVMValueRef rescale_const = lp_build_const_vec(gallivm, src_type,307((1 << chan_bits) - 1) / 255.0f);308tmp = lp_build_mul(&f32_bld, tmp, rescale_const);309}310311f32_bld.type.sign = 0;312return lp_build_iround(&f32_bld, tmp);313}314315316/**317* Convert linear float soa values to packed srgb AoS values.318* This only handles packed formats which are 4x8bit in size319* (rgba and rgbx plus swizzles), and 16bit 565-style formats320* with no alpha. (In the latter case the return values won't be321* fully packed, it will look like r5g6b5x16r5g6b5x16...)322*323* @param src float SoA (vector) values to convert.324*/325LLVMValueRef326lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,327const struct util_format_description *dst_fmt,328struct lp_type src_type,329LLVMValueRef *src)330{331LLVMBuilderRef builder = gallivm->builder;332unsigned chan;333struct lp_build_context f32_bld;334struct lp_type int32_type = lp_int_type(src_type);335LLVMValueRef tmpsrgb[4], alpha, dst;336337lp_build_context_init(&f32_bld, gallivm, src_type);338339/* rgb is subject to linear->srgb conversion, alpha is not */340for (chan = 0; chan < 3; chan++) {341unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size;342tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, chan_bits, src[chan]);343}344/*345* can't use lp_build_conv since we want to keep values as 32bit346* here so we can interleave with rgb to go from SoA->AoS.347*/348alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]);349alpha = lp_build_mul(&f32_bld, alpha,350lp_build_const_vec(gallivm, src_type, 255.0f));351tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);352353dst = lp_build_zero(gallivm, int32_type);354for (chan = 0; chan < dst_fmt->nr_channels; chan++) {355if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) {356unsigned ls;357LLVMValueRef shifted, shift_val;358ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;359shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);360shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");361dst = LLVMBuildOr(builder, dst, shifted, "");362}363}364return dst;365}366367368