//#####################################################################
// Copyright (c) 2010-2011, Eftychios Sifakis.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
// other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
// BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
// SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//#####################################################################
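// Reader's note (an interpretation of the fragment below, not part of the original
// header): this kernel performs one approximate Jacobi (Givens) conjugation step of
// the branch-free 3x3 SVD. A single plane rotation is built from the symmetric
// matrix S, applied to S, and accumulated into the quaternion (Sqvs; Sqvvx, Sqvvy,
// Sqvvz). Every statement appears three times behind ENABLE_SCALAR_IMPLEMENTATION /
// ENABLE_SSE_IMPLEMENTATION / ENABLE_AVX_IMPLEMENTATION so that the same kernel
// expands to a scalar, 4-wide SSE, or 8-wide AVX code path, depending on which of
// those macros the including header enables.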
//###########################################################
// Compute the Givens angle (and half-angle)
//###########################################################
ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=SS21.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(VS21,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(VS21,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=SS11.f-SS22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_sub_ps(VS11,VS22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_sub_ps(VS11,VS22);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=(Stmp2.f>=Stiny_number.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_cmpge_ps(Vtmp2,Vtiny_number);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmp_ps(Vtmp2,Vtiny_number, _CMP_GE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmpge_ps(Vtmp2,Vtiny_number);)
ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Stmp1.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_and_ps(Vtmp1,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_and_ps(Vtmp1,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Stmp1.ui&Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_and_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vone,Vtmp5,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=~Stmp1.ui&Sone.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_andnot_ps(Vtmp1,Vone);)
ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vch,Vch);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=rsqrt(Stmp3.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_rsqrt_ps(Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_rsqrt_ps(Vtmp3);)
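// When USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION is defined, the hardware rsqrt
// estimate r = tmp4 of 1/sqrt(tmp3) is refined with one Newton-Raphson step,
//   r <- (3*r - tmp3*r^3) / 2,
// spelled out below as s = r/2, c = tmp3*r^3/2, r <- r + s - c.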
#ifdef USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION
ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vtmp4,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp4.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp4,Vs);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp4,Vs);)
ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp4.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp4,Vc);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp4,Vc);)
ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp3.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp3,Vc);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp3,Vc);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vs);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vc);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vc);)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Stmp4.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vtmp4,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vtmp4,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Sch.f=Stmp4.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_mul_ps(Vtmp4,Vch);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_mul_ps(Vtmp4,Vch);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sfour_gamma_squared.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vfour_gamma_squared,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vfour_gamma_squared,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=(Stmp2.f<=Stmp1.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_cmple_ps(Vtmp2,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmp_ps(Vtmp2,Vtmp1, _CMP_LE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmple_ps(Vtmp2,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Ssine_pi_over_eight.ui&Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_and_ps(Vsine_pi_over_eight,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_blendv_ps(Vsh,Vsine_pi_over_eight,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=~Stmp1.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_andnot_ps(Vtmp1,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Ssh.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_or_ps(Vsh,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Scosine_pi_over_eight.ui&Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_and_ps(Vcosine_pi_over_eight,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vch,Vcosine_pi_over_eight,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=~Stmp1.ui&Sch.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_andnot_ps(Vtmp1,Vch);)
ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vch,Vch);)
ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp2.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_sub_ps(Vtmp2,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_sub_ps(Vtmp2,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Sch.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vch,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vch,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Ss.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_add_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_add_ps(Vs,Vs);)
//###########################################################
// Perform the actual Givens conjugation
//###########################################################
#ifndef USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(SS33.f=SS33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS33=_mm_mul_ps(VS33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS33=_mm256_mul_ps(VS33,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(SS31.f=SS31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_mul_ps(VS31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_mul_ps(VS31,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(SS32.f=SS32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_mul_ps(VS32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_mul_ps(VS32,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(SS33.f=SS33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS33=_mm_mul_ps(VS33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS33=_mm256_mul_ps(VS33,Vtmp3);)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SS31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VS31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VS31);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SS32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VS32);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VS32);)
ENABLE_SCALAR_IMPLEMENTATION(SS31.f=Sc.f*SS31.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_mul_ps(Vc,VS31);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_mul_ps(Vc,VS31);)
ENABLE_SCALAR_IMPLEMENTATION(SS32.f=Sc.f*SS32.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_mul_ps(Vc,VS32);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_mul_ps(Vc,VS32);)
ENABLE_SCALAR_IMPLEMENTATION(SS31.f=Stmp2.f+SS31.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_add_ps(Vtmp2,VS31);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_add_ps(Vtmp2,VS31);)
ENABLE_SCALAR_IMPLEMENTATION(SS32.f=SS32.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_sub_ps(VS32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_sub_ps(VS32,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,Vs);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=SS22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(VS22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(VS22,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=SS11.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(VS11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(VS11,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sc.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vc,Vc);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vc,Vc);)
ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_mul_ps(VS11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_mul_ps(VS11,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_mul_ps(VS22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_mul_ps(VS22,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_add_ps(VS11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_add_ps(VS11,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_add_ps(VS22,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_add_ps(VS22,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=SS21.f+SS21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(VS21,VS21);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(VS21,VS21);)
ENABLE_SCALAR_IMPLEMENTATION(SS21.f=SS21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS21=_mm_mul_ps(VS21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS21=_mm256_mul_ps(VS21,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sc.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vc,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vc,Vs);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vtmp2,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_add_ps(VS11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_add_ps(VS11,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(SS21.f=SS21.f-Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(VS21=_mm_sub_ps(VS21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(VS21=_mm256_sub_ps(VS21,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_sub_ps(VS22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_sub_ps(VS22,Vtmp2);)
//###########################################################
// Compute the cumulative rotation, in quaternion form
//###########################################################
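// The cumulative rotation is kept as a quaternion (Sqvs; Sqvvx, Sqvvy, Sqvvz). The
// Givens rotation just computed is itself a quaternion with scalar part ch and
// vector part sh along the axis of the current rotation plane, and the statements
// below compose the two by quaternion multiplication. The upper-case names
// (SQVVX, SQVVY, SQVVZ, STMP1, STMP2, STMP3 and their V counterparts) are
// placeholders that the including kernel is expected to bind via #define to the
// components matching the plane being processed, so this one fragment serves all
// three Jacobi rotation planes.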
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sch.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vch,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vch,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sch.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vch,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vch,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sch.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vch,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vch,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sch.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vch,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vch,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(SQVVZ.f=SQVVZ.f+Ssh.f;) ENABLE_SSE_IMPLEMENTATION(VQVVZ=_mm_add_ps(VQVVZ,Vsh);) ENABLE_AVX_IMPLEMENTATION(VQVVZ=_mm256_add_ps(VQVVZ,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f-STMP3.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_sub_ps(Vqvs,VTMP3);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_sub_ps(Vqvs,VTMP3);)
ENABLE_SCALAR_IMPLEMENTATION(SQVVX.f=SQVVX.f+STMP2.f;) ENABLE_SSE_IMPLEMENTATION(VQVVX=_mm_add_ps(VQVVX,VTMP2);) ENABLE_AVX_IMPLEMENTATION(VQVVX=_mm256_add_ps(VQVVX,VTMP2);)
ENABLE_SCALAR_IMPLEMENTATION(SQVVY.f=SQVVY.f-STMP1.f;) ENABLE_SSE_IMPLEMENTATION(VQVVY=_mm_sub_ps(VQVVY,VTMP1);) ENABLE_AVX_IMPLEMENTATION(VQVVY=_mm256_sub_ps(VQVVY,VTMP1);)