1.更新libyuv版本 2.支持输出sdi hd数据

3f59dfc9 · wangguotao · e90327f6 · 3f59dfc9 · 3f59dfc9 · 3f59dfc9
Commit 3f59dfc9 authored Oct 10, 2024 by wangguotao
71 changed files
--- a/MomentaMedia/FigureOut.log
+++ b/MomentaMedia/FigureOut.log
--- a/MomentaMedia/MomentaMedia.vcxproj
+++ b/MomentaMedia/MomentaMedia.vcxproj
@@ -65,7 +65,7 @@
    <ClCompile>
      <AdditionalIncludeDirectories>.\ThirdParty\stb_image\;.\ThirdParty\rabbitmq\include;.\ThirdParty\ffmpeg-master-latest-win64-gpl-shared\include;.\ThirdParty\libyuv\include;.\ThirdParty\OpenCV\include;.\ThirdParty\NewTek\include;.\ThirdParty\BlackmagicDesign\include;.\include;%(AdditionalIncludeDirectories);$(Qt_INCLUDEPATH_)</AdditionalIncludeDirectories>
      <ShowIncludes>false</ShowIncludes>
-      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_;HAVE_CONFIG_H;LIBYUV_USING_SHARED_LIBRARY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <OpenMPSupport>true</OpenMPSupport>
    </ClCompile>
    <Link>
@@ -131,6 +131,30 @@
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="include\BlackMagicDesign\DeckLinkDeviceDiscovery.h" />
+    <QtMoc Include="include\Threads\VideoScaleThread.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\basic_types.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\compare.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\compare_row.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert_argb.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert_from.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert_from_argb.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\cpu_id.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\loongson_intrinsics.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\macros_msa.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\mjpeg_decoder.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\planar_functions.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\rotate.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\rotate_argb.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\rotate_row.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\row.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_argb.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_rgb.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_row.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_uv.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\version.h" />
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\video_common.h" />
    <QtMoc Include="include\Threads\ConsumerMqThread.h" />
    <QtMoc Include="include\Threads\ProcessMaskThread.h" />
    <QtMoc Include="include\Threads\ReplayThread.h" />
@@ -157,27 +181,6 @@
    <ClInclude Include="include\Utils\SSEFunction.h" />
    <ClInclude Include="include\Utils\yuv4k.h" />
    <ClInclude Include="ThirdParty\libyuv\include\libyuv.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\basic_types.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\compare.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\compare_row.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert_argb.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert_from.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\convert_from_argb.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\cpu_id.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\macros_msa.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\mjpeg_decoder.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\planar_functions.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\rotate.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\rotate_argb.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\rotate_row.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\row.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_argb.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_row.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_uv.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\version.h" />
-    <ClInclude Include="ThirdParty\libyuv\include\libyuv\video_common.h" />
    <QtMoc Include="include\BlackMagicDesign\DeckLinkInputDevice.h" />
    <QtMoc Include="include\BlackMagicDesign\DeckLinkOutputDevice.h" />
    <QtMoc Include="include\BlackMagicDesign\DeckLinkOutputPage.h" />
@@ -241,6 +244,7 @@
    <ClCompile Include="src\Threads\DecodeMaskThread.cpp" />
    <ClCompile Include="src\Threads\ProcessMaskThread.cpp" />
    <ClCompile Include="src\Threads\ReplayThread.cpp" />
+    <ClCompile Include="src\Threads\VideoScaleThread.cpp" />
    <ClCompile Include="src\Threads\ZoomThread.cpp" />
    <ClCompile Include="src\TimePlus.cpp" />
    <ClCompile Include="src\NDI\NDIOutputThread.cpp" />

--- a/MomentaMedia/MomentaMedia.vcxproj.filters
+++ b/MomentaMedia/MomentaMedia.vcxproj.filters
@@ -140,6 +140,9 @@
    <QtMoc Include="include\BlackMagicDesign\OpenFile.h">
      <Filter>Header Files\BlackMagicDesign</Filter>
    </QtMoc>
+    <QtMoc Include="include\Threads\VideoScaleThread.h">
+      <Filter>Header Files\Threads</Filter>
+    </QtMoc>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="include\stdafx.h">
@@ -202,6 +205,57 @@
    <ClInclude Include="ThirdParty\libyuv\include\libyuv.h">
      <Filter>ThirdParty\libuv\include</Filter>
    </ClInclude>
+    <ClInclude Include="include\Utils\AudioConvert.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\Base64.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\VideoScale.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\MaskBuffer.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\Algorithm.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Record\Record.h">
+      <Filter>Header Files\Record</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Record\RecordStore.h">
+      <Filter>Header Files\Record</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Record\RecordThread.h">
+      <Filter>Header Files\Record</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\SafeMap.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\SSEFunction.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\Memory4k.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\yuv4k.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\SampleDeque.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\Settings.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\FastMemcpy_Avx.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\FastMemcpy.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
+    <ClInclude Include="include\Utils\Computer.h">
+      <Filter>Header Files\Utils</Filter>
+    </ClInclude>
    <ClInclude Include="ThirdParty\libyuv\include\libyuv\video_common.h">
      <Filter>ThirdParty\libuv\include\include</Filter>
    </ClInclude>
@@ -214,6 +268,9 @@
    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_row.h">
      <Filter>ThirdParty\libuv\include\include</Filter>
    </ClInclude>
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_rgb.h">
+      <Filter>ThirdParty\libuv\include\include</Filter>
+    </ClInclude>
    <ClInclude Include="ThirdParty\libyuv\include\libyuv\scale_argb.h">
      <Filter>ThirdParty\libuv\include\include</Filter>
    </ClInclude>
@@ -241,6 +298,9 @@
    <ClInclude Include="ThirdParty\libyuv\include\libyuv\macros_msa.h">
      <Filter>ThirdParty\libuv\include\include</Filter>
    </ClInclude>
+    <ClInclude Include="ThirdParty\libyuv\include\libyuv\loongson_intrinsics.h">
+      <Filter>ThirdParty\libuv\include\include</Filter>
+    </ClInclude>
    <ClInclude Include="ThirdParty\libyuv\include\libyuv\cpu_id.h">
      <Filter>ThirdParty\libuv\include\include</Filter>
    </ClInclude>
@@ -265,57 +325,6 @@
    <ClInclude Include="ThirdParty\libyuv\include\libyuv\basic_types.h">
      <Filter>ThirdParty\libuv\include\include</Filter>
    </ClInclude>
-    <ClInclude Include="include\Utils\AudioConvert.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\Base64.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\VideoScale.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\MaskBuffer.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\Algorithm.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Record\Record.h">
-      <Filter>Header Files\Record</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Record\RecordStore.h">
-      <Filter>Header Files\Record</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Record\RecordThread.h">
-      <Filter>Header Files\Record</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\SafeMap.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\SSEFunction.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\Memory4k.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\yuv4k.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\SampleDeque.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\Settings.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\FastMemcpy_Avx.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\FastMemcpy.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
-    <ClInclude Include="include\Utils\Computer.h">
-      <Filter>Header Files\Utils</Filter>
-    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <QtRcc Include="Form\MomentaMedia.qrc">
@@ -454,6 +463,9 @@
    <ClCompile Include="src\BlackMagicDesign\OpenFile.cpp">
      <Filter>Source Files\BlackMagicDesign</Filter>
    </ClCompile>
+    <ClCompile Include="src\Threads\VideoScaleThread.cpp">
+      <Filter>Source Files\Threads</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <QtUic Include="Form\TimePlus.ui">

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/compare_row.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/compare_row.h
@@ -28,7 +28,10 @@ extern "C" {
 #endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
@@ -75,8 +78,16 @@ extern "C" {
 // The following are available for Neon:
 #if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SUMSQUAREERROR_NEON
 #define HAS_HAMMINGDISTANCE_NEON
+#define HAS_SUMSQUAREERROR_NEON
+#endif
+// The following are available for AArch64 Neon:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_HASHDJB2_NEON
+#define HAS_HAMMINGDISTANCE_NEON_DOTPROD
+#define HAS_SUMSQUAREERROR_NEON_DOTPROD
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -99,6 +110,9 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
 uint32_t HammingDistance_NEON(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count);
+uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
+                                      const uint8_t* src_b,
+                                      int count);
 uint32_t HammingDistance_MSA(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count);
@@ -114,6 +128,9 @@ uint32_t SumSquareError_AVX2(const uint8_t* src_a,
 uint32_t SumSquareError_NEON(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count);
+uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
+                                     const uint8_t* src_b,
+                                     int count);
 uint32_t SumSquareError_MSA(const uint8_t* src_a,
                            const uint8_t* src_b,
                            int count);
@@ -121,6 +138,7 @@ uint32_t SumSquareError_MSA(const uint8_t* src_a,
 uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
 uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed);
+uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed);
 #ifdef __cplusplus
 }  // extern "C"

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/convert.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/convert.h
@@ -367,6 +367,23 @@ int I212ToI422(const uint16_t* src_y,
               int width,
               int height);
+#define H212ToH420 I212ToI420
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 #define H412ToH444 I412ToI444
 LIBYUV_API
 int I412ToI444(const uint16_t* src_y,
@@ -384,6 +401,23 @@ int I412ToI444(const uint16_t* src_y,
               int width,
               int height);
+#define H412ToH420 I412ToI420
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 #define I412ToI012 I410ToI010
 #define H410ToH010 I410ToI010
 #define H412ToH012 I410ToI010
@@ -751,6 +785,21 @@ int ARGBToI420(const uint8_t* src_argb,
               int width,
               int height);
+// Convert ARGB to I420 with Alpha
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    uint8_t* dst_a,
+                    int dst_stride_a,
+                    int width,
+                    int height);
 // BGRA little endian (argb in memory) to I420.
 LIBYUV_API
 int BGRAToI420(const uint8_t* src_bgra,

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/convert_argb.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/convert_argb.h
@@ -67,6 +67,8 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants;  // BT.2020 full
  I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
 #define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
  I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
 #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
  I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
 #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
@@ -1902,6 +1904,26 @@ int NV21ToRGB24Matrix(const uint8_t* src_y,
                      int width,
                      int height);
+// Convert YUY2 to ARGB with matrix.
+LIBYUV_API
+int YUY2ToARGBMatrix(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert UYVY to ARGB with matrix.
+LIBYUV_API
+int UYVYToARGBMatrix(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
 // Convert Android420 to ARGB with matrix.
 LIBYUV_API
 int Android420ToARGBMatrix(const uint8_t* src_y,

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/cpu_id.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/cpu_id.h
@@ -21,40 +21,52 @@ extern "C" {
 // Internal flag to indicate cpuid requires initialization.
 static const int kCpuInitialized = 0x1;
-// These flags are only valid on ARM processors.
+// These flags are only valid on Arm processors.
 static const int kCpuHasARM = 0x2;
 static const int kCpuHasNEON = 0x4;
-// 0x8 reserved for future ARM flag.
+// Leave a gap to avoid setting kCpuHasX86.
+static const int kCpuHasNeonDotProd = 0x10;
+static const int kCpuHasNeonI8MM = 0x20;
+static const int kCpuHasSVE = 0x40;
+static const int kCpuHasSVE2 = 0x80;
+static const int kCpuHasSME = 0x100;
 // These flags are only valid on x86 processors.
-static const int kCpuHasX86 = 0x10;
+static const int kCpuHasX86 = 0x8;
-static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSE2 = 0x10;
-static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSSE3 = 0x20;
-static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE41 = 0x40;
-static const int kCpuHasSSE42 = 0x100;  // unused at this time.
+static const int kCpuHasSSE42 = 0x80;
-static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX = 0x100;
-static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasAVX2 = 0x200;
-static const int kCpuHasERMS = 0x800;
+static const int kCpuHasERMS = 0x400;
-static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasFMA3 = 0x800;
-static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasF16C = 0x1000;
-static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x2000;
-static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x4000;
-static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VNNI = 0x8000;
-static const int kCpuHasAVX512VNNI = 0x20000;
+static const int kCpuHasAVX512VBMI = 0x10000;
-static const int kCpuHasAVX512VBMI = 0x40000;
+static const int kCpuHasAVX512VBMI2 = 0x20000;
-static const int kCpuHasAVX512VBMI2 = 0x80000;
+static const int kCpuHasAVX512VBITALG = 0x40000;
-static const int kCpuHasAVX512VBITALG = 0x100000;
+static const int kCpuHasAVX10 = 0x80000;
-static const int kCpuHasAVX512VPOPCNTDQ = 0x200000;
+static const int kCpuHasAVXVNNI = 0x100000;
+static const int kCpuHasAVXVNNIINT8 = 0x200000;
+static const int kCpuHasAMXINT8 = 0x400000;
 // These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x400000;
+static const int kCpuHasMIPS = 0x800000;
-static const int kCpuHasMSA = 0x800000;
+static const int kCpuHasMSA = 0x1000000;
 // These flags are only valid on LOONGARCH processors.
 static const int kCpuHasLOONGARCH = 0x2000000;
 static const int kCpuHasLSX = 0x4000000;
 static const int kCpuHasLASX = 0x8000000;
+// These flags are only valid on RISCV processors.
+static const int kCpuHasRISCV = 0x10000000;
+static const int kCpuHasRVV = 0x20000000;
+static const int kCpuHasRVVZVFH = 0x40000000;
 // Optional init function. TestCpuFlag does an auto-init.
 // Returns cpu_info flags.
 LIBYUV_API
@@ -78,6 +90,19 @@ LIBYUV_API
 int ArmCpuCaps(const char* cpuinfo_name);
 LIBYUV_API
 int MipsCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int RiscvCpuCaps(const char* cpuinfo_name);
+#ifdef __aarch64__
+#if __linux__
+// On Linux, parse AArch64 features from getauxval(AT_HWCAP{,2}).
+LIBYUV_API
+int AArch64CpuCaps(unsigned long hwcap, unsigned long hwcap2);
+#else
+LIBYUV_API
+int AArch64CpuCaps();
+#endif
+#endif
 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/macros_msa.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/macros_msa.h
@@ -20,7 +20,7 @@
  ({                                                   \
    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
    uint32_t val_m;                                    \
-    asm volatile("lw  %[val_m],  %[psrc_lw_m]  \n"     \
+    asm("lw  %[val_m],  %[psrc_lw_m]  \n"              \
        : [val_m] "=r"(val_m)                          \
        : [psrc_lw_m] "m"(*psrc_lw_m));                \
    val_m;                                             \
@@ -31,7 +31,7 @@
  ({                                                   \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
    uint64_t val_m = 0;                                \
-    asm volatile("ld  %[val_m],  %[psrc_ld_m]  \n"     \
+    asm("ld  %[val_m],  %[psrc_ld_m]  \n"              \
        : [val_m] "=r"(val_m)                          \
        : [psrc_ld_m] "m"(*psrc_ld_m));                \
    val_m;                                             \
@@ -55,7 +55,7 @@
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
-    asm volatile("sw  %[val_m],  %[pdst_sw_m]  \n"      \
+    asm("sw  %[val_m],  %[pdst_sw_m]  \n"               \
        : [pdst_sw_m] "=m"(*pdst_sw_m)                  \
        : [val_m] "r"(val_m));                          \
  })
@@ -65,7 +65,7 @@
  ({                                                    \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint64_t val_m = (val);                             \
-    asm volatile("sd  %[val_m],  %[pdst_sd_m]  \n"      \
+    asm("sd  %[val_m],  %[pdst_sd_m]  \n"               \
        : [pdst_sd_m] "=m"(*pdst_sd_m)                  \
        : [val_m] "r"(val_m));                          \
  })
@@ -86,8 +86,7 @@
    uint8_t* psrc_lw_m = (uint8_t*)(psrc);      \
    uint32_t val_lw_m;                          \
                                                \
-    __asm__ volatile(                           \
+    asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
-        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
                                                \
        : [val_lw_m] "=&r"(val_lw_m)            \
@@ -102,8 +101,7 @@
    uint8_t* psrc_ld_m = (uint8_t*)(psrc);      \
    uint64_t val_ld_m = 0;                      \
                                                \
-    __asm__ volatile(                           \
+    asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
-        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
                                                \
        : [val_ld_m] "=&r"(val_ld_m)            \
@@ -130,7 +128,7 @@
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
-    asm volatile("usw  %[val_m],  %[pdst_sw_m]  \n"     \
+    asm("usw  %[val_m],  %[pdst_sw_m]  \n"              \
        : [pdst_sw_m] "=m"(*pdst_sw_m)                  \
        : [val_m] "r"(val_m));                          \
  })

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/planar_functions.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/planar_functions.h
@@ -30,7 +30,10 @@ extern "C" {
 #endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
@@ -827,15 +830,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
                     int width,
                     int height);
-typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
-                             const uint8_t* src_argb1,
-                             uint8_t* dst_argb,
-                             int width);
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
 // Alpha Blend ARGB images and store to destination.
 // Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/rotate_row.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/rotate_row.h
@@ -26,9 +26,20 @@ extern "C" {
 #if defined(__native_client__)
 #define LIBYUV_DISABLE_NEON
 #endif
+// clang >= 19.0.0 required for SME
+#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
+#if __clang_major__ < 19
+#define LIBYUV_DISABLE_SME
+#endif
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
@@ -42,6 +53,8 @@ extern "C" {
 // The following are available for GCC 32 or 64 bit:
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSE4X4_32_SSE2
+#define HAS_TRANSPOSE4X4_32_AVX2
 #endif
 // The following are available for 64 bit GCC:
@@ -52,8 +65,18 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#if defined(__aarch64__)
+#define HAS_TRANSPOSEWX16_NEON
+#else
 #define HAS_TRANSPOSEWX8_NEON
+#endif
 #define HAS_TRANSPOSEUVWX8_NEON
+#define HAS_TRANSPOSE4X4_32_NEON
+#endif
+#if !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
+#define HAS_TRANSPOSEWXH_SME
+#define HAS_TRANSPOSEUVWXH_SME
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -88,6 +111,17 @@ void TransposeWx8_NEON(const uint8_t* src,
                       uint8_t* dst,
                       int dst_stride,
                       int width);
+void TransposeWx16_NEON(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width);
+void TransposeWxH_SME(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst,
+                      int dst_stride,
+                      int width,
+                      int height);
 void TransposeWx8_SSSE3(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
@@ -114,6 +148,11 @@ void TransposeWx8_Any_NEON(const uint8_t* src,
                           uint8_t* dst,
                           int dst_stride,
                           int width);
+void TransposeWx16_Any_NEON(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst,
+                            int dst_stride,
+                            int width);
 void TransposeWx8_Any_SSSE3(const uint8_t* src,
                            int src_stride,
                            uint8_t* dst,
@@ -172,6 +211,14 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width);
+void TransposeUVWxH_SME(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst_a,
+                        int dst_stride_a,
+                        uint8_t* dst_b,
+                        int dst_stride_b,
+                        int width,
+                        int height);
 void TransposeUVWx16_MSA(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
@@ -240,14 +287,19 @@ void Transpose4x4_32_NEON(const uint8_t* src,
                          int dst_stride,
                          int width);
-void Transpose4x4_32_C(const uint8_t* src,
+void Transpose4x4_32_SSE2(const uint8_t* src,
                          int src_stride,
                          uint8_t* dst,
                          int dst_stride,
                          int width);
-// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_AVX2(const uint8_t* src,
-void Transpose8x8_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+void Transpose4x4_32_C(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst,
                       int dst_stride,

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/row.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/row.h
@@ -31,7 +31,10 @@ extern "C" {
 #endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
@@ -161,7 +164,6 @@ extern "C" {
 #define HAS_ARGBSEPIAROW_SSSE3
 #define HAS_ARGBSHADEROW_SSE2
 #define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_BLENDPLANEROW_SSSE3
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -171,9 +173,6 @@ extern "C" {
 #define HAS_SOBELXROW_SSE2
 #define HAS_SOBELXYROW_SSE2
 #define HAS_SOBELYROW_SSE2
-#if !defined(LIBYUV_BIT_EXACT)
-#define HAS_ARGBATTENUATEROW_SSSE3
-#endif
 // The following functions fail on gcc/clang 32 bit with fpic and framepointer.
 // caveat: clangcl uses row_win.cc which works.
@@ -241,11 +240,7 @@ extern "C" {
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
-#define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_BLENDPLANEROW_AVX2
-#if !defined(LIBYUV_BIT_EXACT)
-#define HAS_ARGBATTENUATEROW_AVX2
-#endif
 #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
    defined(_MSC_VER)
@@ -285,14 +280,15 @@ extern "C" {
 #define HAS_ABGRTOAR30ROW_SSSE3
 #define HAS_ABGRTOYJROW_SSSE3
 #define HAS_AR64TOARGBROW_SSSE3
+#define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBTOAB64ROW_SSSE3
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_ARGBTOAR64ROW_SSSE3
+#define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
-#define HAS_DETILEROW_SSE2
 #define HAS_DETILEROW_16_SSE2
-#define HAS_DETILEROW_16_AVX
+#define HAS_DETILEROW_SSE2
 #define HAS_DETILESPLITUVROW_SSSE3
 #define HAS_DETILETOYUY2_SSE2
 #define HAS_HALFMERGEUVROW_SSSE3
@@ -345,13 +341,16 @@ extern "C" {
 #define HAS_ABGRTOYJROW_AVX2
 #define HAS_ABGRTOYROW_AVX2
 #define HAS_AR64TOARGBROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBTOAB64ROW_AVX2
 #define HAS_ARGBTOAR30ROW_AVX2
 #define HAS_ARGBTOAR64ROW_AVX2
 #define HAS_ARGBTORAWROW_AVX2
 #define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_DETILEROW_16_AVX
 #define HAS_DIVIDEROW_16_AVX2
 #define HAS_HALFMERGEUVROW_AVX2
 #define HAS_I210TOAR30ROW_AVX2
@@ -402,9 +401,8 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
-// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
+    (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_MERGEUVROW_AVX512BW
 #endif
@@ -555,6 +553,67 @@ extern "C" {
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_ARGBTOAR30ROW_NEON
+#define HAS_ABGRTOAR30ROW_NEON
+#define HAS_I210ALPHATOARGBROW_NEON
+#define HAS_I410ALPHATOARGBROW_NEON
+#define HAS_I210TOARGBROW_NEON
+#define HAS_I410TOARGBROW_NEON
+#define HAS_I210TOAR30ROW_NEON
+#define HAS_I410TOAR30ROW_NEON
+#define HAS_I212TOARGBROW_NEON
+#define HAS_I212TOAR30ROW_NEON
+#define HAS_I422TOAR30ROW_NEON
+#define HAS_P210TOAR30ROW_NEON
+#define HAS_P210TOARGBROW_NEON
+#define HAS_P410TOAR30ROW_NEON
+#define HAS_P410TOARGBROW_NEON
+#define HAS_ABGRTOYJROW_NEON_DOTPROD
+#define HAS_ABGRTOYROW_NEON_DOTPROD
+#define HAS_ARGBTOYJROW_NEON_DOTPROD
+#define HAS_ARGBTOYROW_NEON_DOTPROD
+#define HAS_BGRATOYROW_NEON_DOTPROD
+#define HAS_RGBATOYJROW_NEON_DOTPROD
+#define HAS_RGBATOYROW_NEON_DOTPROD
+#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
+#define HAS_ARGBTOUV444ROW_NEON_I8MM
+#endif
+// The following are available on AArch64 SVE platforms:
+#if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
+#define HAS_ABGRTOUVJROW_SVE2
+#define HAS_ABGRTOUVROW_SVE2
+#define HAS_ARGB1555TOARGBROW_SVE2
+#define HAS_ARGBTORAWROW_SVE2
+#define HAS_ARGBTORGB24ROW_SVE2
+#define HAS_ARGBTORGB565DITHERROW_SVE2
+#define HAS_ARGBTORGB565ROW_SVE2
+#define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVROW_SVE2
+#define HAS_AYUVTOUVROW_SVE2
+#define HAS_AYUVTOVUROW_SVE2
+#define HAS_BGRATOUVROW_SVE2
+#define HAS_I400TOARGBROW_SVE2
+#define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I422TOARGBROW_SVE2
+#define HAS_I422TORGBAROW_SVE2
+#define HAS_I444ALPHATOARGBROW_SVE2
+#define HAS_I444TOARGBROW_SVE2
+#define HAS_NV12TOARGBROW_SVE2
+#define HAS_NV21TOARGBROW_SVE2
+#define HAS_RAWTOARGBROW_SVE2
+#define HAS_RAWTORGB24ROW_SVE2
+#define HAS_RAWTORGBAROW_SVE2
+#define HAS_RGB24TOARGBROW_SVE2
+#define HAS_RGBATOUVROW_SVE2
+#define HAS_UYVYTOARGBROW_SVE2
+#define HAS_YUY2TOARGBROW_SVE2
+#endif
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_GAUSSCOL_F32_NEON
 #define HAS_GAUSSROW_F32_NEON
 #define HAS_INTERPOLATEROW_16TO8_NEON
@@ -651,20 +710,47 @@ extern "C" {
 #define HAS_ARGB1555TOUVROW_LSX
 #define HAS_ARGB1555TOYROW_LSX
 #define HAS_ARGB4444TOARGBROW_LSX
+#define HAS_ARGBADDROW_LSX
+#define HAS_ARGBATTENUATEROW_LSX
 #define HAS_ARGBBLENDROW_LSX
 #define HAS_ARGBCOLORMATRIXROW_LSX
 #define HAS_ARGBEXTRACTALPHAROW_LSX
+#define HAS_ARGBGRAYROW_LSX
+#define HAS_ARGBSEPIAROW_LSX
+#define HAS_ARGBSHADEROW_LSX
+#define HAS_ARGBSHUFFLEROW_LSX
+#define HAS_ARGBSUBTRACTROW_LSX
 #define HAS_ARGBQUANTIZEROW_LSX
 #define HAS_ARGBSETROW_LSX
+#define HAS_ARGBTOARGB1555ROW_LSX
+#define HAS_ARGBTOARGB4444ROW_LSX
+#define HAS_ARGBTORAWROW_LSX
+#define HAS_ARGBTORGB24ROW_LSX
+#define HAS_ARGBTORGB565ROW_LSX
+#define HAS_ARGBTORGB565DITHERROW_LSX
 #define HAS_ARGBTOUVJROW_LSX
+#define HAS_ARGBTOUV444ROW_LSX
+#define HAS_ARGBTOUVROW_LSX
 #define HAS_ARGBTOYJROW_LSX
+#define HAS_ARGBMIRRORROW_LSX
+#define HAS_ARGBMULTIPLYROW_LSX
 #define HAS_BGRATOUVROW_LSX
 #define HAS_BGRATOYROW_LSX
 #define HAS_I400TOARGBROW_LSX
 #define HAS_I444TOARGBROW_LSX
 #define HAS_INTERPOLATEROW_LSX
+#define HAS_I422ALPHATOARGBROW_LSX
+#define HAS_I422TOARGB1555ROW_LSX
+#define HAS_I422TOARGB4444ROW_LSX
+#define HAS_I422TORGB24ROW_LSX
+#define HAS_I422TORGB565ROW_LSX
+#define HAS_I422TORGBAROW_LSX
+#define HAS_I422TOUYVYROW_LSX
+#define HAS_I422TOYUY2ROW_LSX
 #define HAS_J400TOARGBROW_LSX
 #define HAS_MERGEUVROW_LSX
+#define HAS_MIRRORROW_LSX
+#define HAS_MIRRORUVROW_LSX
 #define HAS_MIRRORSPLITUVROW_LSX
 #define HAS_NV12TOARGBROW_LSX
 #define HAS_NV12TORGB565ROW_LSX
@@ -687,7 +773,13 @@ extern "C" {
 #define HAS_SOBELXYROW_LSX
 #define HAS_SPLITUVROW_LSX
 #define HAS_UYVYTOARGBROW_LSX
+#define HAS_UYVYTOUV422ROW_LSX
+#define HAS_UYVYTOUVROW_LSX
+#define HAS_UYVYTOYROW_LSX
 #define HAS_YUY2TOARGBROW_LSX
+#define HAS_YUY2TOUVROW_LSX
+#define HAS_YUY2TOUV422ROW_LSX
+#define HAS_YUY2TOYROW_LSX
 #define HAS_ARGBTOYROW_LSX
 #define HAS_ABGRTOYJROW_LSX
 #define HAS_RGBATOYJROW_LSX
@@ -695,6 +787,10 @@ extern "C" {
 #define HAS_RAWTOYJROW_LSX
 #endif
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_I422TOARGBROW_LSX
+#endif
 #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
 #define HAS_ARGB1555TOARGBROW_LASX
 #define HAS_ARGB1555TOUVROW_LASX
@@ -758,6 +854,74 @@ extern "C" {
 #define HAS_RAWTOYJROW_LASX
 #endif
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#if __riscv_v_intrinsic > 11000
+// Since v0.12, TUPLE_TYPE is introduced for segment load and store.
+#define LIBYUV_RVV_HAS_TUPLE_TYPE
+// Since v0.12, VXRM(fixed-point rounding mode) is included in arguments of
+// fixed-point intrinsics.
+#define LIBYUV_RVV_HAS_VXRM_ARG
+#endif
+#define HAS_COPYROW_RVV
+#define HAS_AB64TOARGBROW_RVV
+#define HAS_ABGRTOYJROW_RVV
+#define HAS_ABGRTOYROW_RVV
+#define HAS_AR64TOARGBROW_RVV
+#define HAS_AR64TOAB64ROW_RVV
+#define HAS_ARGBATTENUATEROW_RVV
+#define HAS_ARGBBLENDROW_RVV
+#define HAS_ARGBCOPYYTOALPHAROW_RVV
+#define HAS_ARGBEXTRACTALPHAROW_RVV
+#define HAS_ARGBTOAB64ROW_RVV
+#define HAS_ARGBTOABGRROW_RVV
+#define HAS_ARGBTOAR64ROW_RVV
+#define HAS_ARGBTOBGRAROW_RVV
+#define HAS_ARGBTORAWROW_RVV
+#define HAS_ARGBTORGB24ROW_RVV
+#define HAS_ARGBTORGBAROW_RVV
+#define HAS_ARGBTOYJROW_RVV
+#define HAS_ARGBTOYMATRIXROW_RVV
+#define HAS_ARGBTOYROW_RVV
+#define HAS_BGRATOYROW_RVV
+#define HAS_BLENDPLANEROW_RVV
+#define HAS_I400TOARGBROW_RVV
+#define HAS_I422ALPHATOARGBROW_RVV
+#define HAS_I422TOARGBROW_RVV
+#define HAS_I422TORGB24ROW_RVV
+#define HAS_I422TORGBAROW_RVV
+#define HAS_I444ALPHATOARGBROW_RVV
+#define HAS_I444TOARGBROW_RVV
+#define HAS_I444TORGB24ROW_RVV
+#define HAS_INTERPOLATEROW_RVV
+#define HAS_J400TOARGBROW_RVV
+#define HAS_MERGEARGBROW_RVV
+#define HAS_MERGERGBROW_RVV
+#define HAS_MERGEUVROW_RVV
+#define HAS_MERGEXRGBROW_RVV
+#define HAS_NV12TOARGBROW_RVV
+#define HAS_NV12TORGB24ROW_RVV
+#define HAS_NV21TOARGBROW_RVV
+#define HAS_NV21TORGB24ROW_RVV
+#define HAS_RAWTOARGBROW_RVV
+#define HAS_RAWTORGB24ROW_RVV
+#define HAS_RAWTORGBAROW_RVV
+#define HAS_RAWTOYJROW_RVV
+#define HAS_RAWTOYROW_RVV
+#define HAS_RGB24TOARGBROW_RVV
+#define HAS_RGB24TOYJROW_RVV
+#define HAS_RGB24TOYROW_RVV
+#define HAS_RGBATOARGBROW_RVV
+#define HAS_RGBATOYJROW_RVV
+#define HAS_RGBATOYMATRIXROW_RVV
+#define HAS_RGBATOYROW_RVV
+#define HAS_RGBTOYMATRIXROW_RVV
+#define HAS_SPLITARGBROW_RVV
+#define HAS_SPLITRGBROW_RVV
+#define HAS_SPLITUVROW_RVV
+#define HAS_SPLITXRGBROW_RVV
+#endif
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
 #if defined(VISUALC_HAS_AVX2)
 #define SIMD_ALIGNED(var) __declspec(align(32)) var
@@ -817,8 +981,8 @@ typedef uint32_t ulvec32[8];
 typedef uint8_t ulvec8[32];
 #endif
-#if defined(__aarch64__) || defined(__arm__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
-// This struct is for ARM color conversion.
+// This struct is for ARM and RISC-V color conversion.
 struct YuvConstants {
  uvec8 kUVCoeff;
  vec16 kRGBCoeffBias;
@@ -852,14 +1016,6 @@ struct YuvConstants {
  free(var##_mem);                  \
  var = NULL
-#define align_buffer_64_16(var, size)                                        \
-  void* var##_mem = malloc((size)*2 + 63);                      /* NOLINT */ \
-  uint16_t* var = (uint16_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
-#define free_aligned_buffer_64_16(var) \
-  free(var##_mem);                     \
-  var = NULL
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
 #else
@@ -924,24 +1080,92 @@ struct YuvConstants {
    IACA_UD_BYTES      \
  }
+void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
+                             const uint16_t* src_u,
+                             const uint16_t* src_v,
+                             const uint16_t* src_a,
+                             uint8_t* rgb_buf,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
+                             const uint16_t* src_u,
+                             const uint16_t* src_v,
+                             const uint16_t* src_a,
+                             uint8_t* rgb_buf,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I444ToARGBRow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I210ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I410ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I210ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I410ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I212ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I212ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I422ToARGBRow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToAR30Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -949,6 +1173,13 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -956,12 +1187,25 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I422ToRGBARow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -991,6 +1235,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void NV12ToARGBRow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
@@ -1001,6 +1250,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void NV21ToARGBRow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
@@ -1019,10 +1273,62 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgba,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444ToARGBRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
@@ -1042,6 +1348,12 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width);
+void I422ToARGBRow_LSX(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
 void I422ToARGBRow_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -1054,6 +1366,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width);
+void I422ToRGBARow_LSX(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
 void I422ToRGBARow_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -1067,6 +1385,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width);
+void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -1080,6 +1405,12 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I422ToRGB24Row_LSX(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I422ToRGB24Row_LASX(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -1092,6 +1423,12 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I422ToRGB565Row_LSX(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
 void I422ToRGB565Row_LASX(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
@@ -1104,6 +1441,12 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y,
                           uint8_t* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width);
+void I422ToARGB4444Row_LSX(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 void I422ToARGB4444Row_LASX(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
@@ -1116,6 +1459,12 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y,
                           uint8_t* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width);
+void I422ToARGB1555Row_LSX(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 void I422ToARGB1555Row_LASX(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
@@ -1184,6 +1533,26 @@ void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width);
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
 void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1213,6 +1582,22 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width);
 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
+                             uint8_t* dst_y,
+                             int width);
+void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
+                              uint8_t* dst_yj,
+                              int width);
+void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
+                              uint8_t* dst_yj,
+                              int width);
+void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
+                              uint8_t* dst_yj,
+                              int width);
+void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
 void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
 void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
 void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
@@ -1227,11 +1612,20 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width);
+void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ARGBToUV444Row_MSA(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
@@ -1241,11 +1635,20 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
+void ARGBToUVRow_LSX(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
 void ARGBToUVRow_LASX(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
+void ARGBToUV444Row_LSX(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
 void ARGBToUV444Row_LASX(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -1255,26 +1658,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_uj,
                       uint8_t* dst_vj,
                       int width);
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_uj,
+                       uint8_t* dst_vj,
+                       int width);
 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
+void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
@@ -1423,6 +1851,15 @@ void RAWToUVRow_LASX(const uint8_t* src_raw,
 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
+                             uint8_t* dst_y,
+                             int width);
+void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
+                             uint8_t* dst_y,
+                             int width);
+void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
+                             uint8_t* dst_y,
+                             int width);
 void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
 void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
 void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
@@ -1434,6 +1871,13 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width);
+void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width);
 void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
 void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
 void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
@@ -1493,12 +1937,33 @@ void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
 void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
 void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
 void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
 void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
 void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
 void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
 void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1658,11 +2123,20 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width);
+void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
+                                  uint8_t* dst_u,
+                                  uint8_t* dst_v,
+                                  int width);
 void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
+void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
@@ -1672,11 +2146,20 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width);
+void ARGBToUVRow_Any_LSX(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
 void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr,
                          int src_stride_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
+void ARGBToUV444Row_Any_LSX(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -1686,26 +2169,51 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+                           int src_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+                           int src_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
+void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
+void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
+void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
@@ -1940,6 +2448,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1947,17 +2456,20 @@ void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
@@ -1987,6 +2499,7 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
 void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
 void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
 void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
@@ -1999,6 +2512,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int width);
 void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int width);
@@ -2041,6 +2555,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width);
+void SplitUVRow_RVV(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
 void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -2202,6 +2720,10 @@ void MergeUVRow_LSX(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width);
+void MergeUVRow_RVV(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
 void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
                         const uint8_t* uv_buf,
                         uint8_t* dst_ptr,
@@ -2265,7 +2787,12 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width);
 void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width);
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
                     uint8_t* dst_b,
@@ -2296,6 +2823,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width);
+void MergeRGBRow_RVV(const uint8_t* src_r,
+                     const uint8_t* src_g,
+                     const uint8_t* src_b,
+                     uint8_t* dst_rgb,
+                     int width);
 void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
                           const uint8_t* u_buf,
                           const uint8_t* v_buf,
@@ -2330,6 +2862,12 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width);
+void MergeARGBRow_RVV(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      const uint8_t* src_a,
+                      uint8_t* dst_argb,
+                      int width);
 void MergeARGBRow_Any_SSE2(const uint8_t* y_buf,
                           const uint8_t* u_buf,
                           const uint8_t* v_buf,
@@ -2378,6 +2916,12 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width);
+void SplitARGBRow_RVV(const uint8_t* src_rgba,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      uint8_t* dst_a,
+                      int width);
 void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr,
                           uint8_t* dst_r,
                           uint8_t* dst_g,
@@ -2422,6 +2966,11 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width);
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_argb,
+                      int width);
 void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf,
                           const uint8_t* u_buf,
                           const uint8_t* v_buf,
@@ -2462,6 +3011,11 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width);
+void SplitXRGBRow_RVV(const uint8_t* src_rgba,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width);
 void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr,
                           uint8_t* dst_r,
                           uint8_t* dst_g,
@@ -2805,6 +3359,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -2838,6 +3393,9 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
 void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
                             uint8_t* dst_a,
                             int width);
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
 void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int width);
@@ -2857,6 +3415,7 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr,
 void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
 void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width);
 void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int width);
@@ -2904,6 +3463,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb,
                        uint8_t* dst_argb,
                        const uint8_t* shuffler,
                        int width);
+void ARGBShuffleRow_LSX(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width);
 void ARGBShuffleRow_LASX(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
@@ -2924,6 +3487,10 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            const uint8_t* param,
                            int width);
+void ARGBShuffleRow_Any_LSX(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const uint8_t* param,
+                            int width);
 void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             const uint8_t* param,
@@ -2951,19 +3518,29 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width);
+void RGB24ToARGBRow_SVE2(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width);
 void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width);
+void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_SVE2(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width);
@@ -2979,6 +3556,9 @@ void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width);
+void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width);
 void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
                           uint8_t* dst_argb,
                           int width);
@@ -3123,15 +3703,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
 void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
                             uint8_t* dst_rgb,
-                             const uint32_t dither4,
+                             uint32_t dither4,
                             int width);
 void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
-                                const uint32_t dither4,
+                                uint32_t dither4,
                                int width);
 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
-                                const uint32_t dither4,
+                                uint32_t dither4,
                                int width);
 void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -3147,10 +3727,17 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width);
+void ARGBToRGB24Row_SVE2(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width);
 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToRAWRow_SVE2(const uint8_t* src_argb, uint8_t* dst_raw, int width);
 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width);
+void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width);
 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width);
@@ -3159,7 +3746,11 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            int width);
 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
-                                const uint32_t dither4,
+                                uint32_t dither4,
+                                int width);
+void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                uint32_t dither4,
                                int width);
 void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -3172,23 +3763,44 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
                           int width);
 void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
                               uint8_t* dst_rgb,
-                               const uint32_t dither4,
+                               uint32_t dither4,
+                               int width);
+void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               uint32_t dither4,
                               int width);
 void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
-                                const uint32_t dither4,
+                                uint32_t dither4,
                                int width);
+void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
 void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
                            uint8_t* dst_rgb,
                            int width);
+void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
 void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
                            uint8_t* dst_rgb,
                            int width);
+void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width);
+void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width);
+void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width);
+void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width);
+void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width);
 void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -3202,6 +3814,8 @@ void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
 void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
 void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
 void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width);
+void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width);
 void AR64ShuffleRow_C(const uint8_t* src_ar64,
                      uint8_t* dst_ar64,
                      const uint8_t* shuffler,
@@ -3226,6 +3840,12 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
 void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
 void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
 void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width);
+void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width);
 void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr,
                             uint16_t* dst_ptr,
                             int width);
@@ -3268,6 +3888,7 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
@@ -4189,6 +4810,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I400ToARGBRow_SVE2(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I400ToARGBRow_MSA(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
@@ -4197,6 +4822,10 @@ void I400ToARGBRow_LSX(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width);
+void I400ToARGBRow_RVV(const uint8_t* src_y,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
 void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            const struct YuvConstants* param,
@@ -4235,6 +4864,10 @@ void ARGBBlendRow_LSX(const uint8_t* src_argb0,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width);
+void ARGBBlendRow_RVV(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width);
 void ARGBBlendRow_C(const uint8_t* src_argb,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
@@ -4261,6 +4894,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
+void BlendPlaneRow_RVV(const uint8_t* src0,
+                       const uint8_t* src1,
+                       const uint8_t* alpha,
+                       uint8_t* dst,
+                       int width);
 void BlendPlaneRow_C(const uint8_t* src0,
                     const uint8_t* src1,
                     const uint8_t* alpha,
@@ -4305,10 +4943,18 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
                             const uint8_t* uv_buf,
                             uint8_t* dst_ptr,
                             int width);
+void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
 void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width);
+void ARGBMultiplyRow_Any_LSX(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
@@ -4351,10 +4997,18 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
                        const uint8_t* uv_buf,
                        uint8_t* dst_ptr,
                        int width);
+void ARGBAddRow_LSX(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
 void ARGBAddRow_LASX(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width);
+void ARGBAddRow_Any_LSX(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
 void ARGBAddRow_Any_LASX(const uint8_t* y_buf,
                         const uint8_t* uv_buf,
                         uint8_t* dst_ptr,
@@ -4398,10 +5052,18 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
                             const uint8_t* uv_buf,
                             uint8_t* dst_ptr,
                             int width);
+void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
 void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width);
+void ARGBSubtractRow_Any_LSX(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
@@ -4494,21 +5156,37 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   const uint32_t param,
                                   int width);
+void ARGBToRGB565DitherRow_Any_LSX(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   const uint32_t param,
+                                   int width);
 void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr,
                                    const uint32_t param,
                                    int width);
+void ARGBToRGB24Row_Any_LSX(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
+void ARGBToRAWRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_LSX(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
 void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
+void ARGBToARGB1555Row_Any_LSX(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
 void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int width);
+void ARGBToARGB4444Row_Any_LSX(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
 void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int width);
@@ -4531,6 +5209,42 @@ void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
                            uint8_t* dst_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);
+void I210ToARGBRow_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I410ToARGBRow_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToAR30Row_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I410ToAR30Row_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I212ToARGBRow_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I212ToAR30Row_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
@@ -4545,6 +5259,20 @@ void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
                                 uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
+void I410AlphaToARGBRow_Any_NEON(const uint16_t* y_buf,
+                                 const uint16_t* u_buf,
+                                 const uint16_t* v_buf,
+                                 const uint16_t* a_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I210AlphaToARGBRow_Any_NEON(const uint16_t* y_buf,
+                                 const uint16_t* u_buf,
+                                 const uint16_t* v_buf,
+                                 const uint16_t* a_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
 void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
@@ -4575,6 +5303,12 @@ void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
+void I422ToAR30Row_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
                            const uint8_t* uv_buf,
                            uint8_t* dst_ptr,
@@ -4612,6 +5346,14 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);
+void ARGBToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 void P210ToARGBRow_NEON(const uint16_t* y_buf,
                        const uint16_t* uv_buf,
                        uint8_t* dst_argb,
@@ -4670,6 +5412,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
                           uint8_t* dst_ptr,
                           const struct YuvConstants* yuvconstants,
                           int width);
+void I422ToARGBRow_Any_LSX(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 void I422ToARGBRow_Any_LASX(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
@@ -4682,6 +5430,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
                           uint8_t* dst_ptr,
                           const struct YuvConstants* yuvconstants,
                           int width);
+void I422ToRGBARow_Any_LSX(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 void I422ToRGBARow_Any_LASX(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
@@ -4695,6 +5449,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
                                uint8_t* dst_ptr,
                                const struct YuvConstants* yuvconstants,
                                int width);
+void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                const uint8_t* a_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
 void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
@@ -4708,6 +5469,12 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
                            uint8_t* dst_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);
+void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf,
                             const uint8_t* u_buf,
                             const uint8_t* v_buf,
@@ -4720,6 +5487,12 @@ void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
@@ -4732,6 +5505,12 @@ void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
                               uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
+void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
 void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
@@ -4744,6 +5523,12 @@ void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
                               uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
+void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
 void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
@@ -4856,12 +5641,18 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_v,
                         int width);
 void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
 void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
 void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
+void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
 void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
                      int src_stride_yuy2,
                      uint8_t* dst_u,
@@ -4871,6 +5662,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
+void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
 void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -4932,12 +5727,18 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_v,
                             int width);
 void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
                         int src_stride_ptr,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width);
+void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
 void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr,
                          int src_stride_ptr,
                          uint8_t* dst_u,
@@ -4947,6 +5748,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -4992,12 +5797,18 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_v,
                         int width);
 void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
 void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
 void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
+void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
 void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
                      int src_stride_uyvy,
                      uint8_t* dst_u,
@@ -5007,6 +5818,10 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
+void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
 void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -5053,12 +5868,18 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_v,
                             int width);
 void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
                         int src_stride_ptr,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width);
+void UYVYToUVRow_Any_LSX(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
 void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr,
                          int src_stride_ptr,
                          uint8_t* dst_u,
@@ -5068,6 +5889,10 @@ void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void UYVYToUV422Row_Any_LSX(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -5093,19 +5918,35 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_uv,
                      int width);
+void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width);
 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_vu,
                      int width);
+void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width);
 void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_vu,
                          int width);
+void AYUVToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_vu,
+                          int width);
 void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_vu,
                          int width);
+void AYUVToVURow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_vu,
+                          int width);
 void I422ToYUY2Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
@@ -5182,6 +6023,11 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y,
                       const uint8_t* src_v,
                       uint8_t* dst_yuy2,
                       int width);
+void I422ToYUY2Row_LSX(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width);
 void I422ToYUY2Row_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -5192,6 +6038,11 @@ void I422ToUYVYRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_v,
                       uint8_t* dst_uyvy,
                       int width);
+void I422ToUYVYRow_LSX(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width);
 void I422ToUYVYRow_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -5202,6 +6053,11 @@ void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
                           const uint8_t* v_buf,
                           uint8_t* dst_ptr,
                           int width);
+void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
 void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
@@ -5212,6 +6068,11 @@ void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
                           const uint8_t* v_buf,
                           uint8_t* dst_ptr,
                           int width);
+void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
 void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
@@ -5232,9 +6093,15 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
 void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width);
+void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
 void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width);
+void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
 void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int width);
@@ -5247,6 +6114,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
 void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
+void ARGBAttenuateRow_Any_LSX(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int width);
@@ -5273,12 +6143,14 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);
 void ARGBColorMatrixRow_C(const uint8_t* src_argb,
@@ -5293,6 +6165,10 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width);
+void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  const int8_t* matrix_argb,
+                                  int width);
 void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
@@ -5358,6 +6234,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      int width,
                      uint32_t value);
+void ARGBShadeRow_LSX(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value);
 void ARGBShadeRow_LASX(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
@@ -5430,6 +6310,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int source_y_fraction);
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
 void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
                             const uint8_t* src_ptr,
                             ptrdiff_t src_stride_ptr,
@@ -5737,7 +6622,19 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
                             float* dst_ptr,
                             float param,
                             int width);
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
+                               float* dst,
+                               int width);
+// Convert a column of FP16 Half Floats to a row of FP32 Floats
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
+                                  int src_stride,       // stride in elements
+                                  float* dst,
+                                  int width);
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+                               uint16_t* dst,  // fp16
+                               int width);
 void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width,

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/scale.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/scale.h
@@ -27,8 +27,9 @@ typedef enum FilterMode {
 } FilterModeEnum;
 // Scale a YUV plane.
+// Returns 0 if successful.
 LIBYUV_API
-void ScalePlane(const uint8_t* src,
+int ScalePlane(const uint8_t* src,
               int src_stride,
               int src_width,
               int src_height,
@@ -39,7 +40,7 @@ void ScalePlane(const uint8_t* src,
               enum FilterMode filtering);
 LIBYUV_API
-void ScalePlane_16(const uint16_t* src,
+int ScalePlane_16(const uint16_t* src,
                  int src_stride,
                  int src_width,
                  int src_height,
@@ -51,7 +52,7 @@ void ScalePlane_16(const uint16_t* src,
 // Sample is expected to be in the low 12 bits.
 LIBYUV_API
-void ScalePlane_12(const uint16_t* src,
+int ScalePlane_12(const uint16_t* src,
                  int src_stride,
                  int src_width,
                  int src_height,

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/scale_row.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/scale_row.h
@@ -29,7 +29,10 @@ extern "C" {
 #endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
@@ -133,6 +136,8 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2_NEON
+#define HAS_SCALEUVROWDOWN2LINEAR_NEON
 #define HAS_SCALEUVROWDOWN2BOX_NEON
 #define HAS_SCALEUVROWDOWNEVEN_NEON
 #define HAS_SCALEROWUP2_LINEAR_NEON
@@ -173,6 +178,38 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_LSX
 #endif
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#define HAS_SCALEADDROW_RVV
+// TODO: Test ScaleARGBRowDownEven_RVV and enable it
+// #define HAS_SCALEARGBROWDOWNEVEN_RVV
+#if defined(__riscv_zve64x)
+#define HAS_SCALEUVROWDOWN4_RVV
+#endif
+#define HAS_SCALEUVROWDOWNEVEN_RVV
+#define HAS_SCALEARGBROWDOWN2_RVV
+#define HAS_SCALEARGBROWDOWN2BOX_RVV
+#define HAS_SCALEARGBROWDOWN2LINEAR_RVV
+#define HAS_SCALEARGBROWDOWNEVENBOX_RVV
+#define HAS_SCALEROWDOWN2_RVV
+#define HAS_SCALEROWDOWN2BOX_RVV
+#define HAS_SCALEROWDOWN2LINEAR_RVV
+#define HAS_SCALEROWDOWN34_0_BOX_RVV
+#define HAS_SCALEROWDOWN34_1_BOX_RVV
+#define HAS_SCALEROWDOWN34_RVV
+#define HAS_SCALEROWDOWN38_2_BOX_RVV
+#define HAS_SCALEROWDOWN38_3_BOX_RVV
+#define HAS_SCALEROWDOWN38_RVV
+#define HAS_SCALEROWDOWN4_RVV
+#define HAS_SCALEROWDOWN4BOX_RVV
+#define HAS_SCALEROWUP2_BILINEAR_RVV
+#define HAS_SCALEROWUP2_LINEAR_RVV
+#define HAS_SCALEUVROWDOWN2_RVV
+#define HAS_SCALEUVROWDOWN2BOX_RVV
+#define HAS_SCALEUVROWDOWN2LINEAR_RVV
+#define HAS_SCALEUVROWUP2_BILINEAR_RVV
+#define HAS_SCALEUVROWUP2_LINEAR_RVV
+#endif
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
                        int dst_width,
@@ -947,6 +984,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width);
+void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
 void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
@@ -1059,6 +1108,16 @@ void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width);
+void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
 void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
@@ -1141,6 +1200,18 @@ void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_uv,
                            int dst_width);
+void ScaleUVRowDown2_RVV(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_uv,
+                         int dst_width);
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
 void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
@@ -1201,6 +1272,16 @@ void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
                                int src_stepx,
                                uint8_t* dst_uv,
                                int dst_width);
+void ScaleUVRowDown4_RVV(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int32_t src_stepx,
+                         uint8_t* dst_uv,
+                         int dst_width);
+void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            int32_t src_stepx,
+                            uint8_t* dst_uv,
+                            int dst_width);
 void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            int32_t src_stepx,
@@ -1290,6 +1371,14 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
                                     uint8_t* dst_ptr,
                                     ptrdiff_t dst_stride,
                                     int dst_width);
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                ptrdiff_t dst_stride,
+                                int dst_width);
 void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int dst_width);
@@ -1742,6 +1831,61 @@ void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int dst_width);
+void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleRowDown2_RVV(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_ptr,
+                       int dst_width);
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width);
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/MomentaMedia/ThirdParty/libyuv/include/libyuv/version.h
+++ b/MomentaMedia/ThirdParty/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1860
+#define LIBYUV_VERSION 1892
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/MomentaMedia/ThirdParty/libyuv/lib/libyuv.dll
+++ b/MomentaMedia/ThirdParty/libyuv/lib/libyuv.dll
--- a/MomentaMedia/ThirdParty/libyuv/lib/yuv.lib
+++ b/MomentaMedia/ThirdParty/libyuv/lib/yuv.lib
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_H_
+#define INCLUDE_LIBYUV_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/mjpeg_decoder.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
+#endif  // INCLUDE_LIBYUV_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/basic_types.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/basic_types.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+#include <stddef.h>  // For size_t and NULL
+#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG)
+#define INT_TYPES_DEFINED
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#include <sys/types.h>  // for uintptr_t on x86
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+typedef unsigned int uint32_t;
+typedef int int32_t;
+typedef unsigned short uint16_t;
+typedef short int16_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+#else
+#include <stdint.h>  // for uintptr_t and C99 types
+#endif               // defined(_MSC_VER) && (_MSC_VER < 1600)
+// Types are deprecated.  Enable this macro for legacy types.
+#ifdef LIBYUV_LEGACY_TYPES
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint8_t uint8;
+typedef int8_t int8;
+#endif  // LIBYUV_LEGACY_TYPES
+#endif  // INT_TYPES_DEFINED
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif  // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) ||                      \
+     defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
+#else
+#define LIBYUV_API
+#endif  // __GNUC__
+#endif  // LIBYUV_API
+// TODO(fbarchard): Remove bool macros.
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/compare.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/compare.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
+#define INCLUDE_LIBYUV_COMPARE_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
+// Hamming Distance
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count);
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height);
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count);
+LIBYUV_API
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height);
+static const int kMaxPsnr = 128;
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count);
+LIBYUV_API
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
+LIBYUV_API
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
+LIBYUV_API
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
+LIBYUV_API
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_COMPARE_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/compare_row.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/compare_row.h
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_
+#define INCLUDE_LIBYUV_COMPARE_ROW_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+// The following are available for Visual C and GCC:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
+#define HAS_HASHDJB2_SSE41
+#define HAS_SUMSQUAREERROR_SSE2
+#define HAS_HAMMINGDISTANCE_SSE42
+#endif
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+    !defined(__clang__) &&                                                   \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+#endif
+// The following are available for GCC and clangcl:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_HAMMINGDISTANCE_SSSE3
+#endif
+// The following are available for GCC and clangcl:
+#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
+    (defined(__x86_64__) || defined(__i386__))
+#define HAS_HAMMINGDISTANCE_AVX2
+#endif
+// The following are available for Neon:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+#define HAS_HAMMINGDISTANCE_NEON
+#endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_HAMMINGDISTANCE_MSA
+#define HAS_SUMSQUAREERROR_MSA
+#endif
+uint32_t HammingDistance_C(const uint8_t* src_a,
+                           const uint8_t* src_b,
+                           int count);
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count);
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count);
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count);
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count);
+uint32_t HammingDistance_MSA(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count);
+uint32_t SumSquareError_C(const uint8_t* src_a,
+                          const uint8_t* src_b,
+                          int count);
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count);
+uint32_t SumSquareError_AVX2(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count);
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count);
+uint32_t SumSquareError_MSA(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count);
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
+uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_COMPARE_ROW_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#define INCLUDE_LIBYUV_CONVERT_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"  // For enum RotationMode.
+// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
+#include "libyuv/convert_argb.h"      // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h"      // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h"  // For WebRTC I420Rect, CopyPlane. b/618
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I422 to I444.
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I422 to I210.
+LIBYUV_API
+int I422ToI210(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert MM21 to NV12.
+LIBYUV_API
+int MM21ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert MM21 to I420.
+LIBYUV_API
+int MM21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert MM21 to YUY2
+LIBYUV_API
+int MM21ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
+// Convert MT2T to P010
+// Note that src_y and src_uv point to packed 10-bit values, so the Y plane will
+// be 10 / 8 times the dimensions of the image. Also for this reason,
+// src_stride_y and src_stride_uv are given in bytes.
+LIBYUV_API
+int MT2TToP010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+// Convert I420 to I444.
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+// Convert 10 bit YUV to 8 bit
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H210ToH420 I210ToI420
+LIBYUV_API
+int I210ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H210ToH422 I210ToI422
+LIBYUV_API
+int I210ToI422(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H410ToH420 I410ToI420
+LIBYUV_API
+int I410ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H410ToH444 I410ToI444
+LIBYUV_API
+int I410ToI444(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H012ToH420 I012ToI420
+LIBYUV_API
+int I012ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H212ToH422 I212ToI422
+LIBYUV_API
+int I212ToI422(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define H412ToH444 I412ToI444
+LIBYUV_API
+int I412ToI444(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define I412ToI012 I410ToI010
+#define H410ToH010 I410ToI010
+#define H412ToH012 I410ToI010
+LIBYUV_API
+int I410ToI010(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+#define I212ToI012 I210ToI010
+#define H210ToH010 I210ToI010
+#define H212ToH012 I210ToI010
+LIBYUV_API
+int I210ToI010(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I010 to I410
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I012 to I412
+#define I012ToI412 I010ToI410
+// Convert I210 to I410
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I212 to I412
+#define I212ToI412 I210ToI410
+// Convert I010 to P010
+LIBYUV_API
+int I010ToP010(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert I210 to P210
+LIBYUV_API
+int I210ToP210(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert I012 to P012
+LIBYUV_API
+int I012ToP012(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert I212 to P212
+LIBYUV_API
+int I212ToP212(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+#define J400ToJ420 I400ToI420
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert NV12 to NV24.
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert NV16 to NV24.
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert P010 to I010.
+LIBYUV_API
+int P010ToI010(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert P012 to I012.
+LIBYUV_API
+int P012ToI012(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert P010 to P410.
+LIBYUV_API
+int P010ToP410(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert P012 to P412.
+#define P012ToP412 P010ToP410
+// Convert P016 to P416.
+#define P016ToP416 P010ToP410
+// Convert P210 to P410.
+LIBYUV_API
+int P210ToP410(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert P212 to P412.
+#define P212ToP412 P210ToP410
+// Convert P216 to P416.
+#define P216ToP416 P210ToP410
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height);
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height);
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height);
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
+// RGB big endian (rgb in memory) to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height);
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_yj,
+                int dst_stride_yj,
+                int width,
+                int height);
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_yj,
+              int dst_stride_yj,
+              int width,
+              int height);
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
+             int* width,
+             int* height);
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_CONVERT_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert_argb.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert_argb.h
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"  // For enum RotationMode.
+#include "libyuv/scale.h"   // For enum FilterMode.
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Conversion matrix for YUV to RGB
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants;   // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants;   // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants;   // BT.709
+LIBYUV_API extern const struct YuvConstants kYuvF709Constants;   // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants;   // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvV2020Constants;  // BT.2020 full
+// Conversion matrix for YVU to BGR
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants;   // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants;   // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants;   // BT.709
+LIBYUV_API extern const struct YuvConstants kYvuF709Constants;   // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants;   // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuV2020Constants;  // BT.2020 full
+// Macros for end swapped destination Matrix conversions.
+// Swap UV and pass mirrored kYvuJPEGConstants matrix.
+// TODO(fbarchard): Add macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuvF709ConstantsVU kYvuF709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define kYuvV2020ConstantsVU kYvuV2020Constants
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+  NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+  NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+  NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+  NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+  I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+// Alias.
+#define ARGBToARGB ARGBCopy
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I444 to RGB24.
+LIBYUV_API
+int I444ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+// Convert I444 to RAW.
+LIBYUV_API
+int I444ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate);
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate);
+// Convert I422 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I422AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate);
+// Convert I422 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I422AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate);
+// Convert I444 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I444AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate);
+// Convert I444 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I444AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate);
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Alias.
+#define YToARGB I400ToARGB
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert NV12 to ABGR.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height);
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_vu,
+              int src_stride_vu,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_rgba,
+              int dst_stride_rgba,
+              int width,
+              int height);
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
+// Aliases
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert AR64 to ARGB.
+LIBYUV_API
+int AR64ToARGB(const uint16_t* src_ar64,
+               int src_stride_ar64,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert AB64 to ABGR.
+#define AB64ToABGR AR64ToARGB
+// Convert AB64 to ARGB.
+LIBYUV_API
+int AB64ToARGB(const uint16_t* src_ab64,
+               int src_stride_ab64,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert AR64 to ABGR.
+#define AR64ToABGR AB64ToARGB
+// Convert AR64 To AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+               int src_stride_ar64,
+               uint16_t* dst_ab64,
+               int dst_stride_ab64,
+               int width,
+               int height);
+// Convert AB64 To AR64.
+#define AB64ToAR64 AR64ToAB64
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height);
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+// Convert I422 to RGB24.
+LIBYUV_API
+int I422ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+// Convert I422 to RAW.
+LIBYUV_API
+int I422ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert I420 to AB30.
+LIBYUV_API
+int I420ToAB30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert H420 to AB30.
+LIBYUV_API
+int H420ToAB30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I444 to RGB24 with matrix.
+LIBYUV_API
+int I444ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+// Convert 10 bit 420 YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert 10 bit 420 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert 10 bit 444 YUV to ARGB with matrix.
+LIBYUV_API
+int I410ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// multiply 12 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert 10 bit 444 YUV to ARGB with matrix.
+LIBYUV_API
+int I410ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert P010 to ARGB with matrix.
+LIBYUV_API
+int P010ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert P210 to ARGB with matrix.
+LIBYUV_API
+int P210ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert P010 to AR30 with matrix.
+LIBYUV_API
+int P010ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert P210 to AR30 with matrix.
+LIBYUV_API
+int P210ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// P012 and P010 use most significant bits so the conversion is the same.
+// Convert P012 to ARGB with matrix.
+#define P012ToARGBMatrix P010ToARGBMatrix
+// Convert P012 to AR30 with matrix.
+#define P012ToAR30Matrix P010ToAR30Matrix
+// Convert P212 to ARGB with matrix.
+#define P212ToARGBMatrix P210ToARGBMatrix
+// Convert P212 to AR30 with matrix.
+#define P212ToAR30Matrix P210ToAR30Matrix
+// Convert P016 to ARGB with matrix.
+#define P016ToARGBMatrix P010ToARGBMatrix
+// Convert P016 to AR30 with matrix.
+#define P016ToAR30Matrix P010ToAR30Matrix
+// Convert P216 to ARGB with matrix.
+#define P216ToARGBMatrix P210ToARGBMatrix
+// Convert P216 to AR30 with matrix.
+#define P216ToAR30Matrix P210ToAR30Matrix
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                          int src_stride_y,
+                          const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          const uint8_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+// Convert I422 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I422AlphaToARGBMatrix(const uint8_t* src_y,
+                          int src_stride_y,
+                          const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          const uint8_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+// Convert I444 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I444AlphaToARGBMatrix(const uint8_t* src_y,
+                          int src_stride_y,
+                          const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          const uint8_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+// Convert I010 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I010AlphaToARGBMatrix(const uint16_t* src_y,
+                          int src_stride_y,
+                          const uint16_t* src_u,
+                          int src_stride_u,
+                          const uint16_t* src_v,
+                          int src_stride_v,
+                          const uint16_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+// Convert I210 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I210AlphaToARGBMatrix(const uint16_t* src_y,
+                          int src_stride_y,
+                          const uint16_t* src_u,
+                          int src_stride_u,
+                          const uint16_t* src_v,
+                          int src_stride_v,
+                          const uint16_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+// Convert I410 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I410AlphaToARGBMatrix(const uint16_t* src_y,
+                          int src_stride_y,
+                          const uint16_t* src_u,
+                          int src_stride_u,
+                          const uint16_t* src_v,
+                          int src_stride_v,
+                          const uint16_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_vu,
+                     int src_stride_vu,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_uv,
+                       int src_stride_uv,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height);
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_uv,
+                      int src_stride_uv,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_vu,
+                      int src_stride_vu,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height);
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+// Convert I422 to RGB24 with matrix.
+LIBYUV_API
+int I422ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height);
+// Convert I422 to RGB565 with specified color matrix.
+LIBYUV_API
+int I422ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height);
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+// Convert I420 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert I422 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I422ToARGBMatrixFilter(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert I422 to RGB24 with matrix and UV filter mode.
+LIBYUV_API
+int I422ToRGB24MatrixFilter(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_rgb24,
+                            int dst_stride_rgb24,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height,
+                            enum FilterMode filter);
+// Convert I420 to RGB24 with matrix and UV filter mode.
+LIBYUV_API
+int I420ToRGB24MatrixFilter(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_rgb24,
+                            int dst_stride_rgb24,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height,
+                            enum FilterMode filter);
+// Convert I010 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int I010ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert I210 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int I210ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert I010 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I010ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert I210 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I210ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert I420 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I420AlphaToARGBMatrixFilter(const uint8_t* src_y,
+                                int src_stride_y,
+                                const uint8_t* src_u,
+                                int src_stride_u,
+                                const uint8_t* src_v,
+                                int src_stride_v,
+                                const uint8_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter);
+// Convert I422 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I422AlphaToARGBMatrixFilter(const uint8_t* src_y,
+                                int src_stride_y,
+                                const uint8_t* src_u,
+                                int src_stride_u,
+                                const uint8_t* src_v,
+                                int src_stride_v,
+                                const uint8_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter);
+// Convert I010 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I010AlphaToARGBMatrixFilter(const uint16_t* src_y,
+                                int src_stride_y,
+                                const uint16_t* src_u,
+                                int src_stride_u,
+                                const uint16_t* src_v,
+                                int src_stride_v,
+                                const uint16_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter);
+// Convert I210 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I210AlphaToARGBMatrixFilter(const uint16_t* src_y,
+                                int src_stride_y,
+                                const uint16_t* src_u,
+                                int src_stride_u,
+                                const uint16_t* src_v,
+                                int src_stride_v,
+                                const uint16_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter);
+// Convert P010 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int P010ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert P210 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int P210ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert P010 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int P010ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert P210 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int P210ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter);
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "sample_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert_from.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert_from.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// See Also convert.h for conversions from formats to I420.
+// Convert 8 bit YUV to 10 bit.
+#define H420ToH010 I420ToI010
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert 8 bit YUV to 12 bit.
+#define H420ToH012 I420ToI012
+LIBYUV_API
+int I420ToI012(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height);
+LIBYUV_API
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
+LIBYUV_API
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
+// The following are from convert_argb.h
+// DEPRECATED: The prototypes will be removed in future.  Use convert_argb.h
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+//    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert_from_argb.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/convert_from_argb.h
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8_t(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert ARGB to AR64.
+LIBYUV_API
+int ARGBToAR64(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint16_t* dst_ar64,
+               int dst_stride_ar64,
+               int width,
+               int height);
+// Convert ABGR to AB64.
+#define ABGRToAB64 ARGBToAR64
+// Convert ARGB to AB64.
+LIBYUV_API
+int ARGBToAB64(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint16_t* dst_ab64,
+               int dst_stride_ab64,
+               int width,
+               int height);
+// Convert ABGR to AR64.
+#define ABGRToAR64 ARGBToAB64
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height);
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height);
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
+// Convert ABGR to J420. (JPeg full range I420).
+LIBYUV_API
+int ABGRToJ420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height);
+// Convert ABGR to J422.
+LIBYUV_API
+int ABGRToJ422(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height);
+// Convert ABGR to J400. (JPeg full range).
+LIBYUV_API
+int ABGRToJ400(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
+// Convert RGBA to J400. (JPeg full range).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8_t* src_argb,
+            int src_stride_argb,
+            uint8_t* dst_g,
+            int dst_stride_g,
+            int width,
+            int height);
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+// Convert ABGR To NV12.
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert ABGR To NV21.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
+// RAW to JNV21 full range NV21
+LIBYUV_API
+int RAWToJNV21(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/cpu_id.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/cpu_id.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#define INCLUDE_LIBYUV_CPU_ID_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Internal flag to indicate cpuid requires initialization.
+static const int kCpuInitialized = 0x1;
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;  // unused at this time.
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VNNI = 0x20000;
+static const int kCpuHasAVX512VBMI = 0x40000;
+static const int kCpuHasAVX512VBMI2 = 0x80000;
+static const int kCpuHasAVX512VBITALG = 0x100000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x200000;
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x400000;
+static const int kCpuHasMSA = 0x800000;
+// These flags are only valid on LOONGARCH processors.
+static const int kCpuHasLOONGARCH = 0x2000000;
+static const int kCpuHasLSX = 0x4000000;
+static const int kCpuHasLASX = 0x8000000;
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
+LIBYUV_API
+int InitCpuFlags(void);
+// Detect CPU has SSE2 etc.
+// Test_flag parameter should be one of kCpuHas constants above.
+// Returns non-zero if instruction set is detected
+static __inline int TestCpuFlag(int test_flag) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+  int cpu_info = cpu_info_;
+#endif
+  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
+}
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
+// For testing, allow CPU flags to be disabled.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags);
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+//   again.
+// - enabling CPU features that are not supported by the CPU will result in
+//   undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#else
+  cpu_info_ = cpu_flags;
+#endif
+}
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int* cpu_info);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/loongson_intrinsics.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/loongson_intrinsics.h
+/*
+ *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
+#define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei Gu   <guxiwei-hf@loongson.cn>
+ *                Lu Wang    <wanglu@loongson.cn>
+ *
+ * This file is a header file for loongarch builtin extension.
+ *
+ */
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 1
+#define LSOM_VERSION_MICRO 0
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+  {                                               \
+    _OUT0 = _INS(_IN0);                           \
+    _OUT1 = _INS(_IN1);                           \
+  }
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+  {                                                           \
+    _OUT0 = _INS(_IN0, _IN1);                                 \
+    _OUT1 = _INS(_IN2, _IN3);                                 \
+  }
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+  {                                                                       \
+    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
+    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
+  }
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+  {                                                                         \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
+  }
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+                  _OUT1, _OUT2, _OUT3)                                         \
+  {                                                                            \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
+  }
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
+  {                                                                           \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
+  }
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               Then the results plus to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
+                                        __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               The results plus to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
+                                         __m128i in_h,
+                                         __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               The results plus to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ *        in_c : 1,1,1,1, 1,1,1,1
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ *         out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
+                                           __m128i in_h,
+                                           __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - __m128i
+ * Details     : Signed half-word elements from in_h are multiplied by
+ *               signed half-word elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               Then the results plus to signed word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
+                                        __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmulwev_h_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmulwev_h_bu(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ *         out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+  __m128i out;
+  out = __lsx_vmulwev_w_h(in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ *               (_in))
+ * Arguments   : Inputs  - _in  (input vector)
+ *                       - min  (min threshold)
+ *                       - max  (max threshold)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lsx_vclip_h(_in)
+ *         _in : -8,2,280,249, -8,255,280,249
+ *         min : 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+  __m128i out;
+  out = __lsx_vmax_h(min, _in);
+  out = __lsx_vmin_h(max, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_h(_in)
+ *         _in : -8,255,280,249, -8,255,280,249
+ *         out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+  __m128i out;
+  out = __lsx_vmaxi_h(_in, 0);
+  out = __lsx_vsat_hu(out, 7);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed byte elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_w(_in)
+ *         _in : -8,255,280,249
+ *         out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+  __m128i out;
+  out = __lsx_vmaxi_w(_in, 0);
+  out = __lsx_vsat_wu(out, 7);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments   : Inputs  - _in0, _in1
+ *               Outputs - _in0, _in1 (in-place)
+ * Details     : Swapping of two input variables using xor
+ * Example     : LSX_SWAP(_in0, _in1)
+ *        _in0 : 1,2,3,4
+ *        _in1 : 5,6,7,8
+ *   _in0(out) : 5,6,7,8
+ *   _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1)         \
+  {                                  \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+    _in1 = __lsx_vxor_v(_in0, _in1); \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _t0, _t1, _t2, _t3;                                                \
+                                                                               \
+    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
+    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
+    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
+    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
+    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
+    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
+    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
+    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Details     : The rows of the matrix become columns, and the columns
+ *               become rows.
+ * Example     : LSX_TRANSPOSE8x8_B
+ *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ *      _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *      _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *      _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *      _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ *      _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ *      _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ *      _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ *      _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i zero = {0};                                                     \
+    __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110};               \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
+                                                                            \
+    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
+    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
+    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
+    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
+    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
+    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
+    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
+    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
+    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
+    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
+    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
+    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
+    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                              \
+    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                              \
+    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                              \
+    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                              \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
+ *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
+ *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
+ *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
+ *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
+ *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
+ *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
+ *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
+                                                                            \
+    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
+    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
+    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
+    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
+                                                                            \
+    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
+    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
+    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
+    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
+    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
+    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
+    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
+    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3      (input 8x4 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
+ *               Return Type - as per RTYPE
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LSX_TRANSPOSE8x4_B
+ *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+                           _out0, _out1, _out2, _out3)                     \
+  {                                                                        \
+    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
+                                                                           \
+    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
+    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
+                                                                           \
+    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
+    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
+                                                                           \
+    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
+    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
+    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
+    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8
+ *                         in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              000,001,002,003,004,005,006,007
+ *              008,009,010,011,012,013,014,015
+ *              016,017,018,019,020,021,022,023
+ *              024,025,026,027,028,029,030,031
+ *              032,033,034,035,036,037,038,039
+ *              040,041,042,043,044,045,046,047        000,008,...,112,120
+ *              048,049,050,051,052,053,054,055        001,009,...,113,121
+ *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
+ *              064,068,066,067,068,069,070,071 =====> 003,011,...,115,123
+ *              072,073,074,075,076,077,078,079        004,012,...,116,124
+ *              080,081,082,083,084,085,086,087        005,013,...,117,125
+ *              088,089,090,091,092,093,094,095        006,014,...,118,126
+ *              096,097,098,099,100,101,102,103        007,015,...,119,127
+ *              104,105,106,107,108,109,110,111
+ *              112,113,114,115,116,117,118,119
+ *              120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                            _out6, _out7)                                    \
+  {                                                                          \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
+    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
+    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
+              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
+    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
+    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
+  }
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     : Butterfly operation
+ * Example     :
+ *               out0 = in0 + in3;
+ *               out1 = in1 + in2;
+ *               out2 = in1 - in2;
+ *               out3 = in0 - in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
+  }
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in7;
+ *              _out1 = _in1 + _in6;
+ *              _out2 = _in2 + _in5;
+ *              _out3 = _in3 + _in4;
+ *              _out4 = _in3 - _in4;
+ *              _out5 = _in2 - _in5;
+ *              _out6 = _in1 - _in6;
+ *              _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
+  }
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
+  }
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
+  }
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
+  }
+#endif  // LSX
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then this multiplication results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_h_b(in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the out vector.
+ * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed double
+ * Details     : Signed word elements from in_h are multiplied with
+ *               signed word elements from in_l producing a result
+ *               twice the size of input i.e. signed double-word.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_d_w(in_h, in_l);
+  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. unsigned word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,
+                                          __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c,
+                                           __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then this multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c,
+                                             __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - per RTYPE
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c,
+                                          __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               unsigned halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c,
+                                           __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c,
+                                             __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added together and subtracted from double width elements
+ *               in_c vector.
+ * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c,
+                                           __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  out = __lasx_xvsub_h(in_c, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               Signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added together and subtracted from double width elements
+ *               in_c vector.
+ * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ *        in_c : 0,0,0,0, 0,0,0,0
+ *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *         out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c,
+                                          __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvsub_w(in_c, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               four times the size of input i.e. signed doubleword.
+ *               Then this multiplication results of four adjacent elements
+ *               are added together and stored to the out vector.
+ * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
+ *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ *         out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvhaddw_d_w(out, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvilvh_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvilvh_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvilvl_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The out vector and the out vector are added after the
+ *               lower half of the two-fold zero extension (unsigned byte
+ *               to unsigned halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_hu_bu(out, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector after double zero extension (unsigned byte to
+ *               signed halfword)，added to the in_h vector.
+ * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvsllwil_hu_bu(in_l, 0);
+  out = __lasx_xvadd_h(in_h, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector after double sign extension (signed halfword to
+ *               signed word), added to the in_h vector.
+ * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ *        in_h : 0, 1,0,0, -1,0,0,1,
+ *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
+ *         out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+  out = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvadd_w(in_h, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the lower half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed halfword
+ *               to signed word), and the result is added to the vector in_c,
+ *               then stored to the out vector.
+ * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 5,6,7,8
+ *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
+ *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c,
+                                          __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the higher half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), and the result is added to
+ *               the vector in_c, then stored to the out vector.
+ * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
+                                          __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvmul_w(tmp0, tmp1);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are added to the high half
+ *               after being doubled, then saturated.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector adds the in_l vector after the lower half of
+ *               the two-fold zero extension (unsigned byte to unsigned
+ *               halfword) and then saturated. The results are stored to the out
+ *               vector.
+ * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ *               0,0,0,1
+ *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+  __m256i tmp1, out;
+  __m256i zero = {0};
+  tmp1 = __lasx_xvilvl_b(zero, in_l);
+  out = __lasx_xvsadd_hu(in_h, tmp1);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments   : Inputs  - in    (input vector)
+ *                       - min   (min threshold)
+ *                       - max   (max threshold)
+ *               Outputs - in    (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lasx_xvclip_h(in, min, max)
+ *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+  __m256i out;
+  out = __lasx_xvmax_h(min, in);
+  out = __lasx_xvmin_h(max, out);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs  - in   (input vector)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+  __m256i out;
+  out = __lasx_xvmaxi_h(in, 0);
+  out = __lasx_xvsat_hu(out, 7);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs - in   (input vector)
+ *               Output - out  (output vector with clipped elements)
+ *               Return Type - signed word
+ * Example     : out = __lasx_xvclip255_w(in)
+ *          in : -8,255,280,249, -8,255,280,249
+ *         out :  0,255,255,249,  0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+  __m256i out;
+  out = __lasx_xvmaxi_w(in, 0);
+  out = __lasx_xvsat_wu(out, 7);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_l_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ *         idx : 0x02
+ *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+  __m256i out;
+  out = __lasx_xvpermi_q(in, in, 0x02);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_h_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ *         idx : 0x09
+ *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+  __m256i out;
+  out = __lasx_xvpermi_q(in, in, 0x13);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
+    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
+    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
+    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
+    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
+    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
+    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
+    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
+    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
+    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
+    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
+    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
+    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
+    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
+    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
+    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
+    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
+    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
+    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
+    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
+    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
+    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
+    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
+    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE16x8_H
+ *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+                                                                              \
+    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ *               Return Type - signed halfword
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _s0_m, _s1_m;                                                \
+                                                                         \
+    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                 \
+    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                 \
+    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
+    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
+    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
+    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *                         (input 8x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x8 byte block)
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
+    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
+    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
+    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
+    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
+    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
+    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
+    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
+    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
+    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
+    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
+    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
+  }
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments   : Inputs  - _in0, _in1, ~
+ *               Outputs - _out0, _out1, ~
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE8x8_H
+ *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
+    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
+    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
+    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
+    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
+    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
+    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
+    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
+    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
+    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
+    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
+    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
+  }
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_h(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_h(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_h(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_h(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_w(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_w(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_w(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_w(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_d(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_d(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_d(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_d(_in0, _in3);                                        \
+  }
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_b(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_b(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_b(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_b(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_b(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_b(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_b(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_b(_in0, _in7);                                     \
+  }
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_h(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_h(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_h(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_h(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_h(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_h(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_h(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_h(_in0, _in7);                                     \
+  }
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_w(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_w(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_w(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_w(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_w(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_w(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_w(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_w(_in0, _in7);                                     \
+  }
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_d(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_d(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_d(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_d(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_d(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_d(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_d(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_d(_in0, _in7);                                     \
+  }
+#endif  // LASX
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs  - RTYPE, _element_num, _in0, _enter
+ *               Outputs -
+ * Details     : Print out '_element_num' elements in 'RTYPE' vector '_in0', if
+ *               '_enter' is TRUE, prefix "\nVP:" will be added first.
+ * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ *               VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter) \
+  {                                                \
+    RTYPE _tmp0 = (RTYPE)in0;                      \
+    int _i = 0;                                    \
+    if (enter)                                     \
+      printf("\nVP:");                             \
+    for (_i = 0; _i < element_num; _i++)           \
+      printf("%d,", _tmp0[_i]);                    \
+  }
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/macros_msa.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/macros_msa.h
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
+#define INCLUDE_LIBYUV_MACROS_MSA_H_
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include <msa.h>
+#include <stdint.h>
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                       \
+  ({                                                   \
+    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
+    uint32_t val_m;                                    \
+    asm volatile("lw  %[val_m],  %[psrc_lw_m]  \n"     \
+                 : [val_m] "=r"(val_m)                 \
+                 : [psrc_lw_m] "m"(*psrc_lw_m));       \
+    val_m;                                             \
+  })
+#if (__mips == 64)
+#define LD(psrc)                                       \
+  ({                                                   \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+    uint64_t val_m = 0;                                \
+    asm volatile("ld  %[val_m],  %[psrc_ld_m]  \n"     \
+                 : [val_m] "=r"(val_m)                 \
+                 : [psrc_ld_m] "m"(*psrc_ld_m));       \
+    val_m;                                             \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                                         \
+  ({                                                                     \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
+    uint32_t val0_m, val1_m;                                             \
+    uint64_t val_m = 0;                                                  \
+    val0_m = LW(psrc_ld_m);                                              \
+    val1_m = LW(psrc_ld_m + 4);                                          \
+    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
+    val_m;                                                               \
+  })
+#endif  // (__mips == 64)
+#define SW(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint32_t val_m = (val);                             \
+    asm volatile("sw  %[val_m],  %[pdst_sw_m]  \n"      \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+#if (__mips == 64)
+#define SD(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint64_t val_m = (val);                             \
+    asm volatile("sd  %[val_m],  %[pdst_sd_m]  \n"      \
+                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+#else  // !(__mips == 64)
+#define SD(val, pdst)                                        \
+  ({                                                         \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
+    uint32_t val0_m, val1_m;                                 \
+    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+    SW(val0_m, pdst_sd_m);                                   \
+    SW(val1_m, pdst_sd_m + 4);                               \
+  })
+#endif  // !(__mips == 64)
+#else   // !(__mips_isa_rev >= 6)
+#define LW(psrc)                                \
+  ({                                            \
+    uint8_t* psrc_lw_m = (uint8_t*)(psrc);      \
+    uint32_t val_lw_m;                          \
+                                                \
+    __asm__ volatile(                           \
+        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+                                                \
+        : [val_lw_m] "=&r"(val_lw_m)            \
+        : [psrc_lw_m] "r"(psrc_lw_m));          \
+                                                \
+    val_lw_m;                                   \
+  })
+#if (__mips == 64)
+#define LD(psrc)                                \
+  ({                                            \
+    uint8_t* psrc_ld_m = (uint8_t*)(psrc);      \
+    uint64_t val_ld_m = 0;                      \
+                                                \
+    __asm__ volatile(                           \
+        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+                                                \
+        : [val_ld_m] "=&r"(val_ld_m)            \
+        : [psrc_ld_m] "r"(psrc_ld_m));          \
+                                                \
+    val_ld_m;                                   \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                                         \
+  ({                                                                     \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
+    uint32_t val0_m, val1_m;                                             \
+    uint64_t val_m = 0;                                                  \
+    val0_m = LW(psrc_ld_m);                                              \
+    val1_m = LW(psrc_ld_m + 4);                                          \
+    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
+    val_m;                                                               \
+  })
+#endif  // (__mips == 64)
+#define SW(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint32_t val_m = (val);                             \
+    asm volatile("usw  %[val_m],  %[pdst_sw_m]  \n"     \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+#define SD(val, pdst)                                        \
+  ({                                                         \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
+    uint32_t val0_m, val1_m;                                 \
+    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+    SW(val0_m, pdst_sd_m);                                   \
+    SW(val1_m, pdst_sd_m + 4);                               \
+  })
+#endif  // (__mips_isa_rev >= 6)
+// TODO(fbarchard): Consider removing __VAR_ARGS versions.
+#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
+#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+/* Description : Load two vectors with 16 'byte' sized elements
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+  {                                            \
+    out0 = LD_B(RTYPE, (psrc));                \
+    out1 = LD_B(RTYPE, (psrc) + stride);       \
+  }
+#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+  {                                                        \
+    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
+    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+  }
+#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
+/* Description : Store two vectors with stride each having 16 'byte' sized
+                 elements
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_B(RTYPE, in0, (pdst));                \
+    ST_B(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
+  {                                                      \
+    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+  }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_H(RTYPE, in0, (pdst));                \
+    ST_H(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
+  {                                                                   \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+  }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+  }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
+#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/mjpeg_decoder.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/mjpeg_decoder.h
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+namespace libyuv {
+#ifdef __cplusplus
+extern "C" {
+#endif
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+static const uint32_t kUnknownDataSize = 0xFFFFFFFF;
+enum JpegSubsamplingType {
+  kJpegYuv420,
+  kJpegYuv422,
+  kJpegYuv444,
+  kJpegYuv400,
+  kJpegUnknown
+};
+struct Buffer {
+  const uint8_t* data;
+  int len;
+};
+struct BufferVector {
+  Buffer* buffers;
+  int len;
+  int pos;
+};
+struct SetJmpErrorMgr;
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+  typedef void (*CallbackFunction)(void* opaque,
+                                   const uint8_t* const* data,
+                                   const int* strides,
+                                   int rows);
+  static const int kColorSpaceUnknown;
+  static const int kColorSpaceGrayscale;
+  static const int kColorSpaceRgb;
+  static const int kColorSpaceYCbCr;
+  static const int kColorSpaceCMYK;
+  static const int kColorSpaceYCCK;
+  MJpegDecoder();
+  ~MJpegDecoder();
+  // Loads a new frame, reads its headers, and determines the uncompressed
+  // image format.
+  // Returns LIBYUV_TRUE if image looks valid and format is supported.
+  // If return value is LIBYUV_TRUE, then the values for all the following
+  // getters are populated.
+  // src_len is the size of the compressed mjpeg frame in bytes.
+  LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len);
+  // Returns width of the last loaded frame in pixels.
+  int GetWidth();
+  // Returns height of the last loaded frame in pixels.
+  int GetHeight();
+  // Returns format of the last loaded frame. The return value is one of the
+  // kColorSpace* constants.
+  int GetColorSpace();
+  // Number of color components in the color space.
+  int GetNumComponents();
+  // Sample factors of the n-th component.
+  int GetHorizSampFactor(int component);
+  int GetVertSampFactor(int component);
+  int GetHorizSubSampFactor(int component);
+  int GetVertSubSampFactor(int component);
+  // Public for testability.
+  int GetImageScanlinesPerImcuRow();
+  // Public for testability.
+  int GetComponentScanlinesPerImcuRow(int component);
+  // Width of a component in bytes.
+  int GetComponentWidth(int component);
+  // Height of a component.
+  int GetComponentHeight(int component);
+  // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+  int GetComponentStride(int component);
+  // Size of a component in bytes.
+  int GetComponentSize(int component);
+  // Call this after LoadFrame() if you decide you don't want to decode it
+  // after all.
+  LIBYUV_BOOL UnloadFrame();
+  // Decodes the entire image into a one-buffer-per-color-component format.
+  // dst_width must match exactly. dst_height must be <= to image height; if
+  // less, the image is cropped. "planes" must have size equal to at least
+  // GetNumComponents() and they must point to non-overlapping buffers of size
+  // at least GetComponentSize(i). The pointers in planes are incremented
+  // to point to after the end of the written data.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height);
+  // Decodes the entire image and passes the data via repeated calls to a
+  // callback function. Each call will get the data for a whole number of
+  // image scanlines.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+                               void* opaque,
+                               int dst_width,
+                               int dst_height);
+  // The helper function which recognizes the jpeg sub-sampling type.
+  static JpegSubsamplingType JpegSubsamplingTypeHelper(
+      int* subsample_x,
+      int* subsample_y,
+      int number_of_components);
+ private:
+  void AllocOutputBuffers(int num_outbufs);
+  void DestroyOutputBuffers();
+  LIBYUV_BOOL StartDecode();
+  LIBYUV_BOOL FinishDecode();
+  void SetScanlinePointers(uint8_t** data);
+  LIBYUV_BOOL DecodeImcuRow();
+  int GetComponentScanlinePadding(int component);
+  // A buffer holding the input data for a frame.
+  Buffer buf_;
+  BufferVector buf_vec_;
+  jpeg_decompress_struct* decompress_struct_;
+  jpeg_source_mgr* source_mgr_;
+  SetJmpErrorMgr* error_mgr_;
+  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+  // GetComponentScanlinePadding() != 0.)
+  LIBYUV_BOOL has_scanline_padding_;
+  // Temporaries used to point to scanline outputs.
+  int num_outbufs_;  // Outermost size of all arrays below.
+  uint8_t*** scanlines_;
+  int* scanlines_sizes_;
+  // Temporary buffer used for decoding when we can't decode directly to the
+  // output buffers. Large enough for just one iMCU row.
+  uint8_t** databuf_;
+  int* databuf_strides_;
+};
+}  // namespace libyuv
+#endif  //  __cplusplus
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/planar_functions.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/planar_functions.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// TODO(fbarchard): Move cpu macros to row.h
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height);
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height);
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 1024 for 10 bits
+                       int width,
+                       int height);
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value);
+// Convert a plane of tiles of 16 x H to linear.
+LIBYUV_API
+int DetilePlane(const uint8_t* src_y,
+                int src_stride_y,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                int width,
+                int height,
+                int tile_height);
+// Convert a plane of 16 bit tiles of 16 x H to linear.
+LIBYUV_API
+int DetilePlane_16(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   int width,
+                   int height,
+                   int tile_height);
+// Convert a UV plane of tiles of 16 x H into linear U and V planes.
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+                        int src_stride_uv,
+                        uint8_t* dst_u,
+                        int dst_stride_u,
+                        uint8_t* dst_v,
+                        int dst_stride_v,
+                        int width,
+                        int height,
+                        int tile_height);
+// Convert a Y and UV plane of tiles into interlaced YUY2.
+LIBYUV_API
+void DetileToYUY2(const uint8_t* src_y,
+                  int src_stride_y,
+                  const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_yuy2,
+                  int dst_stride_yuy2,
+                  int width,
+                  int height,
+                  int tile_height);
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height);
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height);
+// Split interleaved msb UV plane into separate lsb U and V planes.
+LIBYUV_API
+void SplitUVPlane_16(const uint16_t* src_uv,
+                     int src_stride_uv,
+                     uint16_t* dst_u,
+                     int dst_stride_u,
+                     uint16_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
+                     int depth);
+// Merge separate lsb U and V planes into one interleaved msb UV plane.
+LIBYUV_API
+void MergeUVPlane_16(const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint16_t* dst_uv,
+                     int dst_stride_uv,
+                     int width,
+                     int height,
+                     int depth);
+// Convert lsb plane to msb plane
+LIBYUV_API
+void ConvertToMSBPlane_16(const uint16_t* src_y,
+                          int src_stride_y,
+                          uint16_t* dst_y,
+                          int dst_stride_y,
+                          int width,
+                          int height,
+                          int depth);
+// Convert msb plane to lsb plane
+LIBYUV_API
+void ConvertToLSBPlane_16(const uint16_t* src_y,
+                          int src_stride_y,
+                          uint16_t* dst_y,
+                          int dst_stride_y,
+                          int width,
+                          int height,
+                          int depth);
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height);
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_vu,
+                 int dst_stride_vu,
+                 int width,
+                 int height);
+// Split interleaved RGB plane into separate R, G and B planes.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height);
+// Merge separate R, G and B planes into one interleaved RGB plane.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height);
+// Split interleaved ARGB plane into separate R, G, B and A planes.
+// dst_a can be NULL to discard alpha plane.
+LIBYUV_API
+void SplitARGBPlane(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_r,
+                    int dst_stride_r,
+                    uint8_t* dst_g,
+                    int dst_stride_g,
+                    uint8_t* dst_b,
+                    int dst_stride_b,
+                    uint8_t* dst_a,
+                    int dst_stride_a,
+                    int width,
+                    int height);
+// Merge separate R, G, B and A planes into one interleaved ARGB plane.
+// src_a can be NULL to fill opaque value to alpha.
+LIBYUV_API
+void MergeARGBPlane(const uint8_t* src_r,
+                    int src_stride_r,
+                    const uint8_t* src_g,
+                    int src_stride_g,
+                    const uint8_t* src_b,
+                    int src_stride_b,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height);
+// Merge separate 'depth' bit R, G and B planes stored in lsb
+// into one interleaved XR30 plane.
+// depth should in range [10, 16]
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+                    int src_stride_r,
+                    const uint16_t* src_g,
+                    int src_stride_g,
+                    const uint16_t* src_b,
+                    int src_stride_b,
+                    uint8_t* dst_ar30,
+                    int dst_stride_ar30,
+                    int width,
+                    int height,
+                    int depth);
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved AR64 plane.
+// src_a can be NULL to fill opaque value to alpha.
+// depth should in range [1, 16]
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+                    int src_stride_r,
+                    const uint16_t* src_g,
+                    int src_stride_g,
+                    const uint16_t* src_b,
+                    int src_stride_b,
+                    const uint16_t* src_a,
+                    int src_stride_a,
+                    uint16_t* dst_ar64,
+                    int dst_stride_ar64,
+                    int width,
+                    int height,
+                    int depth);
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved ARGB plane.
+// src_a can be NULL to fill opaque value to alpha.
+// depth should in range [8, 16]
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+                         int src_stride_r,
+                         const uint16_t* src_g,
+                         int src_stride_g,
+                         const uint16_t* src_b,
+                         int src_stride_b,
+                         const uint16_t* src_a,
+                         int src_stride_a,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height,
+                         int depth);
+// Copy I400.  Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+#define J400ToJ400 I400ToI400
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+// Copy I210 to I210.
+#define I210ToI210 I210Copy
+LIBYUV_API
+int I210Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+// Copy I410 to I410.
+#define I410ToI410 I410Copy
+LIBYUV_API
+int I410Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+// Copy NV12. Supports inverting.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_uv,
+             int src_stride_uv,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_uv,
+             int dst_stride_uv,
+             int width,
+             int height);
+// Copy NV21. Supports inverting.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_vu,
+             int src_stride_vu,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_vu,
+             int dst_stride_vu,
+             int width,
+             int height);
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+LIBYUV_API
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+LIBYUV_API
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height);
+LIBYUV_API
+int UYVYToY(const uint8_t* src_uyvy,
+            int src_stride_uyvy,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height);
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+// Alias
+#define I400ToI400Mirror I400Mirror
+// I400 mirror.  A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+// Alias
+#define NV12ToNV12Mirror NV12Mirror
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+// Mirror a plane of data.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height);
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height);
+// Alias
+#define RGB24ToRAW RAWToRGB24
+LIBYUV_API
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height);
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v);
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value);
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height);
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height);
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height);
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
+// Apply a color table each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height);
+// Apply a luma/color table each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height);
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
+// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared.  The 4rd row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be dirived using software such as 'R'.
+LIBYUV_API
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const float* poly,
+                   int width,
+                   int height);
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height);
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height);
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height);
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
+                             const uint8_t* src_argb1,
+                             uint8_t* dst_argb,
+                             int width);
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+// Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height);
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
+// Convert unattentuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
+// Convert preattentuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height);
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height);
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+//   16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius);
+// Gaussian 5x5 blur a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value);
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation);
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane_16(const uint16_t* src0,
+                        int src_stride0,  // measured in 16 bit pixels
+                        const uint16_t* src1,
+                        int src_stride1,
+                        uint16_t* dst,
+                        int dst_stride,
+                        int width,
+                        int height,
+                        int interpolation);
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation);
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation);
+// Row function for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
+// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* uv_dudv,
+                        int width);
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// shuffler is 16 bytes.
+LIBYUV_API
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height);
+// Shuffle AR64 channel order.  e.g. AR64 to AB64.
+// shuffler is 16 bytes.
+LIBYUV_API
+int AR64Shuffle(const uint16_t* src_ar64,
+                int src_stride_ar64,
+                uint16_t* dst_ar64,
+                int dst_stride_ar64,
+                const uint8_t* shuffler,
+                int width,
+                int height);
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height);
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/rotate.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/rotate.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Supported rotation.
+typedef enum RotationMode {
+  kRotate0 = 0,      // No rotation.
+  kRotate90 = 90,    // Rotate 90 degrees clockwise.
+  kRotate180 = 180,  // Rotate 180 degrees.
+  kRotate270 = 270,  // Rotate 270 degrees clockwise.
+  // Deprecated.
+  kRotateNone = 0,
+  kRotateClockwise = 90,
+  kRotateCounterClockwise = 270,
+} RotationModeEnum;
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+// Rotate I422 frame.
+LIBYUV_API
+int I422Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+// Rotate I010 frame.
+LIBYUV_API
+int I010Rotate(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+// Rotate I210 frame.
+LIBYUV_API
+int I210Rotate(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+// Rotate I410 frame.
+LIBYUV_API
+int I410Rotate(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
+                     enum RotationMode mode);
+// Convert Android420 to I420 with rotation.
+// "rotation" can be 0, 90, 180 or 270.
+LIBYUV_API
+int Android420ToI420Rotate(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_y,
+                           int dst_stride_y,
+                           uint8_t* dst_u,
+                           int dst_stride_u,
+                           uint8_t* dst_v,
+                           int dst_stride_v,
+                           int width,
+                           int height,
+                           enum RotationMode rotation);
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
+                enum RotationMode mode);
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
+LIBYUV_API
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+LIBYUV_API
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane_16(const uint16_t* src,
+                   int src_stride,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height,
+                   enum RotationMode mode);
+// Rotations for when U and V are interleaved.
+// These functions take one UV input pointer and
+// split the data into two buffers while
+// rotating them.
+// width and height expected to be half size for NV12.
+LIBYUV_API
+int SplitRotateUV(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height,
+                  enum RotationMode mode);
+LIBYUV_API
+void SplitRotateUV90(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     uint8_t* dst_b,
+                     int dst_stride_b,
+                     int width,
+                     int height);
+LIBYUV_API
+void SplitRotateUV180(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
+LIBYUV_API
+void SplitRotateUV270(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+LIBYUV_API
+void SplitTransposeUV(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_ROTATE_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/rotate_argb.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/rotate_argb.h
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"  // For RotationMode.
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               enum RotationMode mode);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/rotate_row.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/rotate_row.h
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available for Visual C 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+    !defined(__clang__)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+// The following are available for GCC 32 or 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+// The following are available for 64 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_TRANSPOSEWX16_LSX
+#define HAS_TRANSPOSEUVWX16_LSX
+#endif
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width);
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width);
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width);
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width);
+void TransposeWx16_MSA(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx16_LSX(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx8_Any_NEON(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+void TransposeWx8_Any_SSSE3(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst,
+                            int dst_stride,
+                            int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
+                                 int src_stride,
+                                 uint8_t* dst,
+                                 int dst_stride,
+                                 int width);
+void TransposeWx16_Any_MSA(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+void TransposeWx16_Any_LSX(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width);
+void TransposeUVWx16_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst_a,
+                       int dst_stride_a,
+                       uint8_t* dst_b,
+                       int dst_stride_b,
+                       int width);
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx16_MSA(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx16_LSX(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx8_Any_SSE2(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx8_Any_NEON(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx16_Any_LSX(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeWxH_16_C(const uint16_t* src,
+                       int src_stride,
+                       uint16_t* dst,
+                       int dst_stride,
+                       int width,
+                       int height);
+void TransposeWx8_16_C(const uint16_t* src,
+                       int src_stride,
+                       uint16_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx1_16_C(const uint16_t* src,
+                       int src_stride,
+                       uint16_t* dst,
+                       int dst_stride,
+                       int width);
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+// Transpose 32 bit values (ARGB)
+void Transpose8x8_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/row.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/row.h
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// Supported filtering.
+typedef enum FilterMode {
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3        // Highest quality.
+} FilterModeEnum;
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
+                enum FilterMode filtering);
+LIBYUV_API
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering);
+// Sample is expected to be in the low 12 bits.
+LIBYUV_API
+void ScalePlane_12(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering);
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce ever better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+LIBYUV_API
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+LIBYUV_API
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+LIBYUV_API
+int I420Scale_12(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce ever better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+LIBYUV_API
+int I444Scale_12(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+// Scales a YUV 4:2:2 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce ever better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+LIBYUV_API
+int I422Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+LIBYUV_API
+int I422Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+LIBYUV_API
+int I422Scale_12(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+#ifdef __cplusplus
+// Legacy API.  Deprecated.
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
+          LIBYUV_BOOL interpolate);
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_SCALE_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_argb.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_argb.h
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+LIBYUV_API
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
+                  enum FilterMode filtering);
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
+                       enum FilterMode filtering);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_rgb.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_rgb.h
+/*
+ *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_SCALE_RGB_H_
+#define INCLUDE_LIBYUV_SCALE_RGB_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// RGB can be RAW, RGB24 or YUV24
+// RGB scales 24 bit images by converting a row at a time to ARGB
+// and using ARGB row functions to scale, then convert to RGB.
+// TODO(fbarchard): Allow input/output formats to be specified.
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+             int src_stride_rgb,
+             int src_width,
+             int src_height,
+             uint8_t* dst_rgb,
+             int dst_stride_rgb,
+             int dst_width,
+             int dst_height,
+             enum FilterMode filtering);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_SCALE_UV_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_row.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_row.h
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSSE3
+#endif
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_SSE2
+#define HAS_SCALEROWUP2_LINEAR_SSSE3
+#define HAS_SCALEROWUP2_BILINEAR_SSE2
+#define HAS_SCALEROWUP2_BILINEAR_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_12_SSSE3
+#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_16_SSE2
+#define HAS_SCALEROWUP2_BILINEAR_16_SSE2
+#define HAS_SCALEUVROWUP2_LINEAR_SSSE3
+#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+#endif
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) &&               \
+    (defined(__x86_64__) || defined(__i386__)) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
+#define HAS_SCALEROWUP2_LINEAR_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_AVX2
+#define HAS_SCALEROWUP2_LINEAR_12_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_12_AVX2
+#define HAS_SCALEROWUP2_LINEAR_16_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2_LINEAR_AVX2
+#define HAS_SCALEUVROWUP2_BILINEAR_AVX2
+#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+#endif
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
+#define HAS_SCALEROWUP2_LINEAR_NEON
+#define HAS_SCALEROWUP2_BILINEAR_NEON
+#define HAS_SCALEROWUP2_LINEAR_12_NEON
+#define HAS_SCALEROWUP2_BILINEAR_12_NEON
+#define HAS_SCALEROWUP2_LINEAR_16_NEON
+#define HAS_SCALEROWUP2_BILINEAR_16_NEON
+#define HAS_SCALEUVROWUP2_LINEAR_NEON
+#define HAS_SCALEUVROWUP2_BILINEAR_NEON
+#define HAS_SCALEUVROWUP2_LINEAR_16_NEON
+#define HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+#endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEARGBCOLS_MSA
+#define HAS_SCALEARGBFILTERCOLS_MSA
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEFILTERCOLS_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN34_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#endif
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_SCALEARGBROWDOWN2_LSX
+#define HAS_SCALEARGBROWDOWNEVEN_LSX
+#define HAS_SCALEROWDOWN2_LSX
+#define HAS_SCALEROWDOWN4_LSX
+#define HAS_SCALEROWDOWN38_LSX
+#define HAS_SCALEFILTERCOLS_LSX
+#define HAS_SCALEADDROW_LSX
+#define HAS_SCALEARGBCOLS_LSX
+#define HAS_SCALEARGBFILTERCOLS_LSX
+#define HAS_SCALEROWDOWN34_LSX
+#endif
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering);
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering);
+void ScalePlaneVertical_16To8(int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint16_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int y,
+                              int dy,
+                              int wpp,
+                              int scale,
+                              enum FilterMode filtering);
+void ScalePlaneDown2_16To8(int src_width,
+                           int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           int scale,
+                           enum FilterMode filtering);
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  enum FilterMode filtering);
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+int FixedDiv_MIPS(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+int FixedDiv1_MIPS(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#elif defined HAS_FIXEDDIV_MIPS
+#define FixedDiv FixedDiv_MIPS
+#define FixedDiv1 FixedDiv1_MIPS
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+// Compute slope values for stepping.
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
+                enum FilterMode filtering,
+                int* x,
+                int* y,
+                int* dx,
+                int* dy);
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown2_16To8_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width,
+                           int scale);
+void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width,
+                               int scale);
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width);
+void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst,
+                                 int dst_width,
+                                 int scale);
+void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst,
+                                     int dst_width,
+                                     int scale);
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width,
+                              int scale);
+void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst,
+                                  int dst_width,
+                                  int scale);
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            ptrdiff_t dst_stride,
+                            int dst_width);
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+                             uint16_t* dst_ptr,
+                             int dst_width);
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width);
+void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                ptrdiff_t dst_stride,
+                                int dst_width);
+void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+                                 uint16_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint16_t* dst_ptr,
+                                   ptrdiff_t dst_stride,
+                                   int dst_width);
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx);
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx);
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int,
+                    int);
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int,
+                       int);
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx);
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width);
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_argb,
+                         int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int,
+                        int);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_uv,
+                       int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_uv,
+                          int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          int src_stepx,
+                          uint8_t* dst_uv,
+                          int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width);
+void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint16_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+                                   uint16_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint16_t* dst_ptr,
+                                     ptrdiff_t dst_stride,
+                                     int dst_width);
+void ScaleUVCols_C(uint8_t* dst_uv,
+                   const uint8_t* src_uv,
+                   int dst_width,
+                   int x,
+                   int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+                     const uint8_t* src_uv,
+                     int dst_width,
+                     int x32,
+                     int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+                      const uint8_t* src_uv,
+                      int dst_width,
+                      int,
+                      int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+                         const uint8_t* src_uv,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+                           const uint8_t* src_uv,
+                           int dst_width,
+                           int x32,
+                           int dx);
+// Specialized scalers for x86.
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width);
+void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
+                                 uint16_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint16_t* dst_ptr,
+                                   ptrdiff_t dst_stride,
+                                   int dst_width);
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                int dst_width);
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint16_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                ptrdiff_t dst_stride,
+                                int dst_width);
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width);
+void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                int dst_width);
+void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint16_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                int dst_width);
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint16_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   ptrdiff_t dst_stride,
+                                   int dst_width);
+void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr,
+                                     uint16_t* dst_ptr,
+                                     int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr,
+                                       ptrdiff_t src_stride,
+                                       uint16_t* dst_ptr,
+                                       ptrdiff_t dst_stride,
+                                       int dst_width);
+void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint16_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    ptrdiff_t dst_stride,
+                                    int dst_width);
+void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   ptrdiff_t dst_stride,
+                                   int dst_width);
+void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint16_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint16_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx);
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx);
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  int dst_width,
+                                  int x,
+                                  int dx);
+void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            int dst_width,
+                            int x,
+                            int dx);
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr,
+                                 const uint8_t* src_ptr,
+                                 int dst_width,
+                                 int x,
+                                 int dx);
+void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDown2_Any_LSX(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDownEven_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_LSX(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_uv,
+                           int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_uv,
+                                 int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_uv,
+                              int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_uv,
+                                int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_uv,
+                         int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_uv,
+                            int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              int src_stepx,
+                              uint8_t* dst_uv,
+                              int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_uv,
+                                 int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                int src_stepx,
+                                uint8_t* dst_uv,
+                                int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            int32_t src_stepx,
+                            uint8_t* dst_uv,
+                            int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    int src_stepx,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                int32_t src_stepx,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width);
+void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     ptrdiff_t dst_stride,
+                                     int dst_width);
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width);
+void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     ptrdiff_t dst_stride,
+                                     int dst_width);
+void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
+                                   uint16_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint16_t* dst_ptr,
+                                     ptrdiff_t dst_stride,
+                                     int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr,
+                                       uint16_t* dst_ptr,
+                                       int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr,
+                                         ptrdiff_t src_stride,
+                                         uint16_t* dst_ptr,
+                                         ptrdiff_t dst_stride,
+                                         int dst_width);
+void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+                                  uint16_t* dst_ptr,
+                                  int dst_width);
+void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint16_t* dst_ptr,
+                                    ptrdiff_t dst_stride,
+                                    int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+                                      uint16_t* dst_ptr,
+                                      int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+                                        ptrdiff_t src_stride,
+                                        uint16_t* dst_ptr,
+                                        ptrdiff_t dst_stride,
+                                        int dst_width);
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+                                  uint16_t* dst_ptr,
+                                  int dst_width);
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint16_t* dst_ptr,
+                                    ptrdiff_t dst_stride,
+                                    int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+                                      uint16_t* dst_ptr,
+                                      int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+                                        ptrdiff_t src_stride,
+                                        uint16_t* dst_ptr,
+                                        ptrdiff_t dst_stride,
+                                        int dst_width);
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+//  to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width);
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                int dst_width);
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint16_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                int dst_width);
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint16_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   ptrdiff_t dst_stride,
+                                   int dst_width);
+void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint16_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint16_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
+void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              int dst_width,
+                              int x,
+                              int dx);
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown2_Any_LSX(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_LSX(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_Any_LSX(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_LSX(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+void ScaleFilterCols_Any_LSX(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_Any_LSX(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols_Any_LSX(uint8_t* dst_ptr,
+                                 const uint8_t* src_ptr,
+                                 int dst_width,
+                                 int x,
+                                 int dx);
+void ScaleRowDown34_Any_LSX(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_0_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_uv.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/scale_uv.h
+/*
+ *  Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+            int src_stride_uv,
+            int src_width,
+            int src_height,
+            uint8_t* dst_uv,
+            int dst_stride_uv,
+            int dst_width,
+            int dst_height,
+            enum FilterMode filtering);
+// Scale a 16 bit UV image.
+// This function is currently incomplete, it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+               int src_stride_uv,
+               int src_width,
+               int src_height,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int dst_width,
+               int dst_height,
+               enum FilterMode filtering);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_SCALE_UV_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/version.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/version.h
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef INCLUDE_LIBYUV_VERSION_H_
+#define INCLUDE_LIBYUV_VERSION_H_
+#define LIBYUV_VERSION 1860
+#endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/MomentaMedia/ThirdParty/libyuv2/include/libyuv/video_common.h
+++ b/MomentaMedia/ThirdParty/libyuv2/include/libyuv/video_common.h
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+// Common definitions for video, including fourcc and VideoFormat.
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+#include "libyuv/basic_types.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d)                                        \
+  ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
+   (static_cast<uint32_t>(c) << 16) | /* NOLINT */                \
+   (static_cast<uint32_t>(d) << 24))  /* NOLINT */
+#else
+#define FOURCC(a, b, c, d)                                     \
+  (((uint32_t)(a)) | ((uint32_t)(b) << 8) |       /* NOLINT */ \
+   ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */
+#endif
+// Some pages discussing FourCC codes:
+//   http://www.fourcc.org/yuv.php
+//   http://v4l2spec.bytesex.org/spec/book1.htm
+//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
+//   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+//   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxilliary formats call primary converters.
+enum FourCC {
+  // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+  FOURCC_I010 = FOURCC('I', '0', '1', '0'),  // bt.601 10 bit 420
+  FOURCC_I210 = FOURCC('I', '2', '1', '0'),  // bt.601 10 bit 422
+  // 1 Secondary YUV format: row biplanar.  deprecated.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_AR30 = FOURCC('A', 'R', '3', '0'),  // 10 bit per channel. 2101010.
+  FOURCC_AB30 = FOURCC('A', 'B', '3', '0'),  // ABGR version of 10 bit
+  FOURCC_AR64 = FOURCC('A', 'R', '6', '4'),  // 16 bit per channel.
+  FOURCC_AB64 = FOURCC('A', 'B', '6', '4'),  // ABGR version of 16 bit
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+  // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 =
+      FOURCC('J', '4', '2', '0'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_J422 =
+      FOURCC('J', '4', '2', '2'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_J444 =
+      FOURCC('J', '4', '4', '4'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_J400 =
+      FOURCC('J', '4', '0', '0'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_F420 = FOURCC('F', '4', '2', '0'),  // bt.709 full, unofficial fourcc
+  FOURCC_F422 = FOURCC('F', '4', '2', '2'),  // bt.709 full, unofficial fourcc
+  FOURCC_F444 = FOURCC('F', '4', '4', '4'),  // bt.709 full, unofficial fourcc
+  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // bt.709, unofficial fourcc
+  FOURCC_H422 = FOURCC('H', '4', '2', '2'),  // bt.709, unofficial fourcc
+  FOURCC_H444 = FOURCC('H', '4', '4', '4'),  // bt.709, unofficial fourcc
+  FOURCC_U420 = FOURCC('U', '4', '2', '0'),  // bt.2020, unofficial fourcc
+  FOURCC_U422 = FOURCC('U', '4', '2', '2'),  // bt.2020, unofficial fourcc
+  FOURCC_U444 = FOURCC('U', '4', '4', '4'),  // bt.2020, unofficial fourcc
+  FOURCC_F010 = FOURCC('F', '0', '1', '0'),  // bt.709 full range 10 bit 420
+  FOURCC_H010 = FOURCC('H', '0', '1', '0'),  // bt.709 10 bit 420
+  FOURCC_U010 = FOURCC('U', '0', '1', '0'),  // bt.2020 10 bit 420
+  FOURCC_F210 = FOURCC('F', '2', '1', '0'),  // bt.709 full range 10 bit 422
+  FOURCC_H210 = FOURCC('H', '2', '1', '0'),  // bt.709 10 bit 422
+  FOURCC_U210 = FOURCC('U', '2', '1', '0'),  // bt.2020 10 bit 422
+  FOURCC_P010 = FOURCC('P', '0', '1', '0'),
+  FOURCC_P210 = FOURCC('P', '2', '1', '0'),
+  // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+  // deprecated formats.  Not supported, but defined for backward compatibility.
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+  // Match any fourcc.
+  FOURCC_ANY = -1,
+};
+enum FourCCBpp {
+  // Canonical fourcc codes used in our code.
+  FOURCC_BPP_I420 = 12,
+  FOURCC_BPP_I422 = 16,
+  FOURCC_BPP_I444 = 24,
+  FOURCC_BPP_I411 = 12,
+  FOURCC_BPP_I400 = 8,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
+  FOURCC_BPP_YUY2 = 16,
+  FOURCC_BPP_UYVY = 16,
+  FOURCC_BPP_M420 = 12,  // deprecated
+  FOURCC_BPP_Q420 = 12,
+  FOURCC_BPP_ARGB = 32,
+  FOURCC_BPP_BGRA = 32,
+  FOURCC_BPP_ABGR = 32,
+  FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_AR30 = 32,
+  FOURCC_BPP_AB30 = 32,
+  FOURCC_BPP_AR64 = 64,
+  FOURCC_BPP_AB64 = 64,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW = 24,
+  FOURCC_BPP_RGBP = 16,
+  FOURCC_BPP_RGBO = 16,
+  FOURCC_BPP_R444 = 16,
+  FOURCC_BPP_RGGB = 8,
+  FOURCC_BPP_BGGR = 8,
+  FOURCC_BPP_GRBG = 8,
+  FOURCC_BPP_GBRG = 8,
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_H420 = 12,
+  FOURCC_BPP_H422 = 16,
+  FOURCC_BPP_I010 = 15,
+  FOURCC_BPP_I210 = 20,
+  FOURCC_BPP_H010 = 15,
+  FOURCC_BPP_H210 = 20,
+  FOURCC_BPP_P010 = 15,
+  FOURCC_BPP_P210 = 20,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+  // Match any fourcc.
+  FOURCC_BPP_ANY = 0,  // 0 means unknown.
+};
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc);
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_
--- a/MomentaMedia/ThirdParty/libyuv2/lib/jpeg.lib
+++ b/MomentaMedia/ThirdParty/libyuv2/lib/jpeg.lib
--- a/MomentaMedia/ThirdParty/libyuv2/lib/libyuv.dll
+++ b/MomentaMedia/ThirdParty/libyuv2/lib/libyuv.dll
--- a/MomentaMedia/ThirdParty/libyuv2/lib/libyuv.dll1
+++ b/MomentaMedia/ThirdParty/libyuv2/lib/libyuv.dll1
--- a/MomentaMedia/ThirdParty/libyuv2/lib/yuv.lib
+++ b/MomentaMedia/ThirdParty/libyuv2/lib/yuv.lib
--- a/MomentaMedia/ThirdParty/libyuv2/lib/yuv.lib1
+++ b/MomentaMedia/ThirdParty/libyuv2/lib/yuv.lib1
--- a/MomentaMedia/include/BlackMagicDesign/DeckLinkOutputDevice.h
+++ b/MomentaMedia/include/BlackMagicDesign/DeckLinkOutputDevice.h
@@ -62,6 +62,7 @@ public:
 	void SetSendSdiParams(SendSdiParams params);
 	void ClearFrontQueue();
 	void AddPicFrame(std::shared_ptr<VideoFrameWithMask> frame);
+	void GetInOutVideoParams(BmdVideoParams& input_params,BmdVideoParams& output_params);
 public slots:
 	void AddAudioFrame(std::shared_ptr<AudioPacket> audio_packet);
 	void AddFrame(std::shared_ptr<Image> image);
@@ -134,6 +135,8 @@ private:
 	qint32 queue_max_size;
 	BMDDisplayMode outputDisplayMode;
+	BMDDisplayMode getOutputDisplayMode() { return outputDisplayMode; }
 	int				m_fps;
 	uint64_t		m_lastRecvTS;
@@ -162,4 +165,8 @@ private:
 	qint64 last_start_tm{0};
 	SafeMap<qint64, std::shared_ptr<VideoFrameWithMask>> output_video_frame_map;
+	BmdVideoParams output_params;
+	BmdVideoParams input_params;
 };
--- a/MomentaMedia/include/Threads/ProcessMaskThread.h
+++ b/MomentaMedia/include/Threads/ProcessMaskThread.h
@@ -8,8 +8,11 @@
 #include "ZoomThread.h"
 #include "Utils/Algorithm.h"
 #include "CropThread.h"
+#include "VideoScaleThread.h"
-class ProcessMaskThread : public QThread, public CropThread::Listener
+class ProcessMaskThread : public QThread, 
+						  public CropThread::Listener,
+						  public VideoScaleThread::Listener
 {
 	Q_OBJECT
 public:
@@ -22,12 +25,14 @@ public slots:
 signals:
 	void PushFrame(std::shared_ptr<VideoFrameWithMask> image);
 	void PushFrameToReplay(std::shared_ptr<VideoFrameWithMask> image);
+	void PushScaleFrameToReplay(std::shared_ptr<VideoFrameWithMask> image);
 public:
 	void ClearQueue();
 	void SetRecordStore(RecordStore* store) { p_store = store; }
 	void SetZoomThread(std::shared_ptr<ZoomThread> thread) { zoom_thread = thread; }
 	void StartRecord(const uint64_t& start_time, const uint64_t& end_time);
 	CropThread* GetCropThread();
+	void SetBmdInOutVideoParams(const BmdVideoParams& input, const BmdVideoParams& output);
 protected:
 	enum MaskStatus
@@ -38,6 +43,7 @@ protected:
 	};
 private:
 	void OnRecvViedoFrame(std::shared_ptr<VideoFrameWithMask>) override;
+	void OnRecvScaleViedoFrame(std::shared_ptr<VideoFrameWithMask>) override;
 	void OnChange() override;
 	void run() override;
 	void process();
@@ -80,6 +86,7 @@ private:
 	std::shared_ptr<ZoomThread> zoom_thread{nullptr};
 	std::shared_ptr<CropThread> crop_thread{ nullptr };
+	std::shared_ptr<VideoScaleThread> scale_thread{ nullptr };
 	Rect mask_rect{0,0,0,0};
 	int				m_fps{0};
@@ -100,4 +107,11 @@ private:
 	CropMessage crop_msg;
 	qint32 dynamic_out_num {0};
 	bool dynamic_flag{false};
+	std::atomic_bool replay_flag_clear { false };
+	BMDDisplayMode output_display_mode;
+	BmdVideoParams output_params;
+	BmdVideoParams input_params;
+	std::atomic_bool scale_flag{ false };
 };
\ No newline at end of file
--- a/MomentaMedia/include/Threads/ReplayThread.h
+++ b/MomentaMedia/include/Threads/ReplayThread.h
@@ -6,8 +6,9 @@
 #include <queue>
 #include "Utils/SafeMap.h"
 #include "CropThread.h"
+#include "VideoScaleThread.h"
-class ReplayThread : public QThread,public CropThread::Listener
+class ReplayThread : public QThread,public CropThread::Listener, public VideoScaleThread::Listener
 {
 	Q_OBJECT
 public:
@@ -16,6 +17,7 @@ public:
 public slots:
 	void addFrame(std::shared_ptr<videoFrameData> frameData);
 	void addCropFrame(std::shared_ptr<VideoFrameWithMask> frame);
+	void addScaleFrame(std::shared_ptr<VideoFrameWithMask> frame);
 signals:
 	//void PushFrame(std::shared_ptr<Image> image);
 	void PushFrame(std::shared_ptr<videoFrameData> frameData);
@@ -24,10 +26,12 @@ public:
 	bool CanReplay(const ReplayParams& params);
 	void recvReplayParams(const ReplayParams& params);
 	CropThread* GetCropThread();
+	void SetBmdInOutVideoParams(const BmdVideoParams& input, const BmdVideoParams& output);
 protected:
 	void run() override;
 private:
 	void OnRecvViedoFrame(std::shared_ptr<VideoFrameWithMask> crop_frame) override;
+	void OnRecvScaleViedoFrame(std::shared_ptr<VideoFrameWithMask> scale_frame) override;
 	void OnChange() override;
 	void SendFrameFunc();
@@ -45,6 +49,7 @@ private:
 	/*std::queue<std::shared_ptr<videoFrameData>> replayVideoQueue1;
 	std::queue<std::shared_ptr<videoFrameData>> replayVideoQueue2;*/
 	uint32_t max_store_size;
+	uint32_t max_store_scale_size;
 	uint32_t max_store_crop_size;
 	uint32_t max_store_back_size;
 	std::atomic_bool replay_flag{ false };
@@ -57,7 +62,16 @@ private:
 	bool send_err_flag {false};
 	std::shared_ptr<CropThread> crop_thread{ nullptr };
+	std::shared_ptr<VideoScaleThread> scale_thread{ nullptr };
 	SafeMap<qint64, std::shared_ptr<VideoFrameWithMask>> storeCropMap;
+	SafeMap<qint64, std::shared_ptr<VideoFrameWithMask>> storeScaleMap;
 	SampleQueue<std::shared_ptr<videoFrameData>> needCropQueue;
 	std::thread send_frame_thread;
+	BMDDisplayMode output_display_mode;
+	BmdVideoParams output_params;
+	BmdVideoParams input_params;
+	std::atomic_bool scale_flag{ false };
 };
\ No newline at end of file
--- a/MomentaMedia/include/Threads/VideoScaleThread.h
+++ b/MomentaMedia/include/Threads/VideoScaleThread.h
+#pragma once
+#include <QThread>
+#include "Utils/SampleQueue.h"
+class VideoScaleThread : public QThread
+{
+	Q_OBJECT
+public:
+	class Listener
+	{
+	public:
+		virtual void OnRecvScaleViedoFrame(std::shared_ptr<VideoFrameWithMask>) = 0;
+	};
+public:
+	VideoScaleThread(Listener* listener);
+	~VideoScaleThread();
+public:
+	void addVideoFrame(std::shared_ptr<VideoFrameWithMask> frame);
+	void addScaleMsg(const ScaleMsg& msg);
+protected:
+	void run() override;
+private:
+	void scale();
+private:
+	SampleQueue<std::shared_ptr<VideoFrameWithMask>> video_queue;
+	SampleQueue<ScaleMsg> scale_msg_queue;
+	Listener* p_listener{NULL};
+	uint8_t* uyvy_to_i422{ NULL };
+	uint8_t* i422_scale{NULL};
+	qint32 i422_width{K4WIDTH};
+	qint32 i422_height{K4HEIGHT};
+	ScaleMsg scale_msg;
+};
\ No newline at end of file
--- a/MomentaMedia/include/Utils/Common.h
+++ b/MomentaMedia/include/Utils/Common.h
@@ -30,6 +30,12 @@
 #define K4WIDTH 3840
 #define K4HEIGHT 2160
+#define HDWIDHT 1920
+#define HDHEIGHT 1080
+#define FRAME50P 50
+#define FRAME50I 25
+#define FRAME60P 60
 enum ReplayStatus
 {
@@ -75,6 +81,14 @@ enum CropDirection
 	CD_RIGHT_LOW,
 };
+enum OutputFormat
+{
+	OF_SAME_INPUT = 0,
+	OF_1080_50I,
+	OF_1080_50P,
+	OF_2160_50P
+};
 long GetRowBytesFromPixelFormat(long width, BMDPixelFormat pixelFormat);
 static QString GetCurrDateTimeStr()
@@ -105,6 +119,73 @@ struct SendSdiParams
 	qint64 start_time;
 };
+struct BmdVideoParams
+{
+	qint32 width{0};
+	qint32 height{0};
+	qint32 frame{0};
+};
+//void OutputFmtToOutParams(BMDDisplayMode mode, OutputParams& params)
+//{
+//	switch (mode)
+//	{
+//	case bmdModeHD1080p50:
+//		params.width = HDWIDHT;
+//		params.height = HDHEIGHT;
+//		params.frame = FRAME50P;
+//		break;
+//	case bmdModeHD1080i50:
+//		params.width = HDWIDHT;
+//		params.height = HDHEIGHT;
+//		params.frame = FRAME50I;
+//		break;
+//	case bmdMode4K2160p50:
+//		params.width = K4WIDTH;
+//		params.height = K4HEIGHT;
+//		params.frame = FRAME50P;
+//		break;
+//	case bmdMode4K2160p5994:
+//	case bmdMode4K2160p60:
+//		params.width = K4WIDTH;
+//		params.height = K4HEIGHT;
+//		params.frame = FRAME60P;
+//		break;
+//	default:
+//		break;
+//	}
+//}
+static void BmdDisplayModeToBmdVideoParams(BMDDisplayMode mode, BmdVideoParams& params)
+{
+	switch (mode)
+	{
+	case bmdModeHD1080p50:
+		params.width = HDWIDHT;
+		params.height = HDHEIGHT;
+		params.frame = FRAME50P;
+		break;
+	case bmdModeHD1080i50:
+		params.width = HDWIDHT;
+		params.height = HDHEIGHT;
+		params.frame = FRAME50I;
+		break;
+	case bmdMode4K2160p50:
+		params.width = K4WIDTH;
+		params.height = K4HEIGHT;
+		params.frame = FRAME50P;
+		break;
+	case bmdMode4K2160p5994:
+	case bmdMode4K2160p60:
+		params.width = K4WIDTH;
+		params.height = K4HEIGHT;
+		params.frame = FRAME60P;
+		break;
+	default:
+		break;
+	}
+}
 struct HDRMetadata {
 	INT64 EOTF;
@@ -125,6 +206,12 @@ struct HDRMetadata {
 enum class EOTF { SDR = 0, HDR = 1, PQ = 2, HLG = 3 };
+struct ScaleMsg
+{
+	int32_t scale_width{0};
+	int32_t scale_height{0};
+};
 class CropMessage
 {
 public:

--- a/MomentaMedia/include/Utils/Settings.h
+++ b/MomentaMedia/include/Utils/Settings.h
@@ -70,6 +70,7 @@ public:
 	static CropMessage CropMsg;
 	/*static uint8_t* PicData;
 	static std::atomic_bool PicFlag;*/
 };

--- a/MomentaMedia/include/Utils/yuv4k.h
+++ b/MomentaMedia/include/Utils/yuv4k.h
@@ -16,6 +16,50 @@ protected:
 		size_t h;
 	};
 public:
+	static void UYVYToARGB4KWithMatrix(uint8_t* src, const size_t& src_step, uint8_t* dst, const size_t& dst_step, const uint32_t& width, const uint32_t& height,const BMDColorspace& space)
+	{
+		uint8_t* src1 = src;
+		uint8_t* src2 = src1 + src_step;
+		uint8_t* src3 = src2 + src_step;
+		uint8_t* src4 = src3 + src_step;
+		uint8_t* dst1 = dst;
+		uint8_t* dst2 = dst1 + dst_step;
+		uint8_t* dst3 = dst2 + dst_step;
+		uint8_t* dst4 = dst3 + dst_step;
+		{
+			switch (space)
+			{
+			case bmdColorspaceRec601:
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvI601Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvI601Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvI601Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvI601Constants, width, height);
+				break;
+			case bmdColorspaceRec709:
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvH709Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvH709Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvH709Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuvH709Constants, width, height);
+				break;
+			case bmdColorspaceRec2020:
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuv2020Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuv2020Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuv2020Constants, width, height);
+				libyuv::UYVYToARGBMatrix(src1, width << 1, dst1, width << 2, &libyuv::kYuv2020Constants, width, height);
+				break;
+			default:
+				break;
+			}
+			/*libyuv::UYVYToARGB(src1, width << 1, dst1, width << 2, width, height);
+			libyuv::UYVYToARGB(src2, width << 1, dst2, width << 2, width, height);
+			libyuv::UYVYToARGB(src3, width << 1, dst3, width << 2, width, height);
+			libyuv::UYVYToARGB(src4, width << 1, dst4, width << 2, width, height);*/
+		}
+	}
 	static void UYVYToARGB4K(uint8_t* src,const size_t& src_step,uint8_t* dst,const size_t& dst_step,const uint32_t& width,const uint32_t& height)
 	{
 		uint8_t* src1 = src;
@@ -159,6 +203,104 @@ public:
 	}
+	static void I422Scale(uint8_t* src, const size_t& src_w, const size_t& src_h, uint8_t* dst, const size_t& dst_w, const size_t& dst_h, const int32_t& omp, int32_t filter = 2)
+	{
+		size_t half_src_w = (src_w >> 1);
+		size_t half_src_h = (src_h >> 1);
+		size_t half_dst_w = (dst_w >> 1);
+		size_t half_dst_h = (dst_h >> 1);
+		uint32_t src_y_size = src_w * src_h;
+		uint32_t src_u_size = (src_w * src_h >> 1);
+		size_t  src_mode_h = src_h % 2;
+		size_t  src_mode_w = src_w % 2;
+		uint8_t* src_y = src;
+		uint8_t* src_u = src_y + src_y_size;
+		uint8_t* src_v = src_u + src_u_size;
+		uint8_t* src_y_1 = src_y;
+		uint8_t* src_y_2 = src_y + half_src_w;
+		uint8_t* src_y_3 = src_y + src_w * half_src_h;
+		uint8_t* src_y_4 = src_y + src_w * half_src_h + half_src_w;
+		uint8_t* src_u_1 = src_u;
+		uint8_t* src_u_2 = src_u + (half_src_w >> 1);
+		uint8_t* src_u_3 = src_u + (src_w * half_src_h >> 1);
+		uint8_t* src_u_4 = src_u + (src_w * half_src_h >> 1) + (half_src_w >> 1);
+		uint8_t* src_v_1 = src_v;
+		uint8_t* src_v_2 = src_v + (half_src_w >> 1);
+		uint8_t* src_v_3 = src_v + (src_w * half_src_h >> 1);
+		uint8_t* src_v_4 = src_v + (half_src_w >> 1) + (src_w * half_src_h >> 1);
+		uint32_t dst_y_size = dst_w * dst_h;
+		uint32_t dst_u_size = (dst_w * dst_h >> 1);
+		uint8_t* dst_y = dst;
+		uint8_t* dst_u = dst_y + dst_y_size;
+		uint8_t* dst_v = dst_u + dst_u_size;
+		uint8_t* dst_y_1 = dst_y;
+		uint8_t* dst_y_2 = dst_y + half_dst_w;
+		uint8_t* dst_y_3 = dst_y + dst_w * half_dst_h;
+		uint8_t* dst_y_4 = dst_y + dst_w * half_dst_h + half_dst_w;
+		uint8_t* dst_u_1 = dst_u;
+		uint8_t* dst_u_2 = dst_u + (half_dst_w >> 1);
+		uint8_t* dst_u_3 = dst_u + (dst_w * half_dst_h >> 1);
+		uint8_t* dst_u_4 = dst_u + (dst_w * half_dst_h >> 1) + (half_dst_w >> 1);
+		uint8_t* dst_v_1 = dst_v;
+		uint8_t* dst_v_2 = dst_v + (half_dst_w >> 1);
+		uint8_t* dst_v_3 = dst_v + (dst_w * half_dst_h >> 1);
+		uint8_t* dst_v_4 = dst_v + (half_dst_w >> 1) + (dst_w * half_dst_h >> 1);
+		std::vector<WH> src_wh_vec = { { half_src_w, half_src_h },{half_src_w + src_mode_w,half_src_h},{half_src_w,half_src_h + src_mode_h},{half_src_w + src_mode_w,half_src_h + src_mode_h} };
+		if (omp)
+		{
+			uint8_t* src_data[4][3] = { {src_y_1,src_u_1,src_v_1},{src_y_2,src_u_2,src_v_2},{src_y_3,src_u_3,src_v_3},{src_y_4,src_u_4,src_v_4} };
+			uint8_t* dst_data[4][3] = { {dst_y_1,dst_u_1,dst_v_1},{dst_y_2,dst_u_2,dst_v_2},{dst_y_3,dst_u_3,dst_v_3},{dst_y_4,dst_u_4,dst_v_4} };
+			int thread_num = 4;
+			omp_set_num_threads(thread_num);
+#pragma omp parallel
+			{
+#pragma omp for
+				for (int i = 0; i < thread_num; i++)
+				{
+					uint8_t* src_y = src_data[i][0];
+					uint8_t* src_u = src_data[i][1];
+					uint8_t* src_v = src_data[i][2];
+					uint8_t* dst_y = dst_data[i][0];
+					uint8_t* dst_u = dst_data[i][1];
+					uint8_t* dst_v = dst_data[i][2];
+					libyuv::I422Scale(src_y, src_w, src_u, src_w >> 1, src_v, src_w >> 1, src_wh_vec[i].w, src_wh_vec[i].h,
+						dst_y, dst_w, dst_u, dst_w >> 1, dst_v, dst_w >> 1, half_dst_w, half_dst_h, (libyuv::FilterMode)filter);
+				}
+			}
+		}
+		else
+		{
+			{
+				libyuv::I422Scale(src_y_1, src_w, src_u_1, src_w >> 1, src_v_1, src_w >> 1, src_wh_vec[0].w, src_wh_vec[0].h,
+					dst_y_1, dst_w, dst_u_1, dst_w >> 1, dst_v_1, dst_w >> 1, half_dst_w, half_dst_h, (libyuv::FilterMode)filter);
+				libyuv::I422Scale(src_y_2, src_w, src_u_2, src_w >> 1, src_v_2, src_w >> 1, src_wh_vec[1].w, src_wh_vec[1].h,
+					dst_y_2, dst_w, dst_u_2, dst_w >> 1, dst_v_2, dst_w >> 1, half_dst_w, half_dst_h, (libyuv::FilterMode)filter);
+				libyuv::I422Scale(src_y_3, src_w, src_u_3, src_w >> 1, src_v_3, src_w >> 1, src_wh_vec[2].w, src_wh_vec[2].h,
+					dst_y_3, dst_w, dst_u_3, dst_w >> 1, dst_v_3, dst_w >> 1, half_dst_w, half_dst_h, (libyuv::FilterMode)filter);
+				libyuv::I422Scale(src_y_4, src_w, src_u_4, src_w >> 1, src_v_4, src_w >> 1, src_wh_vec[3].w, src_wh_vec[3].h,
+					dst_y_4, dst_w, dst_u_4, dst_w >> 1, dst_v_4, dst_w >> 1, half_dst_w, half_dst_h, (libyuv::FilterMode)filter);
+			}
+		}
+	}
 	static void I422Scale4K(uint8_t* src, const size_t& src_w, const size_t& src_h, uint8_t* dst, const size_t& dst_w, const size_t& dst_h, const int32_t& omp, int32_t filter = 2)
 	{
 		size_t half_src_w = (src_w >> 1);

--- a/MomentaMedia/src/BlackMagicDesign/DeckLinkInputDevice.cpp
+++ b/MomentaMedia/src/BlackMagicDesign/DeckLinkInputDevice.cpp
@@ -217,6 +217,7 @@ HRESULT DeckLinkInputDevice::VideoInputFormatChanged(BMDVideoInputFormatChangedE
 		else
 			// Invalid color depth for YUV
 			return E_INVALIDARG;
+		pixelFormat = bmdFormat8BitYUV;
 	}
 	else
 		// Unexpected detected video input format flags
@@ -230,7 +231,7 @@ HRESULT DeckLinkInputDevice::VideoInputFormatChanged(BMDVideoInputFormatChangedE
 		// Set the video input mode
 		//pixelFormat = bmdFormat8BitYUV;
-		result = DeckLinkInput->EnableVideoInput(displayMode, pixelFormat, bmdVideoInputEnableFormatDetection);
+		result = DeckLinkInput->EnableVideoInput(displayMode, pixelFormat, bmdVideoInputFlagDefault);
 		if (result == S_OK)
 			// Start the capture
@@ -297,7 +298,7 @@ bool DeckLinkInputDevice::StartCapture(BMDDisplayMode displayMode, IDeckLinkScre
 	DeckLinkInput->SetCallback(this);
 	// Set the video input mode
-	if (DeckLinkInput->EnableVideoInput(bmdMode4K2160p50, bmdFormat8BitYUV, bmdVideoInputFlagDefault) != S_OK)
+	if (DeckLinkInput->EnableVideoInput(displayMode, bmdFormat8BitYUV, videoInputFlags) != S_OK)
 		return false;
 	if (DeckLinkInput->EnableAudioInput(bmdAudioSampleRate48kHz, bmdAudioSampleType32bitInteger, Settings::AudioChannel) != S_OK)

--- a/MomentaMedia/src/BlackMagicDesign/DeckLinkInputPage.cpp
+++ b/MomentaMedia/src/BlackMagicDesign/DeckLinkInputPage.cpp
@@ -158,7 +158,7 @@ void DeckLinkInputPage::StartCapture()
 	if (!SelectedDevice)
 		return;
-	BMDDisplayMode displayMode = bmdModeUnknown;
+	BMDDisplayMode displayMode = bmdModeNTSC;
 	bool applyDetectedInputMode = AutoDetectCheckBox->isChecked();
 	displayMode = (BMDDisplayMode)VideoFormatCombo->currentData().value<unsigned int>();

--- a/MomentaMedia/src/BlackMagicDesign/DeckLinkOutputDevice.cpp
+++ b/MomentaMedia/src/BlackMagicDesign/DeckLinkOutputDevice.cpp
@@ -18,7 +18,7 @@
 //extern int SdiOutWaitNums;
 //extern std::map<qint32, qint32> map_output_delay;
-#define OUTPUT_1080 1
+//#define OUTPUT_1080 1
 #define AUDIOMAXSIZE (OutputDeleyTime + 3) * 50
 #define TESTWRITEFILE 0
 #define USETHREADS 1
@@ -148,25 +148,38 @@ HRESULT DeckLinkOutputDevice::ScheduledPlaybackHasStopped()
 	return S_OK;
 }
+void DeckLinkOutputDevice::GetInOutVideoParams(BmdVideoParams& input_params, BmdVideoParams& output_params)
+{
+	input_params = this->input_params;
+	output_params = this->output_params;
+}
 bool DeckLinkOutputDevice::StartPlayback(BMDDisplayMode displayMode, bool enable3D, BMDPixelFormat pixelFormat, bool requireReferenceLocked, IDeckLinkScreenPreviewCallback* screenPreviewCallback)
 {
-	BMDDisplayMode outputDisplayMode;
+	//BMDDisplayMode outputDisplayMode;
-#if OUTPUT_1080
 	switch (Settings::OutputPlayMode)
 	{
-	case 1:
+	case OF_SAME_INPUT:
+		outputDisplayMode = displayMode;
+		break;
+	case OF_1080_50I:
 		outputDisplayMode = bmdModeHD1080i50;
 		break;
+	case OF_1080_50P:
+		outputDisplayMode = bmdModeHD1080p50;
+		break;
+	case OF_2160_50P:
+		outputDisplayMode = bmdMode4K2160p50;
+		break;
 	default:
 		//outputDisplayMode = bmdModeHD1080p25;
 		outputDisplayMode = displayMode;
 		break;
 	}
-#else
-	outputDisplayMode = BMDDisplayMode::bmdModeHD720p50;
-#endif
+	BmdDisplayModeToBmdVideoParams(outputDisplayMode, output_params);
+	BmdDisplayModeToBmdVideoParams(displayMode, input_params);
 	// Pass through RP188 timecode and VANC from input frame. VITC timecode is forwarded with VANC
 	BMDVideoOutputFlags outputFlags = (BMDVideoOutputFlags)(bmdVideoOutputRP188 | bmdVideoOutputVANC);
@@ -188,10 +201,10 @@ bool DeckLinkOutputDevice::StartPlayback(BMDDisplayMode displayMode, bool enable
 		return false;
 	}
-	if (outputDisplayMode == bmdModeHD1080i50)
+	/*if (outputDisplayMode == bmdModeHD1080i50)
 		qDebug() << "output display HD1080i50";
 	else if (outputDisplayMode == bmdModeHD1080p25)
-		qDebug() << "output display HD1080p25";
+		qDebug() << "output display HD1080p25";*/
 	if (deckLinkOutput->GetDisplayMode(outputDisplayMode, deckLinkDisplayMode.ReleaseAndGetAddressOf()) != S_OK)
 	{

--- a/MomentaMedia/src/BlackMagicDesign/DeckLinkOutputPage.cpp
+++ b/MomentaMedia/src/BlackMagicDesign/DeckLinkOutputPage.cpp
@@ -309,6 +309,7 @@ void DeckLinkOutputPage::RequestedDeviceGranted(ComPtr<IDeckLink>& device)
 	//connect(this, &DeckLinkOutputPage::PushMask, ProcessMask.get(), &ProcessMaskThread::addMaskBuffer, Qt::DirectConnection);
 	connect(ProcessMask.get(), &ProcessMaskThread::PushFrame, SelectedDevice.Get(), &DeckLinkOutputDevice::AddVideoFrameMask, Qt::DirectConnection);
 	connect(ProcessMask.get(), &ProcessMaskThread::PushFrameToReplay, BindingInputPage->GetReplay(), &ReplayThread::addCropFrame, Qt::DirectConnection);
+	connect(ProcessMask.get(), &ProcessMaskThread::PushScaleFrameToReplay, BindingInputPage->GetReplay(), &ReplayThread::addScaleFrame, Qt::DirectConnection);
 	if (Zoom)
 	{
@@ -352,11 +353,11 @@ void DeckLinkOutputPage::RequestedDeviceGranted(ComPtr<IDeckLink>& device)
 		NDIOutput->start();
 	}
 	StartOutput();
-#if  USE_TIMEPLUS
+//#if  USE_TIMEPLUS
-	SelectedDevice->StartPlayback(bmdMode4K2160p50, false, bmdFormat8BitYUV, false, PreviewView->GetDelegate());
+//	SelectedDevice->StartPlayback(bmdMode4K2160p50, false, bmdFormat8BitYUV, false, PreviewView->GetDelegate());
-#elif USE_H2V
+//#elif USE_H2V
-	SelectedDevice->StartPlayback(bmdModeHD1080p50, false, bmdFormat10BitYUV, false, PreviewView->GetDelegate());
+//	SelectedDevice->StartPlayback(bmdModeHD1080p50, false, bmdFormat10BitYUV, false, PreviewView->GetDelegate());
-#endif //  USE_TIMEPLUS
+//#endif //  USE_TIMEPLUS
@@ -488,10 +489,16 @@ void DeckLinkOutputPage::FormatChanged(BMDDisplayMode displayMode)
 	if (firstFmtChanged) firstFmtChanged = false;
 	DisplayMode = displayMode;
 	//BMDPixelFormat pixelFormat = bmdFormat10BitYUV;
-	PixelFormat = bmdFormat10BitYUV;
+	PixelFormat = bmdFormat8BitYUV;
 	SelectedDevice->StartPlayback(DisplayMode, false, PixelFormat, false, PreviewView->GetDelegate());
+	BmdVideoParams input_params, output_params;
+	SelectedDevice->GetInOutVideoParams(input_params, output_params);
+	if (ProcessMask) ProcessMask->SetBmdInOutVideoParams(input_params,output_params);
+	if (BindingInputPage->GetReplay()) BindingInputPage->GetReplay()->SetBmdInOutVideoParams(input_params, output_params);
 	int indexToSelect = DeviceListCombo->findData(QVariant::fromValue((void*)SelectedDevice->GetDeckLinkInstance().Get()));
 	bool active = SelectedDevice->isPlaybackActive();
 	QToolBox* toolBox = findParent<QToolBox>(this);

--- a/MomentaMedia/src/BlackMagicDesign/OpenFile.cpp
+++ b/MomentaMedia/src/BlackMagicDesign/OpenFile.cpp
@@ -24,16 +24,17 @@ void COpenFile::doOpenFile()
            qDebug() << "read pic fail........." << "\n";
            return;
        }
-        if (bk_mat.cols != K4WIDTH || bk_mat.rows != K4HEIGHT) return;
+        //if (bk_mat.cols != K4WIDTH || bk_mat.rows != K4HEIGHT) return;
+        qint32 w = bk_mat.cols, h = bk_mat.rows;
        //cv::cvtColor(bk_mat, bk_mat, cv::COLOR_RGB2RGBA);
        emit tellFilePath(path);
-        qint32 size = K4WIDTH * K4HEIGHT << 1;
+        qint32 size = w * h << 1;
-        uint8_t* data = new uint8_t[K4WIDTH * K4HEIGHT << 1];
+        uint8_t* data = new uint8_t[w * h << 1];
        cv::cvtColor(bk_mat, bk_mat, cv::COLOR_BGR2BGRA);
-        Yuv4k::ARGBToUYVY4K(bk_mat.data, K4WIDTH * K4HEIGHT, data, K4WIDTH * K4HEIGHT >> 1, K4WIDTH >> 1, K4HEIGHT >> 1);
+        Yuv4k::ARGBToUYVY4K(bk_mat.data, w * h, data, w * h >> 1, w >> 1, h >> 1);
-        std::shared_ptr<VideoFrameWithMask> frame = std::make_shared<VideoFrameWithMask>(K4WIDTH,K4HEIGHT, data,bmdFormat8BitYUV, BS_IDEL, kDefaultHLGBT2020HDRMetadata);
+        std::shared_ptr<VideoFrameWithMask> frame = std::make_shared<VideoFrameWithMask>(w,h, data,bmdFormat8BitYUV, BS_IDEL, kDefaultHLGBT2020HDRMetadata);
        emit SendPicData(frame);
    }
 }
\ No newline at end of file
--- a/MomentaMedia/src/NDI/NDIOutputThread.cpp
+++ b/MomentaMedia/src/NDI/NDIOutputThread.cpp
@@ -220,6 +220,16 @@ void NDIOutputThread::run()
 				VideoMaskQueue.Pop(frame_mask);
 				if (!frame_mask || (!frame_mask->data_ && !frame_mask->pImage && !frame_mask->pImage->data)) continue;
+				if (frame_mask->flag_ == BS_START)
+				{
+					if (send_one_frame_flag)
+					{
+						send_one_frame_flag = false;
+						send_count = 0;
+						end_blend_frame_ = nullptr;
+					}
+				}
 				if (Settings::NdiOneFrameDuration > 0 && frame_mask->flag_ == BS_END)
 				{
 					end_blend_frame_ = frame_mask;
@@ -252,6 +262,9 @@ void NDIOutputThread::run()
 					if (frame_mask->data_) Frame.p_data = frame_mask->data_;
 					else Frame.p_data = frame_mask->pImage->uyvy_data;
 				}
+				Frame.xres = frame_mask->width_;
+				Frame.yres = frame_mask->height_;
+				Frame.line_stride_in_bytes = Frame.xres * 2;
 				Frame.timestamp = frame_mask->timestamp_;
 				Frame.timecode = frame_mask->timestamp_  * 1000;
 				NDIlib_send_send_video_v2(Instance, &Frame);

--- a/MomentaMedia/src/Threads/CaptureThread.cpp
+++ b/MomentaMedia/src/Threads/CaptureThread.cpp
@@ -5,11 +5,18 @@
 #include "Utils/Memory4k.h"
 #include <omp.h>
 #include "Utils/Settings.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/row.h"
 #pragma intrinsic(memcpy)
 //extern int RecordFlag;
 //extern int OpenOMP;
+//extern "C"
+//{
+//	 extern const struct libyuv::YuvConstants libyuv::kYuv2020Constants;
+//}
 static int64_t GetCurrTimeMS()
 {
@@ -155,7 +162,8 @@ void CaptureThread::run()
 					if (!Settings::OpenOMP)
 					{
 						size_t yuv_size = width_ * height_ << 1;
-						Yuv4k::UYVYToARGB4K(videoFrame->uyvy_data, yuv_size, videoFrame->data, once_size, width_, height_);
+						//Yuv4k::UYVYToARGB4K(videoFrame->uyvy_data, yuv_size, videoFrame->data, once_size, width_, height_);
+						Yuv4k::UYVYToARGB4KWithMatrix(videoFrame->uyvy_data, yuv_size, videoFrame->data, once_size, width_, height_,videoFrame->meta_.colorspace);
 					}
 					else
 					{
@@ -175,7 +183,24 @@ void CaptureThread::run()
 								auto dst = videoFrame->data + i * once_size;
 								auto src = videoFrame->uyvy_data + i * yuv_size;
+								switch (videoFrame->meta_.colorspace)
+								{
+								case bmdColorspaceRec601:
+									libyuv::UYVYToARGBMatrix(src, width_ << 1, dst, width_ << 2, &libyuv::kYuvI601Constants, width_, height_);
+									break;
+								case bmdColorspaceRec709:
+									libyuv::UYVYToARGBMatrix(src, width_ << 1, dst, width_ << 2, &libyuv::kYuvI601Constants, width_, height_);
+									break;
+								case bmdColorspaceRec2020:
+									libyuv::UYVYToARGBMatrix(src, width_ << 1, dst, width_ << 2, &libyuv::kYuv2020Constants, width_, height_);
+									break;
+								default:
 									libyuv::UYVYToARGB(src, width_ << 1, dst, width_ << 2, width_, height_);
+									break;
+								}
+								//libyuv::UYVYToARGBMatrix(src, width_ << 1, dst, width_ << 2, &libyuv::kYuvI601Constants, width_, height_);
+								//libyuv::UYVYToARGB(src, width_ << 1, dst, width_ << 2, width_, height_);
 								//qDebug() << "get omp thread id:" << omp_get_thread_num() << "\n";
 								//memcpy(dst, src, once_size);
 							}

--- a/MomentaMedia/src/Threads/CropThread.cpp
+++ b/MomentaMedia/src/Threads/CropThread.cpp
@@ -103,10 +103,10 @@ void CropThread::CropScale()
 					{
 					}
-					else if (p_frame->fmt_ == bmdFormat8BitYUV && (p_frame->data_ || (p_frame->pImage && p_frame->pImage->data)))
+					else if (p_frame->fmt_ == bmdFormat8BitYUV && (p_frame->data_ /*|| (p_frame->pImage && p_frame->pImage->data)*/))
 					{
-						uyvy_data = (p_frame->data_ ? p_frame->data_ : p_frame->pImage->data);
+						//uyvy_data = (p_frame->data_ ? p_frame->data_ : p_frame->pImage->data);
+						uyvy_data = p_frame->data_;
 						Yuv4k::UYVYCopy(uyvy_data, crop_msg.crop_x, crop_msg.crop_y, p_frame->width_, p_frame->height_, uyvy_crop_data, crop_msg.crop_w, crop_msg.crop_h);
 						Yuv4k::UYVYToI422(uyvy_crop_data, i422_crop_data, crop_msg.crop_w, crop_msg.crop_h);
 						Yuv4k::I422Scale4K(i422_crop_data, crop_msg.crop_w, crop_msg.crop_h, i422_4k_data, p_frame->width_, p_frame->height_, Settings::ZoomUseOmp);

--- a/MomentaMedia/src/Threads/ProcessMaskThread.cpp
+++ b/MomentaMedia/src/Threads/ProcessMaskThread.cpp
@@ -135,9 +135,27 @@ CropThread* ProcessMaskThread::GetCropThread()
 	return crop_thread.get();
 }
+void ProcessMaskThread::SetBmdInOutVideoParams(const BmdVideoParams& input, const BmdVideoParams& output)
+{
+	input_params = input;
+	output_params = output;
+	if (input_params.width != output_params.width || input_params.height != output_params.height) {
+		scale_flag = true;
+		if (!scale_thread)
+		{
+			scale_thread = std::make_shared<VideoScaleThread>(this);
+			ScaleMsg msg{output.width,output.height};
+			scale_thread->addScaleMsg(msg);
+			scale_thread->start();
+		}
+	}
+	else scale_flag = false;
+}
 void ProcessMaskThread::ClearQueue()
 {
 	if(taskImageQueue.Size()) taskImageQueue.Reset();
+	if (!replay_flag_clear) replay_flag_clear = true;
 }
 void ProcessMaskThread::run()
@@ -242,6 +260,18 @@ void ProcessMaskThread::OnRecvViedoFrame(std::shared_ptr<VideoFrameWithMask> fra
 	}
 }
+void ProcessMaskThread::OnRecvScaleViedoFrame(std::shared_ptr<VideoFrameWithMask> frame)
+{
+	if (frame)
+	{
+		emit PushFrame(frame);
+		if (!frame->replay_flag && frame->flag_ == BS_IDEL)
+		{
+			emit PushScaleFrameToReplay(frame);
+		}
+	}
+}
 void ProcessMaskThread::CropScaleWithUYVY()
 {
 	//while (true)
@@ -339,6 +369,21 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 	uint8_t* alpha = NULL;
 	bool err = false;
+	if (replay_flag_clear)
+	{
+		if (dynamic_flag)
+		{
+			dynamic_flag = false;
+			dynamic_out_num = 0;
+		}
+		status = BS_IDEL;
+		memset(tmp_alpha, 0, width * height);
+		last_masked = nullptr;
+		if(masked_map.size()) masked_map.clear();
+		memset(bk_argb, 0, width * height << 2);
+		replay_flag_clear = false;
+	}
 	if (mask_flag && mask_buffer)
 	{
 		x = mask_buffer->upper_left_point.x;
@@ -403,6 +448,7 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 			if (dynamic_flag)
 			{
 				dynamic_flag = false;
+				dynamic_out_num = 0;
 				status = BS_IDEL;
 				memset(tmp_alpha, 0, width * height);
 				last_masked = nullptr;
@@ -491,7 +537,9 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 				libyuv::ARGBAttenuate(crop_buffer_argb, crop_width << 2, crop_buffer_argb, crop_width << 2, crop_width, crop_height);
 				libyuv::ARGBBlend(crop_buffer_argb, crop_width << 2, bk_argb + offset, width << 2, bk_argb + offset, width << 2, crop_width, crop_height);
 				Rect cross_rect;
 				//outputAlpha2(buffer, last_masked, cross_rect);
 				outputAlphaRect(buffer, last_rect, cross_rect);
@@ -590,6 +638,7 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 		libyuv::ARGBCopy(src_bgra + offset_b, width << 2, tmp_bgra, width << 2, mask_rect.width, mask_rect.height);
 		libyuv::ARGBBlend(bk_argb + offset_b, width << 2, tmp_bgra, width << 2, tmp_bgra, width << 2, mask_rect.width, mask_rect.height);
 		qint32 x_ = ((mask_rect.x + 1) >> 1 << 1);
 		qint32 width_ = (x_ == mask_rect.x ? (mask_rect.width >> 1 << 1) : ((mask_rect.width - 1) >> 1 << 1));
 		qint32 height_ = (mask_rect.height);
@@ -627,6 +676,10 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 		video_frame->replay_flag = replay_flag;
 		//std::shared_ptr<VideoFrameWithMask> video_frame = std::make_shared<VideoFrameWithMask>(pImage, status, start_time);
 		if (Settings::CropFlag && /*(crop_msg.crop_x >= 0 || crop_msg.crop_y >= 0) &&*/ crop_thread) crop_thread->addVideoFrame(video_frame);
+		else if (scale_flag && scale_thread) {
+			//if (!scale_thread) scale_thread = std::make_shared<VideoScaleThread>(this);
+			scale_thread->addVideoFrame(video_frame);
+		}
 		else
 		{
 			emit PushFrame(video_frame);
@@ -676,7 +729,7 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 	else
 	{
 		//emit PushFrame(video_frame);
-		if (Settings::CropFlag && /*(crop_msg.crop_x >=0 || crop_msg.crop_y >= 0) &&*/ crop_thread)
+		if ((Settings::CropFlag && crop_thread) /*|| (scale_flag && scale_thread)*/)
 		{
 			//auto t2 = TimeMilliSecond();
 			/*size_t dst_argb_size = width * height << 2;
@@ -713,9 +766,15 @@ void ProcessMaskThread::workMaskWithUYVY(const std::shared_ptr<videoFrameData>& 
 			std::shared_ptr<VideoFrameWithMask> video_frame_i = std::make_shared<VideoFrameWithMask>(width, height, timestamp, pImage->sequenceNum, dst_uyvy, bmdFormat8BitYUV, pImage->flag_, pImage->meta_);
 			//std::shared_ptr<VideoFrameWithMask> video_frame = std::make_shared<VideoFrameWithMask>(pImage, status);
-			if(crop_thread) crop_thread->addVideoFrame(video_frame_i);
+			if (crop_thread)  crop_thread->addVideoFrame(video_frame_i);
+			//if (scale_thread) scale_thread->addVideoFrame(video_frame_i);
 			//qDebug() << "memcpy duration:" << TimeMilliSecond() - t2 << "\n";
 		}
+		else if (scale_flag && scale_thread) {
+			std::shared_ptr<VideoFrameWithMask> video_frame = std::make_shared<VideoFrameWithMask>(pImage, pImage->flag_);
+			scale_thread->addVideoFrame(video_frame);
+		}
 		else
 		{
 			std::shared_ptr<VideoFrameWithMask> video_frame = std::make_shared<VideoFrameWithMask>(pImage, pImage->flag_);

--- a/MomentaMedia/src/Threads/ReplayThread.cpp
+++ b/MomentaMedia/src/Threads/ReplayThread.cpp
@@ -11,6 +11,7 @@ ReplayThread::ReplayThread()
 	max_store_size = Settings::ReplayStoreTime  * Settings::FrameRate / 1000;
 	max_store_back_size = Settings::ReplayBackStoreTime * Settings::FrameRate / 1000;
 	max_store_crop_size = Settings::ReplayCropStoreTime * Settings::FrameRate / 1000;
+	max_store_scale_size = max_store_crop_size;
 	interval = 1000 / Settings::FrameRate;
 }
@@ -33,6 +34,19 @@ void ReplayThread::addCropFrame(std::shared_ptr<VideoFrameWithMask> frame)
 }
+void ReplayThread::addScaleFrame(std::shared_ptr<VideoFrameWithMask> frame)
+{
+	if (frame && (frame->data_ || frame->pImage))
+	{
+		if (replay_params.status == RS_IDEL || replay_params.status == RS_END)
+			storeScaleMap.Insert(frame->timestamp_, frame);
+		if (storeScaleMap.Size() > max_store_scale_size)
+		{
+			storeScaleMap.Pop();
+		}
+	}
+}
 void ReplayThread::addFrame(std::shared_ptr<videoFrameData> frameData)
 {
 	if(frameData && frameData->data)
@@ -128,6 +142,23 @@ CropThread* ReplayThread::GetCropThread()
 	return crop_thread.get();
 }
+void ReplayThread::SetBmdInOutVideoParams(const BmdVideoParams& input, const BmdVideoParams& output)
+{
+	input_params = input;
+	output_params = output;
+	if (input_params.width != output_params.width || input_params.height != output_params.height) {
+		scale_flag = true;
+		if (!scale_thread)
+		{
+			scale_thread = std::make_shared<VideoScaleThread>(this);
+			ScaleMsg msg{ output.width,output.height };
+			scale_thread->addScaleMsg(msg);
+			scale_thread->start();
+		}
+	}
+	else scale_flag = false;
+}
 void ReplayThread::OnRecvViedoFrame(std::shared_ptr<VideoFrameWithMask> crop_frame)
 {
 	if (crop_frame)
@@ -137,6 +168,15 @@ void ReplayThread::OnRecvViedoFrame(std::shared_ptr<VideoFrameWithMask> crop_fra
 	}
 }
+void ReplayThread::OnRecvScaleViedoFrame(std::shared_ptr<VideoFrameWithMask> scale_frame)
+{
+	if(scale_frame)
+	{
+		storeCropMap.EnsureInsert(scale_frame->timestamp_, scale_frame);
+		replayVideoSdiVec.push_back(scale_frame);
+	}
+}
 void ReplayThread::OnChange()
 {
@@ -239,6 +279,7 @@ void ReplayThread::run()
 			}
 			qint64 rp_begin = in_tm  - dy_in_count;
+			if (dy_out_count <= 0) dy_out_count = 1;
 			qint64 rp_end   = out_tm + dy_out_count;
 			auto begin_tm = storeVideoMap.begin()->first;
@@ -296,6 +337,7 @@ void ReplayThread::run()
 				for (auto itor = itor_begin;; itor++)
 				{
 					if (itor == in_itor)
+					//if (itor == ndi_itor)
 					{
 						break;
 					}
@@ -318,6 +360,19 @@ void ReplayThread::run()
 							needCropQueue.Push(frame);
 						}
 					}
+					else if (scale_flag)
+					{
+						std::shared_ptr<VideoFrameWithMask> scale_frame = nullptr;
+						if(storeScaleMap.Find(itor->first, scale_frame))
+						{ 
+							replayVideoSdiVec.push_back(scale_frame);
+						}
+						else
+						{
+							std::shared_ptr<VideoFrameWithMask> video_frame = std::make_shared<VideoFrameWithMask>(frame, frame->flag_);
+							scale_thread->addVideoFrame(video_frame);
+						}
+					}
 					else
 					{
 						std::shared_ptr<VideoFrameWithMask> video_frame = std::make_shared<VideoFrameWithMask>(frame, frame->flag_);

--- a/MomentaMedia/src/Threads/VideoScaleThread.cpp
+++ b/MomentaMedia/src/Threads/VideoScaleThread.cpp
+#include "Threads/VideoScaleThread.h"
+#include "Utils/yuv4k.h"
+#include "Utils/Settings.h"
+VideoScaleThread::VideoScaleThread(Listener* listener) :p_listener(listener)
+{
+	if(!uyvy_to_i422) uyvy_to_i422 = new uint8_t[i422_width * i422_height << 1];
+}
+VideoScaleThread::~VideoScaleThread()
+{
+	if (uyvy_to_i422) delete uyvy_to_i422;
+	if (i422_scale) delete i422_scale;
+}
+void VideoScaleThread::addVideoFrame(std::shared_ptr<VideoFrameWithMask> frame)
+{
+	if(frame) video_queue.Push(frame);
+}
+void VideoScaleThread::addScaleMsg(const ScaleMsg& msg)
+{
+	scale_msg_queue.Push(msg);
+}
+void VideoScaleThread::run()
+{
+	scale();
+}
+void VideoScaleThread::scale()
+{
+	while (true)
+	{
+		std::shared_ptr<VideoFrameWithMask> p_frame = nullptr;
+		if (video_queue.WaitFor(p_frame))
+		{
+			if (scale_msg_queue.Size())
+			{
+				ScaleMsg tmp_msg;
+				scale_msg_queue.PopLast(tmp_msg);
+				/*if (!crop_msg.equal(tmp_msg) && p_listener)
+				{
+					p_listener->OnChange();
+				}*/
+				if (tmp_msg.scale_width && tmp_msg.scale_height)
+				{
+					if (tmp_msg.scale_width != scale_msg.scale_width || tmp_msg.scale_height != scale_msg.scale_height)
+					{
+						if (i422_scale) delete i422_scale;
+						i422_scale = new uint8_t[tmp_msg.scale_width * tmp_msg.scale_height << 1];
+					}
+					scale_msg = tmp_msg;
+				}
+			}
+			if (p_frame)
+			{
+				uint8_t* uyvy_data = NULL;
+				if (p_frame->fmt_ == bmdFormat8BitBGRA && p_frame->data_)
+				{
+				}
+				else if (p_frame->fmt_ == bmdFormat8BitYUV && (p_frame->data_ || (p_frame->pImage && p_frame->pImage->uyvy_data)))
+				{
+					uyvy_data = (p_frame->data_ ? p_frame->data_ : p_frame->pImage->uyvy_data);
+					//uyvy_data = p_frame->data_;
+					auto src_w = p_frame->width_;
+					auto src_h = p_frame->height_;
+					if (src_w * src_h > i422_width * i422_height)
+					{
+						delete uyvy_to_i422;
+						uyvy_to_i422 = new uint8_t[src_w * src_h << 1];
+						i422_width = src_w;
+						i422_height = src_h;
+					}
+					Yuv4k::UYVYToI422(uyvy_data, uyvy_to_i422, src_w, src_h);
+					Yuv4k::I422Scale(uyvy_to_i422, src_w, src_h, i422_scale, scale_msg.scale_width, scale_msg.scale_height, Settings::ZoomUseOmp);
+					uint8_t* scale_uyvy = new uint8_t[scale_msg.scale_width * scale_msg.scale_height << 1];
+					Yuv4k::I422ToUYVY(i422_scale, scale_uyvy, scale_msg.scale_width, scale_msg.scale_height, 0);
+					p_frame->width_ = scale_msg.scale_width;
+					p_frame->height_ = scale_msg.scale_height;
+					p_frame->size_ = scale_msg.scale_width * scale_msg.scale_height << 1;
+					if (p_frame->data_) delete p_frame->data_;
+					p_frame->data_ = scale_uyvy;
+				}
+				if (p_listener) p_listener->OnRecvScaleViedoFrame(p_frame);
+			}
+		}
+	}
+}
\ No newline at end of file
--- a/MomentaMedia/src/TimePlus.cpp
+++ b/MomentaMedia/src/TimePlus.cpp
@@ -161,6 +161,8 @@ MomentaMedia::MomentaMedia(QWidget *parent)
    OutputModeCombo->addItem(QString::fromWCharArray(L"跟随输入格式"));
    OutputModeCombo->addItem("1080 50I");
+    OutputModeCombo->addItem("1080 50P");
+    OutputModeCombo->addItem("2160 50P");
    OutputModeCombo->setCurrentIndex(mode);
    connect(OutputModeCombo, QOverload<int>::of(&QComboBox::currentIndexChanged), this, &MomentaMedia::OutputModeChanged);

--- a/x64/Debug/FigureOut.exe
+++ b/x64/Debug/FigureOut.exe
--- a/x64/Debug/FigureOut.ilk
+++ b/x64/Debug/FigureOut.ilk
--- a/x64/Debug/libyuv.dll
+++ b/x64/Debug/libyuv.dll
--- a/x64/Debug/pic/frame.png
+++ b/x64/Debug/pic/frame.png
--- a/x64/Debug/settings.ini
+++ b/x64/Debug/settings.ini
@@ -29,7 +29,7 @@ CROP_X=3200
 CROP_Y=1800
 CROP_DIRECTION=4
 USE_PIC_FLAG=1
-OUTPUT_PLAY_MODE=0
+OUTPUT_PLAY_MODE=2
 AUDIO_CHANNEL=2
 ASPEC_DEN=9
 ASPEC_NUM=16

--- a/x64/Release/FigureOut.exe
+++ b/x64/Release/FigureOut.exe
--- a/x64/Release/libyuv.dll
+++ b/x64/Release/libyuv.dll