FFmpeg 7.1 + cuframes input demuxer

Snapshot FFmpeg n7.1 (release tag) с применённым patch'ем для cuframes input format. Используется как FFMPEG_REPO_OVERRIDE в NickM-27/FFmpeg-Builds fork для статической сборки patched binary под Frigate (Debian 12 / glibc 2.36). Apply changes: + libavformat/cuframesdec.c (новый — реализация демуксера) M libavformat/Makefile (CONFIG_CUFRAMES_DEMUXER target) M libavformat/allformats.c (extern declaration) M configure (--enable-libcuframes option + dep check) Patch source: https://git.goldix.org/gx/cuframes (filter/ffmpeg-7.1-cuframes-demuxer.patch) History сброшена (snapshot вместо fork) потому что upstream shallow clone не позволял push в gitea. Полная история FFmpeg — на github.com/FFmpeg/FFmpeg n7.1.
2026-05-17 11:43:10 +01:00
commit dcecd42de4
8554 changed files with 2122707 additions and 0 deletions
@@ -0,0 +1,40 @@
+OBJS-$(CONFIG_H264CHROMA)             += loongarch/h264chroma_init_loongarch.o
+OBJS-$(CONFIG_H264QPEL)               += loongarch/h264qpel_init_loongarch.o
+OBJS-$(CONFIG_H264DSP)                += loongarch/h264dsp_init_loongarch.o
+OBJS-$(CONFIG_H264PRED)               += loongarch/h264_intrapred_init_loongarch.o
+OBJS-$(CONFIG_VP8_DECODER)            += loongarch/vp8dsp_init_loongarch.o
+OBJS-$(CONFIG_VP9_DECODER)            += loongarch/vp9dsp_init_loongarch.o
+OBJS-$(CONFIG_VC1DSP)                 += loongarch/vc1dsp_init_loongarch.o
+OBJS-$(CONFIG_HPELDSP)                += loongarch/hpeldsp_init_loongarch.o
+OBJS-$(CONFIG_IDCTDSP)                += loongarch/idctdsp_init_loongarch.o
+OBJS-$(CONFIG_VIDEODSP)               += loongarch/videodsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
+LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
+LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
+                                         loongarch/h264_deblock_lasx.o
+LASX-OBJS-$(CONFIG_VC1DSP)            += loongarch/vc1dsp_lasx.o
+LASX-OBJS-$(CONFIG_HPELDSP)           += loongarch/hpeldsp_lasx.o
+LASX-OBJS-$(CONFIG_IDCTDSP)           += loongarch/simple_idct_lasx.o  \
+                                         loongarch/idctdsp_lasx.o
+LSX-OBJS-$(CONFIG_VP8_DECODER)        += loongarch/vp8_mc_lsx.o \
+                                         loongarch/vp8_lpf_lsx.o
+LSX-OBJS-$(CONFIG_VP9_DECODER)        += loongarch/vp9_mc_lsx.o \
+                                         loongarch/vp9_intra_lsx.o \
+                                         loongarch/vp9_lpf_lsx.o \
+                                         loongarch/vp9_idct_lsx.o
+LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
+                                         loongarch/hevc_idct_lsx.o \
+                                         loongarch/hevc_lpf_sao_lsx.o \
+                                         loongarch/hevc_mc_bi_lsx.o \
+                                         loongarch/hevc_mc_uni_lsx.o \
+                                         loongarch/hevc_mc_uniw_lsx.o \
+                                         loongarch/hevc_add_res.o \
+                                         loongarch/hevc_mc.o \
+                                         loongarch/hevc_idct.o
+LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
+                                         loongarch/h264idct_loongarch.o \
+                                         loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264QPEL)           += loongarch/h264qpel.o \
+                                         loongarch/h264qpel_lsx.o
+LSX-OBJS-$(CONFIG_H264CHROMA)         += loongarch/h264chroma.o
+LSX-OBJS-$(CONFIG_H264PRED)           += loongarch/h264intrapred.o
@@ -0,0 +1,239 @@
+/*
+ * Loongson  optimized cabac
+ *
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Gu Xiwei(guxiwei-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_CABAC_H
+#define AVCODEC_LOONGARCH_CABAC_H
+
+#include "libavutil/attributes.h"
+#include "libavcodec/cabac.h"
+#include "config.h"
+
+#define GET_CABAC_LOONGARCH_UNCBSR                                      \
+    "ld.bu        %[bit],        %[state],       0x0           \n\t"    \
+    "andi         %[tmp0],       %[c_range],     0xC0          \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[tables]     \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[bit]        \n\t"    \
+    /* tmp1: RangeLPS */                                                \
+    "ld.bu        %[tmp1],       %[tmp0],        %[lps_off]    \n\t"    \
+                                                                        \
+    "sub.d        %[c_range],    %[c_range],     %[tmp1]       \n\t"    \
+    "slli.d       %[tmp0],       %[c_range],     0x11          \n\t"    \
+    "bge          %[tmp0],       %[c_low],       1f            \n\t"    \
+    "move         %[c_range],    %[tmp1]                       \n\t"    \
+    "nor          %[bit],        %[bit],         %[bit]        \n\t"    \
+    "sub.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "1:                                                        \n\t"    \
+    /* tmp1: *state */                                                  \
+    "add.d        %[tmp0],       %[tables],      %[bit]        \n\t"    \
+    "ld.bu        %[tmp1],       %[tmp0],        %[mlps_off]   \n\t"    \
+    /* tmp2: lps_mask */                                                \
+    "add.d        %[tmp0],       %[tables],      %[c_range]    \n\t"    \
+    "ld.bu        %[tmp2],       %[tmp0],        %[norm_off]   \n\t"    \
+                                                                        \
+    "andi         %[bit],        %[bit],         0x01          \n\t"    \
+    "st.b         %[tmp1],       %[state],       0x0           \n\t"    \
+    "sll.d        %[c_range],    %[c_range],     %[tmp2]       \n\t"    \
+    "sll.d        %[c_low],      %[c_low],       %[tmp2]       \n\t"    \
+                                                                        \
+    "and          %[tmp1],       %[c_low],       %[cabac_mask] \n\t"    \
+    "bnez         %[tmp1],       1f                            \n\t"    \
+    "ld.hu        %[tmp1],       %[c_bytestream], 0x0          \n\t"    \
+    "ctz.d        %[tmp0],       %[c_low]                      \n\t"    \
+    "addi.d       %[tmp2],       %[tmp0],        -16           \n\t"    \
+    "revb.2h      %[tmp0],       %[tmp1]                       \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "sub.d        %[tmp0],       %[tmp0],        %[cabac_mask] \n\t"    \
+    "sll.d        %[tmp0],       %[tmp0],        %[tmp2]       \n\t"    \
+    "add.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+    "addi.d       %[c_bytestream], %[c_bytestream],     0x02   \n\t"    \
+    "1:                                                        \n\t"    \
+
+#define GET_CABAC_LOONGARCH                                             \
+    "ld.bu        %[bit],        %[state],       0x0           \n\t"    \
+    "andi         %[tmp0],       %[c_range],     0xC0          \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[tables]     \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[bit]        \n\t"    \
+    /* tmp1: RangeLPS */                                                \
+    "ld.bu        %[tmp1],       %[tmp0],        %[lps_off]    \n\t"    \
+                                                                        \
+    "sub.d        %[c_range],    %[c_range],     %[tmp1]       \n\t"    \
+    "slli.d       %[tmp0],       %[c_range],     0x11          \n\t"    \
+    "bge          %[tmp0],       %[c_low],       1f            \n\t"    \
+    "move         %[c_range],    %[tmp1]                       \n\t"    \
+    "nor          %[bit],        %[bit],         %[bit]        \n\t"    \
+    "sub.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "1:                                                        \n\t"    \
+    /* tmp1: *state */                                                  \
+    "add.d        %[tmp0],       %[tables],      %[bit]        \n\t"    \
+    "ld.bu        %[tmp1],       %[tmp0],        %[mlps_off]   \n\t"    \
+    /* tmp2: lps_mask */                                                \
+    "add.d        %[tmp0],       %[tables],      %[c_range]    \n\t"    \
+    "ld.bu        %[tmp2],       %[tmp0],        %[norm_off]   \n\t"    \
+                                                                        \
+    "andi         %[bit],        %[bit],         0x01          \n\t"    \
+    "st.b         %[tmp1],       %[state],       0x0           \n\t"    \
+    "sll.d        %[c_range],    %[c_range],     %[tmp2]       \n\t"    \
+    "sll.d        %[c_low],      %[c_low],       %[tmp2]       \n\t"    \
+                                                                        \
+    "and          %[tmp1],       %[c_low],       %[cabac_mask] \n\t"    \
+    "bnez         %[tmp1],       1f                            \n\t"    \
+    "ld.hu        %[tmp1],       %[c_bytestream], 0x0          \n\t"    \
+    "ctz.d        %[tmp0],       %[c_low]                      \n\t"    \
+    "addi.d       %[tmp2],       %[tmp0],        -16           \n\t"    \
+    "revb.2h      %[tmp0],       %[tmp1]                       \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "sub.d        %[tmp0],       %[tmp0],        %[cabac_mask] \n\t"    \
+    "sll.d        %[tmp0],       %[tmp0],        %[tmp2]       \n\t"    \
+                                                                        \
+    "add.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "slt      %[tmp0],  %[c_bytestream],  %[c_bytestream_end]  \n\t"    \
+    "add.d    %[c_bytestream], %[c_bytestream],     %[tmp0]    \n\t"    \
+    "add.d    %[c_bytestream], %[c_bytestream],     %[tmp0]    \n\t"    \
+    "1:                                                        \n\t"    \
+
+#define get_cabac_inline get_cabac_inline_loongarch
+static av_always_inline
+int get_cabac_inline_loongarch(CABACContext *c, uint8_t * const state)
+{
+    int64_t tmp0, tmp1, tmp2, bit;
+
+    __asm__ volatile (
+#if UNCHECKED_BITSTREAM_READER
+        GET_CABAC_LOONGARCH_UNCBSR
+#else
+        GET_CABAC_LOONGARCH
+#endif
+    : [bit]"=&r"(bit), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2),
+      [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+      [c_bytestream]"+&r"(c->bytestream)
+    : [state]"r"(state), [tables]"r"(ff_h264_cabac_tables),
+#if !UNCHECKED_BITSTREAM_READER
+      [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+      [lps_off]"i"(H264_LPS_RANGE_OFFSET),
+      [mlps_off]"i"(H264_MLPS_STATE_OFFSET + 128),
+      [norm_off]"i"(H264_NORM_SHIFT_OFFSET),
+      [cabac_mask]"r"(CABAC_MASK)
+    : "memory"
+    );
+
+    return bit;
+}
+
+#define get_cabac_bypass get_cabac_bypass_loongarch
+static av_always_inline int get_cabac_bypass_loongarch(CABACContext *c)
+{
+    int64_t tmp0, tmp1, tmp2;
+    int res = 0;
+    __asm__ volatile(
+        "slli.d     %[c_low],        %[c_low],        0x01                \n\t"
+        "and        %[tmp0],         %[c_low],        %[cabac_mask]       \n\t"
+        "bnez       %[tmp0],         1f                                   \n\t"
+        "ld.hu      %[tmp1],         %[c_bytestream], 0x0                 \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "addi.d     %[c_bytestream], %[c_bytestream], 0x02                \n\t"
+#else
+        "slt        %[tmp0],         %[c_bytestream], %[c_bytestream_end] \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+#endif
+        "revb.2h    %[tmp1],         %[tmp1]                              \n\t"
+        "slli.d     %[tmp1],         %[tmp1],         0x01                \n\t"
+        "sub.d      %[tmp1],         %[tmp1],         %[cabac_mask]       \n\t"
+        "add.d      %[c_low],        %[c_low],        %[tmp1]             \n\t"
+        "1:                                                               \n\t"
+        "slli.d     %[tmp1],         %[c_range],      0x11                \n\t"
+        "slt        %[tmp0],         %[c_low],        %[tmp1]             \n\t"
+        "sub.d      %[tmp1],         %[c_low],        %[tmp1]             \n\t"
+        "masknez    %[tmp2],         %[one],          %[tmp0]             \n\t"
+        "maskeqz    %[res],          %[res],          %[tmp0]             \n\t"
+        "or         %[res],          %[res],          %[tmp2]             \n\t"
+        "masknez    %[tmp2],         %[tmp1],         %[tmp0]             \n\t"
+        "maskeqz    %[c_low],        %[c_low],        %[tmp0]             \n\t"
+        "or         %[c_low],        %[c_low],        %[tmp2]             \n\t"
+        : [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2),
+          [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+          [c_bytestream]"+&r"(c->bytestream), [res]"+&r"(res)
+        : [cabac_mask]"r"(CABAC_MASK),
+#if !UNCHECKED_BITSTREAM_READER
+          [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+          [one]"r"(0x01)
+        : "memory"
+    );
+    return res;
+}
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_loongarch
+static av_always_inline
+int get_cabac_bypass_sign_loongarch(CABACContext *c, int val)
+{
+    int64_t tmp0, tmp1;
+    int res = val;
+    __asm__ volatile(
+        "slli.d     %[c_low],        %[c_low],        0x01                \n\t"
+        "and        %[tmp0],         %[c_low],        %[cabac_mask]       \n\t"
+        "bnez       %[tmp0],         1f                                   \n\t"
+        "ld.hu      %[tmp1],         %[c_bytestream], 0x0                 \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "addi.d     %[c_bytestream], %[c_bytestream], 0x02                \n\t"
+#else
+        "slt        %[tmp0],         %[c_bytestream], %[c_bytestream_end] \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+#endif
+        "revb.2h    %[tmp1],         %[tmp1]                              \n\t"
+        "slli.d     %[tmp1],         %[tmp1],         0x01                \n\t"
+        "sub.d      %[tmp1],         %[tmp1],         %[cabac_mask]       \n\t"
+        "add.d      %[c_low],        %[c_low],        %[tmp1]             \n\t"
+        "1:                                                               \n\t"
+        "slli.d     %[tmp1],         %[c_range],      0x11                \n\t"
+        "slt        %[tmp0],         %[c_low],        %[tmp1]             \n\t"
+        "sub.d      %[tmp1],         %[c_low],        %[tmp1]             \n\t"
+        "masknez    %[tmp1],         %[tmp1],         %[tmp0]             \n\t"
+        "maskeqz    %[c_low],        %[c_low],        %[tmp0]             \n\t"
+        "or         %[c_low],        %[c_low],        %[tmp1]             \n\t"
+        "sub.d      %[tmp1],         %[zero],         %[res]              \n\t"
+        "maskeqz    %[tmp1],         %[tmp1],         %[tmp0]             \n\t"
+        "masknez    %[res],          %[res],          %[tmp0]             \n\t"
+        "or         %[res],          %[res],          %[tmp1]             \n\t"
+        : [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [res]"+&r"(res),
+          [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+          [c_bytestream]"+&r"(c->bytestream)
+        : [cabac_mask]"r"(CABAC_MASK),
+#if !UNCHECKED_BITSTREAM_READER
+          [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+          [zero]"r"(0x0)
+        : "memory"
+    );
+
+    return res;
+}
+#endif /* AVCODEC_LOONGARCH_CABAC_H */
@@ -0,0 +1,140 @@
+/*
+ * Loongson  optimized cabac
+ *
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/cabac.h"
+#include "cabac.h"
+
+#define decode_significance decode_significance_loongarch
+static int decode_significance_loongarch(CABACContext *c, int max_coeff,
+    uint8_t *significant_coeff_ctx_base, int *index, int64_t last_off)
+{
+    void *end = significant_coeff_ctx_base + max_coeff - 1;
+    int64_t minusstart = -(int64_t)significant_coeff_ctx_base;
+    int64_t minusindex = 4 - (int64_t)index;
+    int64_t bit, tmp0, tmp1, tmp2, one = 1;
+    uint8_t *state = significant_coeff_ctx_base;
+
+    __asm__ volatile(
+    "3:"
+#if UNCHECKED_BITSTREAM_READER
+    GET_CABAC_LOONGARCH_UNCBSR
+#else
+    GET_CABAC_LOONGARCH
+#endif
+    "blt     %[bit],          %[one],            4f               \n\t"
+    "add.d   %[state],        %[state],          %[last_off]      \n\t"
+#if UNCHECKED_BITSTREAM_READER
+    GET_CABAC_LOONGARCH_UNCBSR
+#else
+    GET_CABAC_LOONGARCH
+#endif
+    "sub.d   %[state],        %[state],          %[last_off]      \n\t"
+    "add.d   %[tmp0],         %[state],          %[minusstart]    \n\t"
+    "st.w    %[tmp0],         %[index],          0                \n\t"
+    "bge     %[bit],          %[one],            5f               \n\t"
+    "addi.d  %[index],        %[index],          4                \n\t"
+    "4:                                                           \n\t"
+    "addi.d  %[state],        %[state],          1                \n\t"
+    "blt     %[state],        %[end],            3b               \n\t"
+    "add.d   %[tmp0],         %[state],          %[minusstart]    \n\t"
+    "st.w    %[tmp0],         %[index],          0                \n\t"
+    "5:                                                           \n\t"
+    "add.d   %[tmp0],         %[index],          %[minusindex]    \n\t"
+    "srli.d  %[tmp0],         %[tmp0],           2                \n\t"
+    : [bit]"=&r"(bit), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2),
+      [c_range]"+&r"(c->range), [c_low]"+&r"(c->low), [state]"+&r"(state),
+      [c_bytestream]"+&r"(c->bytestream), [index]"+&r"(index)
+    : [tables]"r"(ff_h264_cabac_tables), [end]"r"(end), [one]"r"(one),
+      [minusstart]"r"(minusstart), [minusindex]"r"(minusindex),
+      [last_off]"r"(last_off),
+#if !UNCHECKED_BITSTREAM_READER
+      [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+      [lps_off]"i"(H264_LPS_RANGE_OFFSET),
+      [mlps_off]"i"(H264_MLPS_STATE_OFFSET + 128),
+      [norm_off]"i"(H264_NORM_SHIFT_OFFSET),
+      [cabac_mask]"r"(CABAC_MASK)
+    : "memory"
+    );
+
+    return (int)tmp0;
+}
+
+#define decode_significance_8x8 decode_significance_8x8_loongarch
+static int decode_significance_8x8_loongarch(
+    CABACContext *c, uint8_t *significant_coeff_ctx_base,
+    int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off)
+{
+    int64_t minusindex = 4 - (int64_t)index;
+    int64_t bit, tmp0, tmp1, tmp2, one = 1, end =  63, last = 0;
+    uint8_t *state = 0;
+    int64_t flag_offset = H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
+
+    __asm__ volatile(
+    "3:                                                              \n\t"
+    "ldx.bu   %[tmp0],     %[sig_off],       %[last]                 \n\t"
+    "add.d    %[state],    %[tmp0], %[significant_coeff_ctx_base]    \n\t"
+#if UNCHECKED_BITSTREAM_READER
+    GET_CABAC_LOONGARCH_UNCBSR
+#else
+    GET_CABAC_LOONGARCH
+#endif
+    "blt      %[bit],      %[one],           4f                      \n\t"
+    "add.d    %[tmp0],     %[tables],        %[flag_offset]          \n\t"
+    "ldx.bu   %[tmp1],     %[tmp0],          %[last]                 \n\t"
+    "add.d    %[state],    %[tmp1],    %[last_coeff_ctx_base]        \n\t"
+#if UNCHECKED_BITSTREAM_READER
+    GET_CABAC_LOONGARCH_UNCBSR
+#else
+    GET_CABAC_LOONGARCH
+#endif
+    "st.w    %[last],      %[index],         0                       \n\t"
+    "bge     %[bit],       %[one],           5f                      \n\t"
+    "addi.d  %[index],     %[index],         4                       \n\t"
+    "4:                                                              \n\t"
+    "addi.d  %[last],      %[last],          1                       \n\t"
+    "blt     %[last],      %[end],           3b                      \n\t"
+    "st.w    %[last],      %[index],         0                       \n\t"
+    "5:                                                              \n\t"
+    "add.d   %[tmp0],      %[index],         %[minusindex]           \n\t"
+    "srli.d  %[tmp0],      %[tmp0],          2                       \n\t"
+    : [bit]"=&r"(bit), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1),
+      [tmp2]"=&r"(tmp2), [c_range]"+&r"(c->range),
+      [c_low]"+&r"(c->low), [state]"+&r"(state), [last]"+&r"(last),
+      [c_bytestream]"+&r"(c->bytestream), [index]"+&r"(index)
+    : [tables]"r"(ff_h264_cabac_tables), [end]"r"(end),
+      [one]"r"(one), [minusindex]"r"(minusindex),
+      [last_coeff_ctx_base]"r"(last_coeff_ctx_base),
+      [flag_offset]"r"(flag_offset),
+#if !UNCHECKED_BITSTREAM_READER
+      [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+      [lps_off]"i"(H264_LPS_RANGE_OFFSET), [sig_off]"r"(sig_off),
+      [mlps_off]"i"(H264_MLPS_STATE_OFFSET + 128),
+      [norm_off]"i"(H264_NORM_SHIFT_OFFSET),
+      [cabac_mask]"r"(CABAC_MASK),
+      [significant_coeff_ctx_base]"r"(significant_coeff_ctx_base)
+    );
+
+    return (int)tmp0;
+}
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h264dsp_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv, dir, \
+                                                 d_idx, mask_dir)           \
+do {                                                                        \
+    int b_idx = 0; \
+    int step_x4 = step << 2; \
+    int d_idx_12 = d_idx + 12; \
+    int d_idx_52 = d_idx + 52; \
+    int d_idx_x4 = d_idx << 2; \
+    int d_idx_x4_48 = d_idx_x4 + 48; \
+    int dir_x32  = dir * 32; \
+    uint8_t *ref_t = (uint8_t*)ref; \
+    uint8_t *mv_t  = (uint8_t*)mv; \
+    uint8_t *nnz_t = (uint8_t*)nnz; \
+    uint8_t *bS_t  = (uint8_t*)bS; \
+    mask_mv <<= 3; \
+    for (; b_idx < edges; b_idx += step) { \
+        out &= mask_dir; \
+        if (!(mask_mv & b_idx)) { \
+            if (bidir) { \
+                ref2 = __lasx_xvldx(ref_t, d_idx_12); \
+                ref3 = __lasx_xvldx(ref_t, d_idx_52); \
+                ref0 = __lasx_xvld(ref_t, 12); \
+                ref1 = __lasx_xvld(ref_t, 52); \
+                ref2 = __lasx_xvilvl_w(ref3, ref2); \
+                ref0 = __lasx_xvilvl_w(ref0, ref0); \
+                ref1 = __lasx_xvilvl_w(ref1, ref1); \
+                ref3 = __lasx_xvshuf4i_w(ref2, 0xB1); \
+                ref0 = __lasx_xvsub_b(ref0, ref2); \
+                ref1 = __lasx_xvsub_b(ref1, ref3); \
+                ref0 = __lasx_xvor_v(ref0, ref1); \
+\
+                tmp2 = __lasx_xvldx(mv_t, d_idx_x4_48);   \
+                tmp3 = __lasx_xvld(mv_t, 48); \
+                tmp4 = __lasx_xvld(mv_t, 208); \
+                tmp5 = __lasx_xvld(mv_t + d_idx_x4, 208); \
+                DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp2, 0x20, tmp5, tmp5, \
+                          0x20, tmp2, tmp5); \
+                tmp3 =  __lasx_xvpermi_q(tmp4, tmp3, 0x20); \
+                tmp2 = __lasx_xvsub_h(tmp2, tmp3); \
+                tmp5 = __lasx_xvsub_h(tmp5, tmp3); \
+                DUP2_ARG2(__lasx_xvsat_h, tmp2, 7, tmp5, 7, tmp2, tmp5); \
+                tmp0 = __lasx_xvpickev_b(tmp5, tmp2); \
+                tmp0 = __lasx_xvpermi_d(tmp0, 0xd8); \
+                tmp0 = __lasx_xvadd_b(tmp0, cnst_1); \
+                tmp0 = __lasx_xvssub_bu(tmp0, cnst_0); \
+                tmp0 = __lasx_xvsat_h(tmp0, 7); \
+                tmp0 = __lasx_xvpickev_b(tmp0, tmp0); \
+                tmp0 = __lasx_xvpermi_d(tmp0, 0xd8); \
+                tmp1 = __lasx_xvpickod_d(tmp0, tmp0); \
+                out = __lasx_xvor_v(ref0, tmp0); \
+                tmp1 = __lasx_xvshuf4i_w(tmp1, 0xB1); \
+                out = __lasx_xvor_v(out, tmp1); \
+                tmp0 = __lasx_xvshuf4i_w(out, 0xB1); \
+                out = __lasx_xvmin_bu(out, tmp0); \
+            } else { \
+                ref0 = __lasx_xvldx(ref_t, d_idx_12); \
+                ref3 = __lasx_xvld(ref_t, 12); \
+                tmp2 = __lasx_xvldx(mv_t, d_idx_x4_48); \
+                tmp3 = __lasx_xvld(mv_t, 48); \
+                tmp4 = __lasx_xvsub_h(tmp3, tmp2); \
+                tmp1 = __lasx_xvsat_h(tmp4, 7); \
+                tmp1 = __lasx_xvpickev_b(tmp1, tmp1); \
+                tmp1 = __lasx_xvadd_b(tmp1, cnst_1); \
+                out = __lasx_xvssub_bu(tmp1, cnst_0); \
+                out = __lasx_xvsat_h(out, 7); \
+                out = __lasx_xvpickev_b(out, out); \
+                ref0 = __lasx_xvsub_b(ref3, ref0); \
+                out = __lasx_xvor_v(out, ref0); \
+            } \
+        } \
+        tmp0 = __lasx_xvld(nnz_t, 12); \
+        tmp1 = __lasx_xvldx(nnz_t, d_idx_12); \
+        tmp0 = __lasx_xvor_v(tmp0, tmp1); \
+        tmp0 = __lasx_xvmin_bu(tmp0, cnst_2); \
+        out  = __lasx_xvmin_bu(out, cnst_2); \
+        tmp0 = __lasx_xvslli_h(tmp0, 1); \
+        tmp0 = __lasx_xvmax_bu(out, tmp0); \
+        tmp0 = __lasx_vext2xv_hu_bu(tmp0); \
+        __lasx_xvstelm_d(tmp0, bS_t + dir_x32, 0, 0); \
+        ref_t += step; \
+        mv_t  += step_x4; \
+        nnz_t += step; \
+        bS_t  += step; \
+    } \
+} while(0)
+
+void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
+                                       int8_t ref[2][40], int16_t mv[2][40][2],
+                                       int bidir, int edges, int step,
+                                       int mask_mv0, int mask_mv1, int field)
+{
+    __m256i out;
+    __m256i ref0, ref1, ref2, ref3;
+    __m256i tmp0, tmp1;
+    __m256i tmp2, tmp3, tmp4, tmp5;
+    __m256i cnst_0, cnst_1, cnst_2;
+    __m256i zero = __lasx_xvldi(0);
+    __m256i one  = __lasx_xvnor_v(zero, zero);
+    int64_t cnst3 = 0x0206020602060206, cnst4 = 0x0103010301030103;
+    if (field) {
+        cnst_0 = __lasx_xvreplgr2vr_d(cnst3);
+        cnst_1 = __lasx_xvreplgr2vr_d(cnst4);
+        cnst_2 = __lasx_xvldi(0x01);
+    } else {
+        DUP2_ARG1(__lasx_xvldi, 0x06, 0x03, cnst_0, cnst_1);
+        cnst_2 = __lasx_xvldi(0x01);
+    }
+    step  <<= 3;
+    edges <<= 3;
+
+    H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv1,
+                                             1, -8, zero);
+    H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(32, 8, mask_mv0, 0, -1, one);
+
+    DUP2_ARG2(__lasx_xvld, (int8_t*)bS, 0, (int8_t*)bS, 16, tmp0, tmp1);
+    DUP2_ARG2(__lasx_xvilvh_d, tmp0, tmp0, tmp1, tmp1, tmp2, tmp3);
+    LASX_TRANSPOSE4x4_H(tmp0, tmp2, tmp1, tmp3, tmp2, tmp3, tmp4, tmp5);
+    __lasx_xvstelm_d(tmp2, (int8_t*)bS, 0, 0);
+    __lasx_xvstelm_d(tmp3, (int8_t*)bS + 8, 0, 0);
+    __lasx_xvstelm_d(tmp4, (int8_t*)bS + 16, 0, 0);
+    __lasx_xvstelm_d(tmp5, (int8_t*)bS + 24, 0, 0);
+}
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/h264pred.h"
+#include "h264_intrapred_loongarch.h"
+
+av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
+                                         const int bit_depth,
+                                         const int chroma_format_idc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (bit_depth == 8) {
+        if (have_lsx(cpu_flags)) {
+            if (chroma_format_idc <= 1) {
+            }
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+            } else {
+                if (chroma_format_idc <= 1) {
+                }
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lsx;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lsx;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lsx;
+                }
+            }
+        }
+        if (have_lasx(cpu_flags)) {
+            if (chroma_format_idc <= 1) {
+            }
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+            } else {
+                if (chroma_format_idc <= 1) {
+                }
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lasx;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lasx;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lasx;
+                }
+            }
+        }
+    }
+}
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
+
+#include "libavcodec/avcodec.h"
+
+void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride);
+
+void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
@@ -0,0 +1,966 @@
+/*
+ * Loongson LSX/LASX optimized h264chroma
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                   int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lsx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOP_D
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.LOOP_D:
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vilvl.b          vr11,    vr6,    vr5
+    vilvl.b          vr12,    vr8,    vr7
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vilvl.b          vr11,    vr8,    vr7
+    vilvl.b          vr12,    vr6,    vr5
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vilvl.b          vr11,    vr6,    vr5
+    vilvl.b          vr12,    vr8,    vr7
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vilvl.b          vr11,    vr8,    vr7
+    vilvl.b          vr12,    vr6,    vr5
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_D
+    b                .ENDLOOP
+.ENDLOOP_D:
+
+    bge              zero,    t0,     .ENDLOOP_E
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.LOOP_E:
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_E
+    b                .ENDLOOP
+.ENDLOOP_E:
+
+    move             t1,      a3
+.LOOP:
+    vld              vr5,     a1,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     a2
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t2
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t3
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP
+.ENDLOOP:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                   int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lsx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOPD
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.LOOPD:
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr6,    vr5
+    vilvl.b          vr13,    vr8,    vr7
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr8,    vr7
+    vilvl.b          vr13,    vr6,    vr5
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr6,    vr5
+    vilvl.b          vr13,    vr8,    vr7
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr8,    vr7
+    vilvl.b          vr13,    vr6,    vr5
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPD
+    b                .ENDLOOPELSE
+.ENDLOOPD:
+
+    bge              zero,    t0,     .ENDLOOPE
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.LOOPE:
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPE
+    b                .ENDLOOPELSE
+.ENDLOOPE:
+
+    move             t1,      a3
+.LOOPELSE:
+    vld              vr5,     a1,     0
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     a2
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t2
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t3
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPELSE
+.ENDLOOPELSE:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                   int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lsx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    slli.d           t8,      a2,     1
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+
+    bge              zero,    t6,     .ENDPUT_D
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.PUT_D:
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr11,    a1,     0
+    vld              vr12,    a1,     1
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr7,     vr8,    vr7
+    vilvl.b          vr13,    vr12,   vr11
+    vilvl.d          vr5,     vr7,    vr5
+    vilvl.d          vr13,    vr13,   vr7
+    vmulwev.h.bu     vr14,    vr9,    vr5
+    vmaddwod.h.bu    vr14,    vr9,    vr5
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrarni.b.h      vr14,    vr14,   6
+    vstelm.w         vr14,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr14,    a0,     0,    1
+    add.d            a0,      a0,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_D
+    b                .ENDPUT
+.ENDPUT_D:
+
+    bge              zero,    t0,     .ENDPUT_E
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.PUT_E:
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    add.d            a1,      a1,     a2
+    vld              vr8,     a1,     0
+    vldx             vr9,     a1,     t7
+    vilvl.b          vr8,     vr9,    vr8
+    vilvl.d          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_E
+    b                .ENDPUT
+.ENDPUT_E:
+
+    move             t1,      a3
+.PUT:
+    vld              vr5,     a1,     0
+    vldx             vr8,     a1,     a2
+    vilvl.w          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t8
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT
+.ENDPUT:
+endfunc
+
+/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                    int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lasx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    xvreplgr2vr.b    xr0,     t3
+    xvreplgr2vr.b    xr1,     t4
+    xvreplgr2vr.b    xr2,     t5
+    xvreplgr2vr.b    xr3,     t6
+    xvreplgr2vr.b    xr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOP_DA
+    move             t1,      a3
+    xvilvl.b         xr9,     xr1,    xr0
+    xvilvl.b         xr10,    xr3,    xr2
+.LOOP_DA:
+    fld.d            f5,      a1,     0
+    fld.d            f6,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f7,      a1,     0
+    fld.d            f8,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f13,     a1,     0
+    fld.d            f14,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f15,     a1,     0
+    fld.d            f16,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f17,     a1,     0
+    fld.d            f18,     a1,     1
+    vilvl.b          vr11,    vr6,    vr5
+    vilvl.b          vr12,    vr8,    vr7
+    vilvl.b          vr14,    vr14,   vr13
+    vilvl.b          vr15,    vr16,   vr15
+    vilvl.b          vr16,    vr18,   vr17
+    xvpermi.q        xr11,    xr12,   0x02
+    xvpermi.q        xr12,    xr14,   0x02
+    xvpermi.q        xr14,    xr15,   0x02
+    xvpermi.q        xr15,    xr16,   0x02
+
+    xvmulwev.h.bu    xr19,    xr9,    xr11
+    xvmaddwod.h.bu   xr19,    xr9,    xr11
+    xvmulwev.h.bu    xr20,    xr10,   xr12
+    xvmaddwod.h.bu   xr20,    xr10,   xr12
+    xvadd.h          xr21,    xr19,   xr20
+    xvsrarni.b.h     xr21,    xr21,   6
+    vstelm.d         vr21,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr21,    a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr13,    xr9,    xr14
+    xvmaddwod.h.bu   xr13,    xr9,    xr14
+    xvmulwev.h.bu    xr14,    xr10,   xr15
+    xvmaddwod.h.bu   xr14,    xr10,   xr15
+    xvadd.h          xr13,    xr13,   xr14
+    xvsrarni.b.h     xr13,    xr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr13,    a0,     0,    2
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_DA
+    b                .ENDLOOPA
+.ENDLOOP_DA:
+
+    bge              zero,    t0,     .ENDLOOP_EA
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    xvilvl.b         xr7,     xr4,    xr0
+.LOOP_EA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f9,      a1,     0
+    fldx.d           f10,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f11,     a1,     0
+    fldx.d           f12,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f13,     a1,     0
+    fldx.d           f14,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr9,     vr10,   vr9
+    vilvl.b          vr11,    vr12,   vr11
+    vilvl.b          vr13,    vr14,   vr13
+    xvpermi.q        xr5,     xr9,    0x02
+    xvpermi.q        xr11,    xr13,   0x02
+
+    xvmulwev.h.bu    xr8,     xr7,    xr5
+    xvmaddwod.h.bu   xr8,     xr7,    xr5
+    xvmulwev.h.bu    xr6,     xr7,    xr11
+    xvmaddwod.h.bu   xr6,     xr7,    xr11
+    xvsrarni.b.h     xr8,     xr8,    6
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr8,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvsrarni.b.h     xr6,     xr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr6,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_EA
+    b                .ENDLOOPA
+.ENDLOOP_EA:
+
+    move             t1,      a3
+.LOOPA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     a2
+    fldx.d           f7,      a1,     t2
+    fldx.d           f8,      a1,     t3
+    vilvl.d          vr5,     vr6,    vr5
+    vilvl.d          vr7,     vr8,    vr7
+    xvpermi.q        xr5,     xr7,    0x02
+    xvmulwev.h.bu    xr6,     xr0,    xr5
+    xvmulwod.h.bu    xr7,     xr0,    xr5
+    xvilvl.h         xr8,     xr7,    xr6
+    xvilvh.h         xr9,     xr7,    xr6
+    xvsrarni.b.h     xr9,     xr8,    6
+    vstelm.d         vr9,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.d         vr9,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr9,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr9,     a0,     0,    3
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPA
+.ENDLOOPA:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                    int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lasx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    xvreplgr2vr.b    xr0,     t3
+    xvreplgr2vr.b    xr1,     t4
+    xvreplgr2vr.b    xr2,     t5
+    xvreplgr2vr.b    xr3,     t6
+    xvreplgr2vr.b    xr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOPDA
+    move             t1,      a3
+    xvilvl.b         xr9,     xr1,    xr0
+    xvilvl.b         xr10,    xr3,    xr2
+.LOOPDA:
+    fld.d            f5,      a1,     0
+    fld.d            f6,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f7,      a1,     0
+    fld.d            f8,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f11,     a1,     0
+    fld.d            f12,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f13,     a1,     0
+    fld.d            f14,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f15,     a1,     0
+    fld.d            f16,     a1,     1
+    fld.d            f17,     a0,     0
+    fldx.d           f18,     a0,     a2
+    fldx.d           f19,     a0,     t2
+    fldx.d           f20,     a0,     t3
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr7,     vr8,    vr7
+    vilvl.b          vr11,    vr12,   vr11
+    vilvl.b          vr13,    vr14,   vr13
+    vilvl.b          vr16,    vr16,   vr15
+    xvpermi.q        xr5,     xr7,    0x02
+    xvpermi.q        xr7,     xr11,   0x02
+    xvpermi.q        xr11,    xr13,   0x02
+    xvpermi.q        xr13,    xr16,   0x02
+    xvpermi.q        xr17,    xr18,   0x02
+    xvpermi.q        xr19,    xr20,   0x02
+
+    xvmulwev.h.bu    xr14,    xr9,    xr5
+    xvmaddwod.h.bu   xr14,    xr9,    xr5
+    xvmulwev.h.bu    xr15,    xr10,   xr7
+    xvmaddwod.h.bu   xr15,    xr10,   xr7
+    xvadd.h          xr14,    xr14,   xr15
+    xvsrari.h        xr14,    xr14,   6
+    xvsllwil.hu.bu   xr17,    xr17,   0
+    xvadd.h          xr20,    xr14,   xr17
+    xvsrarni.b.h     xr20,    xr20,   1
+    xvstelm.d        xr20,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr20,    a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr14,    xr9,    xr11
+    xvmaddwod.h.bu   xr14,    xr9,    xr11
+    xvmulwev.h.bu    xr15,    xr10,   xr13
+    xvmaddwod.h.bu   xr15,    xr10,   xr13
+    xvadd.h          xr14,    xr14,   xr15
+    xvsrari.h        xr14,    xr14,   6
+    xvsllwil.hu.bu   xr19,    xr19,   0
+    xvadd.h          xr21,    xr14,   xr19
+    xvsrarni.b.h     xr21,    xr21,   1
+    xvstelm.d        xr21,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr21,    a0,     0,    2
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPDA
+    b                .ENDLOOPELSEA
+.ENDLOOPDA:
+
+    bge              zero,    t0,     .ENDLOOPEA
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    xvilvl.b         xr7,     xr4,    xr0
+.LOOPEA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f8,      a1,     0
+    fldx.d           f9,      a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f10,     a1,     0
+    fldx.d           f11,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f12,     a1,     0
+    fldx.d           f13,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f14,     a0,     0
+    fldx.d           f15,     a0,     a2
+    fldx.d           f16,     a0,     t2
+    fldx.d           f17,     a0,     t3
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr8,     vr9,    vr8
+    vilvl.b          vr10,    vr11,   vr10
+    vilvl.b          vr12,    vr13,   vr12
+    xvpermi.q        xr5,     xr8,    0x02
+    xvpermi.q        xr10,    xr12,   0x02
+    xvpermi.q        xr14,    xr15,   0x02
+    xvpermi.q        xr16,    xr17,   0x02
+
+    xvmulwev.h.bu    xr6,     xr7,    xr5
+    xvmaddwod.h.bu   xr6,     xr7,    xr5
+    xvsrari.h        xr6,     xr6,    6
+    xvsllwil.hu.bu   xr14,    xr14,   0
+    xvadd.h          xr8,     xr6,    xr14
+    xvsrarni.b.h     xr8,     xr8,    1
+    xvstelm.d        xr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr8,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr6,     xr7,    xr10
+    xvmaddwod.h.bu   xr6,     xr7,    xr10
+    xvsrari.h        xr6,     xr6,    6
+    xvsllwil.hu.bu   xr16,    xr16,   0
+    xvadd.h          xr8,     xr6,    xr16
+    xvsrarni.b.h     xr8,     xr8,    1
+    xvstelm.d        xr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr8,     a0,     0,    2
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPEA
+    b                .ENDLOOPELSEA
+.ENDLOOPEA:
+
+    move             t1,      a3
+.LOOPELSEA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     a2
+    fldx.d           f7,      a1,     t2
+    fldx.d           f8,      a1,     t3
+    fld.d            f9,      a0,     0
+    fldx.d           f10,     a0,     a2
+    fldx.d           f11,     a0,     t2
+    fldx.d           f12,     a0,     t3
+    xvpermi.q        xr5,     xr6,    0x02
+    xvpermi.q        xr7,     xr8,    0x02
+    xvpermi.q        xr9,     xr10,   0x02
+    xvpermi.q        xr11,    xr12,   0x02
+
+    xvmulwev.h.bu    xr12,    xr0,    xr5
+    xvmulwod.h.bu    xr13,    xr0,    xr5
+    xvilvl.h         xr12,    xr13,   xr12
+    xvsrari.h        xr12,    xr12,   6
+    xvsllwil.hu.bu   xr9,     xr9,    0
+    xvadd.h          xr9,     xr12,   xr9
+    xvsrarni.b.h     xr9,     xr9,    1
+    xvstelm.d        xr9,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr9,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr12,    xr0,    xr7
+    xvmulwod.h.bu    xr13,    xr0,    xr7
+    xvilvl.h         xr12,    xr13,   xr12
+    xvsrari.h        xr12,    xr12,   6
+    xvsllwil.hu.bu   xr11,    xr11,   0
+    xvadd.h          xr13,    xr12,   xr11
+    xvsrarni.b.h     xr13,    xr13,   1
+    xvstelm.d        xr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr13,    a0,     0,    2
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPELSEA
+.ENDLOOPELSEA:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                    int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lasx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    slli.d           t8,      a2,     1
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+
+    bge              zero,    t6,     .ENDPUT_DA
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.PUT_DA:
+    fld.d            f5,      a1,     0
+    fld.d            f6,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f7,      a1,     0
+    fld.d            f8,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f11,     a1,     0
+    fld.d            f12,     a1,     1
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr7,     vr8,    vr7
+    vilvl.b          vr13,    vr12,   vr11
+    vilvl.d          vr5,     vr7,    vr5
+    vilvl.d          vr13,    vr13,   vr7
+    vmulwev.h.bu     vr14,    vr9,    vr5
+    vmaddwod.h.bu    vr14,    vr9,    vr5
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    xvadd.h          xr14,    xr14,   xr15
+    vsrarni.b.h      vr16,    vr14,   6
+    vstelm.w         vr16,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr16,    a0,     0,    1
+    add.d            a0,      a0,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_DA
+    b                .ENDPUTA
+.ENDPUT_DA:
+
+    bge              zero,    t0,     .ENDPUT_EA
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.PUT_EA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    add.d            a1,      a1,     a2
+    fld.d            f8,      a1,     0
+    fldx.d           f9,      a1,     t7
+    vilvl.b          vr8,     vr9,    vr8
+    vilvl.d          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_EA
+    b                .ENDPUTA
+.ENDPUT_EA:
+
+    move             t1,      a3
+.PUTA:
+    fld.d            f5,      a1,     0
+    fldx.d           f8,      a1,     a2
+    vilvl.w          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t8
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUTA
+.ENDPUTA:
+endfunc
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_loongarch.h"
+#include "libavutil/attributes.h"
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_lsx(cpu_flags)) {
+        if (bit_depth <= 8) {
+            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
+            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
+            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
+        }
+    }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth <= 8) {
+            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
+            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lasx;
+            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lasx;
+        }
+    }
+}
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+
+#include "libavcodec/h264.h"
+
+void ff_put_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
+                                long int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
+                                long int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_lsx(unsigned char *dst, const unsigned char *src,
+                                long int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_lasx(unsigned char *dst, const unsigned char *src,
+                                 long int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
+                                 long int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
+                                 long int stride, int h, int x, int y);
+
+#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "h264dsp_loongarch.h"
+
+av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
+                                       const int chroma_format_idc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (chroma_format_idc <= 1)
+            c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lsx;
+        if (bit_depth == 8) {
+            c->h264_idct_add     = ff_h264_idct_add_8_lsx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lsx;
+            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_lsx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
+
+            if (chroma_format_idc <= 1) {
+                c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
+                c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lsx;
+                c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lsx;
+            } else
+                c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
+            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
+            c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+
+            c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_lsx;
+            c->h264_add_pixels8_clear = ff_h264_add_pixels8_8_lsx;
+            c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_8_lsx;
+            c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lsx;
+            c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lsx;
+            c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lsx;
+            c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lsx;
+
+            c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lsx;
+
+            c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lsx;
+            c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lsx;
+            c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lsx;
+            c->weight_h264_pixels_tab[0]   = ff_weight_h264_pixels16_8_lsx;
+            c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lsx;
+            c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lsx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lsx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
+        }
+    }
+#if HAVE_LASX
+    if (have_lasx(cpu_flags)) {
+        if (chroma_format_idc <= 1)
+            c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lasx;
+        if (bit_depth == 8) {
+            c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_lasx;
+            c->h264_add_pixels8_clear = ff_h264_add_pixels8_8_lasx;
+            c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_8_lasx;
+            c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lasx;
+            c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lasx;
+            c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lasx;
+
+            /* Weighted MC */
+            c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_lasx;
+            c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lasx;
+
+            c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lasx;
+            c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
+
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lasx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
+            c->h264_idct8_add4   = ff_h264_idct8_add4_8_lasx;
+        }
+    }
+#endif // #if HAVE_LASX
+}
@@ -0,0 +1,782 @@
+/*
+ * Loongson LASX optimized h264dsp
+ *
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "h264dsp_loongarch.h"
+
+#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
+                         p1_or_q1_org_in, p2_or_q2_org_in,   \
+                         neg_tc_in, tc_in, p1_or_q1_out)     \
+{                                                            \
+    __m256i clip3, temp;                                     \
+                                                             \
+    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                \
+                             q0_or_p0_org_in);               \
+    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);              \
+    clip3 = __lasx_xvsub_h(clip3, temp);                     \
+    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);          \
+    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);        \
+    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);   \
+}
+
+#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,       \
+                     p1_or_q1_org_in, q1_or_p1_org_in,       \
+                     neg_threshold_in, threshold_in,         \
+                     p0_or_q0_out, q0_or_p0_out)             \
+{                                                            \
+    __m256i q0_sub_p0, p1_sub_q1, delta;                     \
+                                                             \
+    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,              \
+                               p0_or_q0_org_in);             \
+    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,              \
+                               q1_or_p1_org_in);             \
+    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);               \
+    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);              \
+    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);            \
+    delta = __lasx_xvsrai_h(delta, 3);                       \
+    delta = __lasx_xvclip_h(delta, neg_threshold_in,         \
+           threshold_in);                                    \
+    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);   \
+    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);   \
+                                                             \
+    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);         \
+    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);         \
+}
+
+void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
+                               int alpha_in, int beta_in, int8_t *tc)
+{
+    int img_width_2x = img_width << 1;
+    int img_width_4x = img_width << 2;
+    int img_width_8x = img_width << 3;
+    int img_width_3x = img_width_2x + img_width;
+    __m256i tmp_vec0, bs_vec;
+    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
+                      0x0101010100000000, 0x0303030302020202};
+
+    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
+    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
+    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
+    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
+    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
+
+    if (__lasx_xbnz_v(bs_vec)) {
+        uint8_t *src = data - 4;
+        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
+        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
+        __m256i is_bs_greater_than0;
+        __m256i zero = __lasx_xvldi(0);
+
+        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
+
+        {
+            uint8_t *src_tmp = src + img_width_8x;
+            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
+            __m256i row8, row9, row10, row11, row12, row13, row14, row15;
+
+            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+                      src, img_width_3x, row0, row1, row2, row3);
+            src += img_width_4x;
+            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+                      src, img_width_3x, row4, row5, row6, row7);
+            src -= img_width_4x;
+            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
+                      img_width_2x, src_tmp, img_width_3x,
+                      row8, row9, row10, row11);
+            src_tmp += img_width_4x;
+            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
+                      img_width_2x, src_tmp, img_width_3x,
+                      row12, row13, row14, row15);
+            src_tmp -= img_width_4x;
+
+            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
+                                 row7, row8, row9, row10, row11,
+                                 row12, row13, row14, row15,
+                                 p3_org, p2_org, p1_org, p0_org,
+                                 q0_org, q1_org, q2_org, q3_org);
+        }
+
+        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
+        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
+        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
+
+        alpha = __lasx_xvreplgr2vr_b(alpha_in);
+        beta  = __lasx_xvreplgr2vr_b(beta_in);
+
+        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
+        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
+        is_less_than       = is_less_than_alpha & is_less_than_beta;
+        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
+        is_less_than       = is_less_than_beta & is_less_than;
+        is_less_than       = is_less_than & is_bs_greater_than0;
+
+        if (__lasx_xbnz_v(is_less_than)) {
+            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
+            __m256i p2_asub_p0, q2_asub_q0;
+
+            neg_tc_h = __lasx_xvneg_b(tc_vec);
+            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
+            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
+            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
+            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
+            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
+
+            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
+            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            if (__lasx_xbnz_v(is_less_than_beta)) {
+                __m256i p2_org_h, p1_h;
+
+                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
+                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
+                                 neg_tc_h, tc_h, p1_h);
+                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
+                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
+                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
+                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
+                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
+            }
+
+            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
+            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
+
+            if (__lasx_xbnz_v(is_less_than_beta)) {
+                __m256i q2_org_h, q1_h;
+
+                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
+                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
+                                 neg_tc_h, tc_h, q1_h);
+                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
+                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
+                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
+
+                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
+                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
+            }
+
+            {
+                __m256i neg_thresh_h, p0_h, q0_h;
+
+                neg_thresh_h = __lasx_xvneg_b(tc_vec);
+                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
+                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
+
+                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
+                             neg_thresh_h, tc_h, p0_h, q0_h);
+                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
+                          p0_h, q0_h);
+                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
+                          p0_h, q0_h);
+                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
+                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
+            }
+
+            {
+                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
+                __m256i control = {0x0000000400000000, 0x0000000500000001,
+                                   0x0000000600000002, 0x0000000700000003};
+
+                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
+                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
+                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
+                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
+                          row0, row2);
+                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
+                          row1, row3);
+                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
+                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
+                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
+                          control, row7, control, row4, row5, row6, row7);
+                __lasx_xvstelm_d(row4, src, 0, 0);
+                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row4, src, 0, 2);
+                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row5, src, 0, 0);
+                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row5, src, 0, 2);
+                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row6, src, 0, 0);
+                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row6, src, 0, 2);
+                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row7, src, 0, 0);
+                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
+                src += img_width_2x;
+                __lasx_xvstelm_d(row7, src, 0, 2);
+                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
+            }
+        }
+    }
+}
+
+void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
+                                   int alpha_in, int beta_in, int8_t *tc)
+{
+    int img_width_2x = img_width << 1;
+    int img_width_3x = img_width + img_width_2x;
+    __m256i tmp_vec0, bs_vec;
+    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
+                      0x0101010100000000, 0x0303030302020202};
+
+    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
+    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
+    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
+    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
+    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
+
+    if (__lasx_xbnz_v(bs_vec)) {
+        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
+        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
+        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
+        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
+        __m256i is_bs_greater_than0;
+        __m256i zero = __lasx_xvldi(0);
+
+        alpha = __lasx_xvreplgr2vr_b(alpha_in);
+        beta  = __lasx_xvreplgr2vr_b(beta_in);
+
+        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
+                  p2_org, p1_org);
+        p0_org = __lasx_xvldx(data, -img_width);
+        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
+
+        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
+        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
+        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
+        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
+
+        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
+        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
+        is_less_than       = is_less_than_alpha & is_less_than_beta;
+        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
+        is_less_than       = is_less_than_beta & is_less_than;
+        is_less_than       = is_less_than & is_bs_greater_than0;
+
+        if (__lasx_xbnz_v(is_less_than)) {
+            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;
+
+            q2_org = __lasx_xvldx(data, img_width_2x);
+
+            neg_tc_h = __lasx_xvneg_b(tc_vec);
+            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
+            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
+            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
+            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
+            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
+
+            p2_asub_p0        = __lasx_xvabsd_bu(p2_org, p0_org);
+            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            if (__lasx_xbnz_v(is_less_than_beta)) {
+                __m256i p1_h, p2_org_h;
+
+                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
+                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
+                                 neg_tc_h, tc_h, p1_h);
+                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
+                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
+                p1_h   = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
+                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
+                __lasx_xvst(p1_org, data - img_width_2x, 0);
+
+                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
+                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
+            }
+
+            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
+            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
+
+            if (__lasx_xbnz_v(is_less_than_beta)) {
+                __m256i q1_h, q2_org_h;
+
+                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
+                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
+                                 neg_tc_h, tc_h, q1_h);
+                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
+                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
+                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
+                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
+                __lasx_xvst(q1_org, data + img_width, 0);
+
+                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
+                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
+
+            }
+
+            {
+                __m256i neg_thresh_h, p0_h, q0_h;
+
+                neg_thresh_h = __lasx_xvneg_b(tc_vec);
+                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
+                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
+
+                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
+                             neg_thresh_h, tc_h, p0_h, q0_h);
+                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
+                          p0_h, q0_h);
+                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0Xd8, q0_h, 0xd8,
+                          p0_h, q0_h);
+                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
+                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
+                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
+                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
+                __lasx_xvst(p0_org, data - img_width, 0);
+                __lasx_xvst(q0_org, data, 0);
+            }
+        }
+    }
+}
+
+#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
+                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
+                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
+                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
+{                                                                           \
+    __m256i threshold;                                                      \
+    __m256i const2, const3 = __lasx_xvldi(0);                               \
+                                                                            \
+    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
+    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
+    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
+    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
+                                                                            \
+    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
+    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
+    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
+    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
+                                                                            \
+    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
+    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
+                                                                            \
+    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
+    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
+    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
+    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
+    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
+}
+
+/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
+#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,             \
+                         p1_or_q1_org_in, p0_or_q0_out)                \
+{                                                                      \
+    __m256i const2 = __lasx_xvldi(0);                                  \
+    const2 = __lasx_xvaddi_hu(const2, 2);                              \
+    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);   \
+    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
+    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
+    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);              \
+}
+
+void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
+                                     int alpha_in, int beta_in)
+{
+    int img_width_2x = img_width << 1;
+    int img_width_4x = img_width << 2;
+    int img_width_3x = img_width_2x + img_width;
+    uint8_t *src = data - 4;
+    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
+    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
+    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    __m256i zero = __lasx_xvldi(0);
+
+    {
+        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
+        __m256i row8, row9, row10, row11, row12, row13, row14, row15;
+
+        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+                  src, img_width_3x, row0, row1, row2, row3);
+        src += img_width_4x;
+        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+                  src, img_width_3x, row4, row5, row6, row7);
+        src += img_width_4x;
+        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+                  src, img_width_3x, row8, row9, row10, row11);
+        src += img_width_4x;
+        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+                  src, img_width_3x, row12, row13, row14, row15);
+        src += img_width_4x;
+
+        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
+                             row4, row5, row6, row7,
+                             row8, row9, row10, row11,
+                             row12, row13, row14, row15,
+                             p3_org, p2_org, p1_org, p0_org,
+                             q0_org, q1_org, q2_org, q3_org);
+    }
+
+    alpha = __lasx_xvreplgr2vr_b(alpha_in);
+    beta  = __lasx_xvreplgr2vr_b(beta_in);
+    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
+    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
+    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
+
+    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
+    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
+    is_less_than       = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
+    is_less_than       = is_less_than_beta & is_less_than;
+    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);
+
+    if (__lasx_xbnz_v(is_less_than)) {
+        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
+        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
+        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
+
+        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
+        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
+                                                 less_alpha_shift2_add2);
+
+        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
+        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
+        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
+        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
+
+        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
+        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
+        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
+        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
+        is_less_than_beta        = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* combine and store */
+        if (__lasx_xbnz_v(is_less_than_beta)) {
+            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
+
+            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
+            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);
+
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
+                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
+
+            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
+            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
+            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
+            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
+            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
+            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
+            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
+        }
+
+        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
+        /* combine */
+        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
+        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
+        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
+
+        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
+        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
+        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
+        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
+        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* combine and store */
+        if (__lasx_xbnz_v(is_less_than_beta)) {
+            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
+
+            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
+            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);
+
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
+                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
+
+            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
+            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
+            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
+            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
+            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
+            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
+            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
+
+        }
+
+        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
+
+        /* combine */
+        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
+        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
+        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
+
+        /* transpose and store */
+        {
+            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
+            __m256i control = {0x0000000400000000, 0x0000000500000001,
+                               0x0000000600000002, 0x0000000700000003};
+
+            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
+                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
+                      p0_org, p1_org, p2_org, p3_org);
+            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
+                      row0, row2);
+            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
+                      row1, row3);
+            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
+            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
+            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
+                      control, row7, control, row4, row5, row6, row7);
+            src = data - 4;
+            __lasx_xvstelm_d(row4, src, 0, 0);
+            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row4, src, 0, 2);
+            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row5, src, 0, 0);
+            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row5, src, 0, 2);
+            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row6, src, 0, 0);
+            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row6, src, 0, 2);
+            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row7, src, 0, 0);
+            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
+            src += img_width_2x;
+            __lasx_xvstelm_d(row7, src, 0, 2);
+            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
+        }
+    }
+}
+
+void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
+                                     int alpha_in, int beta_in)
+{
+    int img_width_2x = img_width << 1;
+    int img_width_3x = img_width_2x + img_width;
+    uint8_t *src = data - img_width_2x;
+    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
+    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
+    __m256i p1_org, p0_org, q0_org, q1_org;
+    __m256i zero = __lasx_xvldi(0);
+
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
+              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
+    alpha = __lasx_xvreplgr2vr_b(alpha_in);
+    beta  = __lasx_xvreplgr2vr_b(beta_in);
+    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
+    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
+    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
+
+    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
+    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
+    is_less_than       = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
+    is_less_than       = is_less_than_beta & is_less_than;
+    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);
+
+    if (__lasx_xbnz_v(is_less_than)) {
+        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
+        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
+        __m256i p2_org = __lasx_xvldx(src, -img_width);
+        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
+        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
+        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
+        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
+                                                 less_alpha_shift2_add2);
+
+        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
+        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
+        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
+        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
+
+        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
+        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
+        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
+        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
+        is_less_than_beta        = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* combine and store */
+        if (__lasx_xbnz_v(is_less_than_beta)) {
+            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
+            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);
+
+            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
+            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);
+
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
+                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
+
+            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
+            p0_h =  __lasx_xvpermi_d(p0_h, 0xd8);
+            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
+            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
+            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
+            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
+            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
+
+            __lasx_xvst(p1_org, src, 0);
+            __lasx_xvst(p2_org, src - img_width, 0);
+        }
+
+        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
+        /* combine */
+        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
+        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
+        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
+        __lasx_xvst(p0_org, data - img_width, 0);
+
+        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
+        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
+        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
+        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
+        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* combine and store */
+        if (__lasx_xbnz_v(is_less_than_beta)) {
+            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
+            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);
+
+            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
+            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);
+
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
+                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
+
+            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
+            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
+            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
+            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
+            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
+            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
+            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
+
+            __lasx_xvst(q1_org, data + img_width, 0);
+            __lasx_xvst(q2_org, data + img_width_2x, 0);
+        }
+
+        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
+
+        /* combine */
+        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
+        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
+        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
+
+        __lasx_xvst(q0_org, data, 0);
+    }
+}
+
+void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
+{
+    __m256i src0, dst0, dst1, dst2, dst3, zero;
+    __m256i tmp0, tmp1;
+    uint8_t* _dst1 = _dst + stride;
+    uint8_t* _dst2 = _dst1 + stride;
+    uint8_t* _dst3 = _dst2 + stride;
+
+    src0 = __lasx_xvld(_src, 0);
+    dst0 = __lasx_xvldrepl_w(_dst, 0);
+    dst1 = __lasx_xvldrepl_w(_dst1, 0);
+    dst2 = __lasx_xvldrepl_w(_dst2, 0);
+    dst3 = __lasx_xvldrepl_w(_dst3, 0);
+    tmp0 = __lasx_xvilvl_w(dst1, dst0);
+    tmp1 = __lasx_xvilvl_w(dst3, dst2);
+    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
+    tmp0 = __lasx_vext2xv_hu_bu(dst0);
+    zero = __lasx_xvldi(0);
+    tmp1 = __lasx_xvadd_h(src0, tmp0);
+    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
+    __lasx_xvstelm_w(dst0, _dst, 0, 0);
+    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
+    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
+    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
+    __lasx_xvst(zero, _src, 0);
+}
+
+void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
+{
+    __m256i src0, src1, src2, src3;
+    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m256i tmp0, tmp1, tmp2, tmp3;
+    __m256i zero = __lasx_xvldi(0);
+    uint8_t *_dst1 = _dst + stride;
+    uint8_t *_dst2 = _dst1 + stride;
+    uint8_t *_dst3 = _dst2 + stride;
+    uint8_t *_dst4 = _dst3 + stride;
+    uint8_t *_dst5 = _dst4 + stride;
+    uint8_t *_dst6 = _dst5 + stride;
+    uint8_t *_dst7 = _dst6 + stride;
+
+    src0 = __lasx_xvld(_src, 0);
+    src1 = __lasx_xvld(_src, 32);
+    src2 = __lasx_xvld(_src, 64);
+    src3 = __lasx_xvld(_src, 96);
+    dst0 = __lasx_xvldrepl_d(_dst, 0);
+    dst1 = __lasx_xvldrepl_d(_dst1, 0);
+    dst2 = __lasx_xvldrepl_d(_dst2, 0);
+    dst3 = __lasx_xvldrepl_d(_dst3, 0);
+    dst4 = __lasx_xvldrepl_d(_dst4, 0);
+    dst5 = __lasx_xvldrepl_d(_dst5, 0);
+    dst6 = __lasx_xvldrepl_d(_dst6, 0);
+    dst7 = __lasx_xvldrepl_d(_dst7, 0);
+    tmp0 = __lasx_xvilvl_d(dst1, dst0);
+    tmp1 = __lasx_xvilvl_d(dst3, dst2);
+    tmp2 = __lasx_xvilvl_d(dst5, dst4);
+    tmp3 = __lasx_xvilvl_d(dst7, dst6);
+    dst0 = __lasx_vext2xv_hu_bu(tmp0);
+    dst1 = __lasx_vext2xv_hu_bu(tmp1);
+    dst1 = __lasx_vext2xv_hu_bu(tmp1);
+    dst2 = __lasx_vext2xv_hu_bu(tmp2);
+    dst3 = __lasx_vext2xv_hu_bu(tmp3);
+    tmp0 = __lasx_xvadd_h(src0, dst0);
+    tmp1 = __lasx_xvadd_h(src1, dst1);
+    tmp2 = __lasx_xvadd_h(src2, dst2);
+    tmp3 = __lasx_xvadd_h(src3, dst3);
+    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
+    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
+    __lasx_xvst(zero, _src, 0);
+    __lasx_xvst(zero, _src, 32);
+    __lasx_xvst(zero, _src, 64);
+    __lasx_xvst(zero, _src, 96);
+    __lasx_xvstelm_d(dst1, _dst, 0, 0);
+    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
+    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
+    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
+    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
+    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
+    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
+    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
+}
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
+
+#include "libavcodec/h264dec.h"
+#include "config.h"
+
+void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8]);
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                             int16_t *block, int32_t dst_stride,
+                             const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                                 int16_t *block, int32_t dst_stride,
+                                 const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                                    int16_t *block, int32_t dst_stride,
+                                    const uint8_t nzc[15 * 8]);
+
+void ff_h264_h_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                              int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                              int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                    int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                    int alpha, int beta);
+void ff_h264_h_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int alpha, int beta);
+void ff_biweight_h264_pixels16_8_lsx(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride, int height,
+                                     int log2_denom, int weight_dst,
+                                     int weight_src, int offset_in);
+void ff_biweight_h264_pixels8_8_lsx(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset);
+void ff_biweight_h264_pixels4_8_lsx(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset);
+void ff_weight_h264_pixels16_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                   int height, int log2_denom,
+                                   int weight_src, int offset_in);
+void ff_weight_h264_pixels8_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset);
+void ff_weight_h264_pixels4_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset);
+void ff_h264_add_pixels4_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_add_pixels8_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_loop_filter_strength_lsx(int16_t bS[2][4][4], uint8_t nnz[40],
+                                      int8_t ref[2][40], int16_t mv[2][40][2],
+                                      int bidir, int edges, int step,
+                                      int mask_mv0, int mask_mv1, int field);
+
+#if HAVE_LASX
+void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
+                               int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
+                               int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
+                                     int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
+                                     int alpha, int beta);
+void ff_biweight_h264_pixels16_8_lasx(unsigned char *dst, unsigned char *src,
+                                      long int stride, int height,
+                                      int log2_denom, int weight_dst,
+                                      int weight_src, int offset_in);
+void ff_biweight_h264_pixels8_8_lasx(unsigned char *dst, unsigned char *src,
+                                     long int stride, int height,
+                                     int log2_denom, int weight_dst,
+                                     int weight_src, int offset);
+void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
+                                    int height, int log2_denom,
+                                    int weight_src, int offset_in);
+void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
+                                   int height, int log2_denom,
+                                   int weight_src, int offset);
+void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
+
+void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
+                                  int32_t dst_stride);
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8]);
+void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
+                                       int8_t ref[2][40], int16_t mv[2][40][2],
+                                       int bidir, int edges, int step,
+                                       int mask_mv0, int mask_mv1, int field);
+#endif // #if HAVE_LASX
+
+#endif  // #ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
@@ -0,0 +1,658 @@
+/*
+ * Loongson LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_idct_add_8_lsx
+    fld.d         f0,     a1,    0
+    fld.d         f1,     a1,    8
+    fld.d         f2,     a1,    16
+    fld.d         f3,     a1,    24
+    vxor.v        vr7,    vr7,   vr7
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    vst           vr7,    a1,    0
+    vst           vr7,    a1,    16
+
+    vadd.h        vr4,    vr0,   vr2
+    vsub.h        vr5,    vr0,   vr2
+    vsrai.h       vr6,    vr1,   1
+    vsrai.h       vr7,    vr3,   1
+    vsub.h        vr6,    vr6,   vr3
+    vadd.h        vr7,    vr1,   vr7
+    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7,  vr0, vr1, vr2, vr3
+    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3,  vr0, vr1, vr2, vr3,  vr4, vr5
+    vadd.h        vr4,    vr0,   vr2
+    vsub.h        vr5,    vr0,   vr2
+    vsrai.h       vr6,    vr1,   1
+    vsrai.h       vr7,    vr3,   1
+    vsub.h        vr6,    vr6,   vr3
+    vadd.h        vr7,    vr1,   vr7
+    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7,  vr0, vr1, vr2, vr3
+
+    fld.s         f4,     a0,    0
+    fldx.s        f5,     a0,    a2
+    fldx.s        f6,     a0,    t2
+    fldx.s        f7,     a0,    t3
+
+    vsrari.h      vr0,    vr0,   6
+    vsrari.h      vr1,    vr1,   6
+    vsrari.h      vr2,    vr2,   6
+    vsrari.h      vr3,    vr3,   6
+
+    vsllwil.hu.bu vr4,    vr4,   0
+    vsllwil.hu.bu vr5,    vr5,   0
+    vsllwil.hu.bu vr6,    vr6,   0
+    vsllwil.hu.bu vr7,    vr7,   0
+    vadd.h        vr0,    vr0,   vr4
+    vadd.h        vr1,    vr1,   vr5
+    vadd.h        vr2,    vr2,   vr6
+    vadd.h        vr3,    vr3,   vr7
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    fst.s         f1,     a0,    0
+    fstx.s        f0,     a0,    a2
+    fstx.s        f3,     a0,    t2
+    fstx.s        f2,     a0,    t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lsx
+    ld.h          t0,     a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+    addi.w        t0,     t0,    32
+    st.h          t0,     a1,    0
+
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    16
+    vld           vr2,    a1,    32
+    vld           vr3,    a1,    48
+    vld           vr4,    a1,    64
+    vld           vr5,    a1,    80
+    vld           vr6,    a1,    96
+    vld           vr7,    a1,    112
+    vxor.v        vr8,    vr8,   vr8
+    vst           vr8,    a1,    0
+    vst           vr8,    a1,    16
+    vst           vr8,    a1,    32
+    vst           vr8,    a1,    48
+    vst           vr8,    a1,    64
+    vst           vr8,    a1,    80
+    vst           vr8,    a1,    96
+    vst           vr8,    a1,    112
+
+    vadd.h        vr18,   vr0,   vr4
+    vsub.h        vr19,   vr0,   vr4
+    vsrai.h       vr20,   vr2,   1
+    vsrai.h       vr21,   vr6,   1
+    vsub.h        vr20,   vr20,  vr6
+    vadd.h        vr21,   vr21,  vr2
+    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21,  vr10, vr12, vr14, vr16
+    vsrai.h       vr11,   vr7,   1
+    vsrai.h       vr13,   vr3,   1
+    vsrai.h       vr15,   vr5,   1
+    vsrai.h       vr17,   vr1,   1
+    vsub.h        vr11,   vr5,   vr11
+    vsub.h        vr13,   vr7,   vr13
+    vadd.h        vr15,   vr7,   vr15
+    vadd.h        vr17,   vr5,   vr17
+    vsub.h        vr11,   vr11,  vr7
+    vsub.h        vr13,   vr13,  vr3
+    vadd.h        vr15,   vr15,  vr5
+    vadd.h        vr17,   vr17,  vr1
+    vsub.h        vr11,   vr11,  vr3
+    vadd.h        vr13,   vr13,  vr1
+    vsub.h        vr15,   vr15,  vr1
+    vadd.h        vr17,   vr17,  vr3
+    vsrai.h       vr18,   vr11,  2
+    vsrai.h       vr19,   vr13,  2
+    vsrai.h       vr20,   vr15,  2
+    vsrai.h       vr21,   vr17,  2
+    vadd.h        vr11,   vr11,  vr21
+    vadd.h        vr13,   vr13,  vr20
+    vsub.h        vr15,   vr19,  vr15
+    vsub.h        vr17,   vr17,  vr18
+    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7
+
+    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+    vexth.w.h     vr20,   vr0
+    vexth.w.h     vr21,   vr1
+    vexth.w.h     vr22,   vr2
+    vexth.w.h     vr23,   vr3
+    vexth.w.h     vr8,    vr4
+    vexth.w.h     vr9,    vr5
+    vexth.w.h     vr18,   vr6
+    vexth.w.h     vr19,   vr7
+    vsllwil.w.h   vr0,    vr0,   0
+    vsllwil.w.h   vr1,    vr1,   0
+    vsllwil.w.h   vr2,    vr2,   0
+    vsllwil.w.h   vr3,    vr3,   0
+    vsllwil.w.h   vr4,    vr4,   0
+    vsllwil.w.h   vr5,    vr5,   0
+    vsllwil.w.h   vr6,    vr6,   0
+    vsllwil.w.h   vr7,    vr7,   0
+
+    vadd.w        vr11,   vr0,   vr4
+    vsub.w        vr13,   vr0,   vr4
+    vsrai.w       vr15,   vr2,   1
+    vsrai.w       vr17,   vr6,   1
+    vsub.w        vr15,   vr15,  vr6
+    vadd.w        vr17,   vr17,  vr2
+    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17,  vr10, vr12, vr14, vr16
+    vsrai.w       vr11,   vr7,   1
+    vsrai.w       vr13,   vr3,   1
+    vsrai.w       vr15,   vr5,   1
+    vsrai.w       vr17,   vr1,   1
+    vsub.w        vr11,   vr5,   vr11
+    vsub.w        vr13,   vr7,   vr13
+    vadd.w        vr15,   vr7,   vr15
+    vadd.w        vr17,   vr5,   vr17
+    vsub.w        vr11,   vr11,  vr7
+    vsub.w        vr13,   vr13,  vr3
+    vadd.w        vr15,   vr15,  vr5
+    vadd.w        vr17,   vr17,  vr1
+    vsub.w        vr11,   vr11,  vr3
+    vadd.w        vr13,   vr13,  vr1
+    vsub.w        vr15,   vr15,  vr1
+    vadd.w        vr17,   vr17,  vr3
+    vsrai.w       vr0,    vr11,  2
+    vsrai.w       vr1,    vr13,  2
+    vsrai.w       vr2,    vr15,  2
+    vsrai.w       vr3,    vr17,  2
+    vadd.w        vr11,   vr11,  vr3
+    vadd.w        vr13,   vr13,  vr2
+    vsub.w        vr15,   vr1,   vr15
+    vsub.w        vr17,   vr17,  vr0
+    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+                      vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
+
+    vadd.w        vr11,    vr20,  vr8
+    vsub.w        vr13,    vr20,  vr8
+    vsrai.w       vr15,    vr22,  1
+    vsrai.w       vr17,    vr18,  1
+    vsub.w        vr15,    vr15,  vr18
+    vadd.w        vr17,    vr17,  vr22
+    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17,  vr10, vr12, vr14, vr16
+    vsrai.w       vr11,   vr19,  1
+    vsrai.w       vr13,   vr23,  1
+    vsrai.w       vr15,   vr9,   1
+    vsrai.w       vr17,   vr21,  1
+    vsub.w        vr11,   vr9,   vr11
+    vsub.w        vr13,   vr19,  vr13
+    vadd.w        vr15,   vr19,  vr15
+    vadd.w        vr17,   vr9,   vr17
+    vsub.w        vr11,   vr11,  vr19
+    vsub.w        vr13,   vr13,  vr23
+    vadd.w        vr15,   vr15,  vr9
+    vadd.w        vr17,   vr17,  vr21
+    vsub.w        vr11,   vr11,  vr23
+    vadd.w        vr13,   vr13,  vr21
+    vsub.w        vr15,   vr15,  vr21
+    vadd.w        vr17,   vr17,  vr23
+    vsrai.w       vr20,   vr11,  2
+    vsrai.w       vr21,   vr13,  2
+    vsrai.w       vr22,   vr15,  2
+    vsrai.w       vr23,   vr17,  2
+    vadd.w        vr11,   vr11,  vr23
+    vadd.w        vr13,   vr13,  vr22
+    vsub.w        vr15,   vr21,  vr15
+    vsub.w        vr17,   vr17,  vr20
+    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+                      vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19
+
+    vld           vr10,   a0,    0
+    vldx          vr11,   a0,    a2
+    vldx          vr12,   a0,    t2
+    vldx          vr13,   a0,    t3
+    vldx          vr14,   a0,    t4
+    vldx          vr15,   a0,    t5
+    vldx          vr16,   a0,    t6
+    vldx          vr17,   a0,    t7
+    vsrani.h.w    vr20,   vr0,   6
+    vsrani.h.w    vr21,   vr1,   6
+    vsrani.h.w    vr22,   vr2,   6
+    vsrani.h.w    vr23,   vr3,   6
+    vsrani.h.w    vr8,    vr4,   6
+    vsrani.h.w    vr9,    vr5,   6
+    vsrani.h.w    vr18,   vr6,   6
+    vsrani.h.w    vr19,   vr7,   6
+    vsllwil.hu.bu vr10,   vr10,  0
+    vsllwil.hu.bu vr11,   vr11,  0
+    vsllwil.hu.bu vr12,   vr12,  0
+    vsllwil.hu.bu vr13,   vr13,  0
+    vsllwil.hu.bu vr14,   vr14,  0
+    vsllwil.hu.bu vr15,   vr15,  0
+    vsllwil.hu.bu vr16,   vr16,  0
+    vsllwil.hu.bu vr17,   vr17,  0
+
+    vadd.h        vr0,    vr20,  vr10
+    vadd.h        vr1,    vr21,  vr11
+    vadd.h        vr2,    vr22,  vr12
+    vadd.h        vr3,    vr23,  vr13
+    vadd.h        vr4,    vr8,   vr14
+    vadd.h        vr5,    vr9,   vr15
+    vadd.h        vr6,    vr18,  vr16
+    vadd.h        vr7,    vr19,  vr17
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lasx
+    ld.h          t0,     a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+    addi.w        t0,     t0,    32
+    st.h          t0,     a1,    0
+
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    16
+    vld           vr2,    a1,    32
+    vld           vr3,    a1,    48
+    vld           vr4,    a1,    64
+    vld           vr5,    a1,    80
+    vld           vr6,    a1,    96
+    vld           vr7,    a1,    112
+    xvxor.v       xr8,    xr8,   xr8
+    xvst          xr8,    a1,    0
+    xvst          xr8,    a1,    32
+    xvst          xr8,    a1,    64
+    xvst          xr8,    a1,    96
+
+    vadd.h        vr18,   vr0,   vr4
+    vsub.h        vr19,   vr0,   vr4
+    vsrai.h       vr20,   vr2,   1
+    vsrai.h       vr21,   vr6,   1
+    vsub.h        vr20,   vr20,  vr6
+    vadd.h        vr21,   vr21,  vr2
+    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21,  vr10, vr12, vr14, vr16
+    vsrai.h       vr11,   vr7,   1
+    vsrai.h       vr13,   vr3,   1
+    vsrai.h       vr15,   vr5,   1
+    vsrai.h       vr17,   vr1,   1
+    vsub.h        vr11,   vr5,   vr11
+    vsub.h        vr13,   vr7,   vr13
+    vadd.h        vr15,   vr7,   vr15
+    vadd.h        vr17,   vr5,   vr17
+    vsub.h        vr11,   vr11,  vr7
+    vsub.h        vr13,   vr13,  vr3
+    vadd.h        vr15,   vr15,  vr5
+    vadd.h        vr17,   vr17,  vr1
+    vsub.h        vr11,   vr11,  vr3
+    vadd.h        vr13,   vr13,  vr1
+    vsub.h        vr15,   vr15,  vr1
+    vadd.h        vr17,   vr17,  vr3
+    vsrai.h       vr18,   vr11,  2
+    vsrai.h       vr19,   vr13,  2
+    vsrai.h       vr20,   vr15,  2
+    vsrai.h       vr21,   vr17,  2
+    vadd.h        vr11,   vr11,  vr21
+    vadd.h        vr13,   vr13,  vr20
+    vsub.h        vr15,   vr19,  vr15
+    vsub.h        vr17,   vr17,  vr18
+    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7
+
+    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+    vext2xv.w.h   xr0,    xr0
+    vext2xv.w.h   xr1,    xr1
+    vext2xv.w.h   xr2,    xr2
+    vext2xv.w.h   xr3,    xr3
+    vext2xv.w.h   xr4,    xr4
+    vext2xv.w.h   xr5,    xr5
+    vext2xv.w.h   xr6,    xr6
+    vext2xv.w.h   xr7,    xr7
+
+    xvadd.w       xr11,   xr0,   xr4
+    xvsub.w       xr13,   xr0,   xr4
+    xvsrai.w      xr15,   xr2,   1
+    xvsrai.w      xr17,   xr6,   1
+    xvsub.w       xr15,   xr15,  xr6
+    xvadd.w       xr17,   xr17,  xr2
+    LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17,  xr10, xr12, xr14, xr16
+    xvsrai.w      xr11,   xr7,   1
+    xvsrai.w      xr13,   xr3,   1
+    xvsrai.w      xr15,   xr5,   1
+    xvsrai.w      xr17,   xr1,   1
+    xvsub.w       xr11,   xr5,   xr11
+    xvsub.w       xr13,   xr7,   xr13
+    xvadd.w       xr15,   xr7,   xr15
+    xvadd.w       xr17,   xr5,   xr17
+    xvsub.w       xr11,   xr11,  xr7
+    xvsub.w       xr13,   xr13,  xr3
+    xvadd.w       xr15,   xr15,  xr5
+    xvadd.w       xr17,   xr17,  xr1
+    xvsub.w       xr11,   xr11,  xr3
+    xvadd.w       xr13,   xr13,  xr1
+    xvsub.w       xr15,   xr15,  xr1
+    xvadd.w       xr17,   xr17,  xr3
+    xvsrai.w      xr0,    xr11,  2
+    xvsrai.w      xr1,    xr13,  2
+    xvsrai.w      xr2,    xr15,  2
+    xvsrai.w      xr3,    xr17,  2
+    xvadd.w       xr11,   xr11,  xr3
+    xvadd.w       xr13,   xr13,  xr2
+    xvsub.w       xr15,   xr1,   xr15
+    xvsub.w       xr17,   xr17,  xr0
+    LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
+                       xr0,  xr1,  xr2,  xr3,  xr4,  xr5,  xr6,  xr7
+
+    vld           vr10,   a0,    0
+    vldx          vr11,   a0,    a2
+    vldx          vr12,   a0,    t2
+    vldx          vr13,   a0,    t3
+    vldx          vr14,   a0,    t4
+    vldx          vr15,   a0,    t5
+    vldx          vr16,   a0,    t6
+    vldx          vr17,   a0,    t7
+    xvldi         xr8,    0x806     //"xvldi.w xr8 6"
+    xvsran.h.w    xr0,    xr0,   xr8
+    xvsran.h.w    xr1,    xr1,   xr8
+    xvsran.h.w    xr2,    xr2,   xr8
+    xvsran.h.w    xr3,    xr3,   xr8
+    xvsran.h.w    xr4,    xr4,   xr8
+    xvsran.h.w    xr5,    xr5,   xr8
+    xvsran.h.w    xr6,    xr6,   xr8
+    xvsran.h.w    xr7,    xr7,   xr8
+    xvpermi.d     xr0,    xr0,   0x08
+    xvpermi.d     xr1,    xr1,   0x08
+    xvpermi.d     xr2,    xr2,   0x08
+    xvpermi.d     xr3,    xr3,   0x08
+    xvpermi.d     xr4,    xr4,   0x08
+    xvpermi.d     xr5,    xr5,   0x08
+    xvpermi.d     xr6,    xr6,   0x08
+    xvpermi.d     xr7,    xr7,   0x08
+
+    vsllwil.hu.bu vr10,   vr10,  0
+    vsllwil.hu.bu vr11,   vr11,  0
+    vsllwil.hu.bu vr12,   vr12,  0
+    vsllwil.hu.bu vr13,   vr13,  0
+    vsllwil.hu.bu vr14,   vr14,  0
+    vsllwil.hu.bu vr15,   vr15,  0
+    vsllwil.hu.bu vr16,   vr16,  0
+    vsllwil.hu.bu vr17,   vr17,  0
+
+    vadd.h        vr0,    vr0,   vr10
+    vadd.h        vr1,    vr1,   vr11
+    vadd.h        vr2,    vr2,   vr12
+    vadd.h        vr3,    vr3,   vr13
+    vadd.h        vr4,    vr4,   vr14
+    vadd.h        vr5,    vr5,   vr15
+    vadd.h        vr6,    vr6,   vr16
+    vadd.h        vr7,    vr7,   vr17
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_idct_dc_add_8_lsx
+    vldrepl.h     vr4,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    fld.s         f0,     a0,    0
+    fldx.s        f1,     a0,    a2
+    fldx.s        f2,     a0,    t2
+    fldx.s        f3,     a0,    t3
+    st.h          zero,   a1,    0
+
+    vsrari.h      vr4,    vr4,   6
+    vilvl.w       vr0,    vr1,   vr0
+    vilvl.w       vr1,    vr3,   vr2
+    vsllwil.hu.bu vr0,    vr0,   0
+    vsllwil.hu.bu vr1,    vr1,   0
+    vadd.h        vr0,    vr0,   vr4
+    vadd.h        vr1,    vr1,   vr4
+    vssrarni.bu.h vr1,    vr0,   0
+
+    vbsrl.v       vr2,    vr1,   4
+    vbsrl.v       vr3,    vr1,   8
+    vbsrl.v       vr4,    vr1,   12
+    fst.s         f1,     a0,    0
+    fstx.s        f2,     a0,    a2
+    fstx.s        f3,     a0,    t2
+    fstx.s        f4,     a0,    t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_dc_add_8_lsx
+    vldrepl.h     vr8,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+
+    fld.d         f0,     a0,    0
+    fldx.d        f1,     a0,    a2
+    fldx.d        f2,     a0,    t2
+    fldx.d        f3,     a0,    t3
+    fldx.d        f4,     a0,    t4
+    fldx.d        f5,     a0,    t5
+    fldx.d        f6,     a0,    t6
+    fldx.d        f7,     a0,    t7
+    st.h          zero,   a1,    0
+
+    vsrari.h      vr8,    vr8,   6
+    vsllwil.hu.bu vr0,    vr0,   0
+    vsllwil.hu.bu vr1,    vr1,   0
+    vsllwil.hu.bu vr2,    vr2,   0
+    vsllwil.hu.bu vr3,    vr3,   0
+    vsllwil.hu.bu vr4,    vr4,   0
+    vsllwil.hu.bu vr5,    vr5,   0
+    vsllwil.hu.bu vr6,    vr6,   0
+    vsllwil.hu.bu vr7,    vr7,   0
+    vadd.h        vr0,    vr0,   vr8
+    vadd.h        vr1,    vr1,   vr8
+    vadd.h        vr2,    vr2,   vr8
+    vadd.h        vr3,    vr3,   vr8
+    vadd.h        vr4,    vr4,   vr8
+    vadd.h        vr5,    vr5,   vr8
+    vadd.h        vr6,    vr6,   vr8
+    vadd.h        vr7,    vr7,   vr8
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+function ff_h264_idct8_dc_add_8_lasx
+    xvldrepl.h    xr8,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+
+    fld.d         f0,     a0,    0
+    fldx.d        f1,     a0,    a2
+    fldx.d        f2,     a0,    t2
+    fldx.d        f3,     a0,    t3
+    fldx.d        f4,     a0,    t4
+    fldx.d        f5,     a0,    t5
+    fldx.d        f6,     a0,    t6
+    fldx.d        f7,     a0,    t7
+    st.h          zero,   a1,    0
+
+    xvsrari.h     xr8,    xr8,   6
+    xvpermi.q     xr1,    xr0,   0x20
+    xvpermi.q     xr3,    xr2,   0x20
+    xvpermi.q     xr5,    xr4,   0x20
+    xvpermi.q     xr7,    xr6,   0x20
+    xvsllwil.hu.bu xr1,   xr1,   0
+    xvsllwil.hu.bu xr3,   xr3,   0
+    xvsllwil.hu.bu xr5,   xr5,   0
+    xvsllwil.hu.bu xr7,   xr7,   0
+    xvadd.h       xr1,    xr1,   xr8
+    xvadd.h       xr3,    xr3,   xr8
+    xvadd.h       xr5,    xr5,   xr8
+    xvadd.h       xr7,    xr7,   xr8
+
+    xvssrarni.bu.h xr3,   xr1,   0
+    xvssrarni.bu.h xr7,   xr5,   0
+
+    xvpermi.q     xr1,    xr3,   0x11
+    xvpermi.q     xr5,    xr7,   0x11
+    xvbsrl.v      xr0,    xr1,   8
+    xvbsrl.v      xr2,    xr3,   8
+    xvbsrl.v      xr4,    xr5,   8
+    xvbsrl.v      xr6,    xr7,   8
+
+    fst.d         f3,     a0,    0
+    fstx.d        f1,     a0,    a2
+    fstx.d        f2,     a0,    t2
+    fstx.d        f0,     a0,    t3
+    fstx.d        f7,     a0,    t4
+    fstx.d        f5,     a0,    t5
+    fstx.d        f6,     a0,    t6
+    fstx.d        f4,     a0,    t7
+endfunc
+
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qmul quantization parameter
+ * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul){
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_luma_dc_dequant_idct_8_lsx
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    8
+    vld           vr2,    a1,    16
+    vld           vr3,    a1,    24
+    vreplgr2vr.w  vr8,    a2
+    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
+    LSX_BUTTERFLY_4_H  vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
+    LSX_BUTTERFLY_4_H  vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
+    LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
+    LSX_BUTTERFLY_4_H  vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
+    LSX_BUTTERFLY_4_H  vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+    vsllwil.w.h   vr0,    vr0,   0
+    vsllwil.w.h   vr1,    vr1,   0
+    vsllwil.w.h   vr2,    vr2,   0
+    vsllwil.w.h   vr3,    vr3,   0
+    vmul.w        vr0,    vr0,   vr8
+    vmul.w        vr1,    vr1,   vr8
+    vmul.w        vr2,    vr2,   vr8
+    vmul.w        vr3,    vr3,   vr8
+    vsrarni.h.w   vr1,    vr0,   8
+    vsrarni.h.w   vr3,    vr2,   8
+
+    vstelm.h      vr1,    a0,    0,   0
+    vstelm.h      vr1,    a0,    32,  4
+    vstelm.h      vr1,    a0,    64,  1
+    vstelm.h      vr1,    a0,    96,  5
+    vstelm.h      vr3,    a0,    128, 0
+    vstelm.h      vr3,    a0,    160, 4
+    vstelm.h      vr3,    a0,    192, 1
+    vstelm.h      vr3,    a0,    224, 5
+    addi.d        a0,     a0,    256
+    vstelm.h      vr1,    a0,    0,   2
+    vstelm.h      vr1,    a0,    32,  6
+    vstelm.h      vr1,    a0,    64,  3
+    vstelm.h      vr1,    a0,    96,  7
+    vstelm.h      vr3,    a0,    128, 2
+    vstelm.h      vr3,    a0,    160, 6
+    vstelm.h      vr3,    a0,    192, 3
+    vstelm.h      vr3,    a0,    224, 7
+endfunc
@@ -0,0 +1,184 @@
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_loongarch.h"
+#include "libavcodec/bit_depth_template.c"
+
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        int32_t nnz = nzc[scan8[i]];
+
+        if (nnz == 1 && ((dctcoef *) block)[i * 16]) {
+            ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    } else if (nnz) {
+            ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+            ff_h264_idct8_dc_add_8_lsx(dst + blk_offset[cnt],
+                                        block + cnt * 16 * sizeof(pixel),
+                                        dst_stride);
+        } else if (nnz) {
+            ff_h264_idct8_add_8_lsx(dst + blk_offset[cnt],
+                                     block + cnt * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+
+#if HAVE_LASX
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+            ff_h264_idct8_dc_add_8_lasx(dst + blk_offset[cnt],
+                                        block + cnt * 16 * sizeof(pixel),
+                                        dst_stride);
+        } else if (nnz) {
+            ff_h264_idct8_add_8_lasx(dst + blk_offset[cnt],
+                                     block + cnt * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+#endif // #if HAVE_LASX
+
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                             int16_t *block, int32_t dst_stride,
+                             const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 16; i < 20; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 32; i < 36; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                                 int16_t *block, int32_t dst_stride,
+                                 const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 16; i < 20; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 20; i < 24; i++) {
+        if (nzc[scan8[i + 4]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i + 4],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i + 4],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 32; i < 36; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 36; i < 40; i++) {
+        if (nzc[scan8[i + 4]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i + 4],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i + 4],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                                    int16_t *block, int32_t dst_stride,
+                                    const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel), dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
@@ -0,0 +1,299 @@
+/*
+ * Loongson LSX optimized h264intrapred
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const shufa
+.byte 6, 5, 4, 3, 2, 1, 0
+endconst
+
+const mulk
+.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
+endconst
+
+const mulh
+.byte 0, 0, 1, 0,  2,  0,  3, 0,  4, 0,  5, 0,  6, 0,  7, 0
+.byte 8, 0, 9, 0, 10,  0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
+endconst
+
+.macro PRED16X16_PLANE
+    slli.d        t6,    a1,    1
+    slli.d        t4,    a1,    3
+    addi.d        t0,    a0,    7
+    sub.d         t0,    t0,    a1
+    add.d         t1,    a0,    t4
+    addi.d        t1,    t1,    -1
+    sub.d         t2,    t1,    t6
+
+    ld.bu         t3,    t0,    1
+    ld.bu         t4,    t0,    -1
+    ld.bu         t5,    t1,    0
+    ld.bu         t7,    t2,    0
+    sub.d         t3,    t3,    t4
+    sub.d         t4,    t5,    t7
+
+    la.local      t5,    mulk
+    vld           vr0,   t5,    0
+    fld.d         f1,    t0,    2
+    fld.d         f2,    t0,    -8
+    la.local      t5,    shufa
+    fld.d         f3,    t5,    0
+    vshuf.b       vr2,   vr2,   vr2,   vr3
+    vilvl.b       vr1,   vr1,   vr2
+    vhsubw.hu.bu  vr1,   vr1,   vr1
+    vmul.h        vr0,   vr0,   vr1
+    vhaddw.w.h    vr1,   vr0,   vr0
+    vhaddw.d.w    vr0,   vr1,   vr1
+    vhaddw.q.d    vr1,   vr0,   vr0
+    vpickve2gr.w  t5,    vr1,   0
+    add.d         t3,    t3,    t5
+//2
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t5,    t7,    t8
+    slli.d        t5,    t5,    1
+
+//3&4
+    add.d         t1,    t1,    t6
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ld.bu         t7,    t1,    0
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    1
+    add.d         t7,    t7,    t8
+    add.d         t5,    t5,    t7
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t7,    t7,    t8
+    slli.d        t7,    t7,    2
+    add.d         t5,    t5,    t7
+
+//5&6
+    add.d         t1,    t1,    t6
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ld.bu         t7,    t1,    0
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    2
+    add.d         t7,    t7,    t8
+    add.d         t5,    t5,    t7
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    1
+    slli.d        t7,    t7,    2
+    add.d         t7,    t7,    t8
+    add.d         t5,    t5,    t7
+
+//7&8
+    add.d         t1,    t1,    t6
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ld.bu         t7,    t1,    0
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    3
+    sub.d         t7,    t8,    t7
+    add.d         t5,    t5,    t7
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t7,    t7,    t8
+    slli.d        t7,    t7,    3
+    add.d         t5,    t5,    t7
+    add.d         t4,    t4,    t5
+    add.d         t1,    t1,    a1
+.endm
+
+.macro PRED16X16_PLANE_END
+    ld.bu         t7,    t1,    0
+    ld.bu         t8,    t2,    16
+    add.d         t5,    t7,    t8
+    addi.d        t5,    t5,    1
+    slli.d        t5,    t5,    4
+    add.d         t7,    t3,    t4
+    slli.d        t8,    t7,    3
+    sub.d         t7,    t8,    t7
+    sub.d         t5,    t5,    t7
+
+    la.local      t8,    mulh
+    vld           vr3,   t8,    0
+    slli.d        t8,    t3,    3
+    vreplgr2vr.h  vr4,   t3
+    vreplgr2vr.h  vr9,   t8
+    vmul.h        vr5,   vr3,   vr4
+
+.rept 16
+    move          t7,    t5
+    add.d         t5,    t5,    t4
+    vreplgr2vr.h  vr6,   t7
+    vadd.h        vr7,   vr6,   vr5
+    vadd.h        vr8,   vr9,   vr7
+    vssrani.bu.h  vr8,   vr7,   5
+    vst           vr8,   a0,    0
+    add.d         a0,    a0,    a1
+.endr
+.endm
+
+.macro PRED16X16_PLANE_END_LASX
+    ld.bu         t7,    t1,    0
+    ld.bu         t8,    t2,    16
+    add.d         t5,    t7,    t8
+    addi.d        t5,    t5,    1
+    slli.d        t5,    t5,    4
+    add.d         t7,    t3,    t4
+    slli.d        t8,    t7,    3
+    sub.d         t7,    t8,    t7
+    sub.d         t5,    t5,    t7
+
+    la.local      t8,    mulh
+    xvld          xr3,   t8,    0
+    xvreplgr2vr.h xr4,   t3
+    xvmul.h       xr5,   xr3,   xr4
+
+.rept 8
+    move          t7,    t5
+    add.d         t5,    t5,    t4
+    xvreplgr2vr.h xr6,   t7
+    xvreplgr2vr.h xr8,   t5
+    add.d         t5,    t5,    t4
+    xvadd.h       xr7,   xr6,   xr5
+    xvadd.h       xr9,   xr8,   xr5
+
+    xvssrani.bu.h xr9,   xr7,   5
+    vstelm.d      vr9,   a0,    0,    0
+    xvstelm.d     xr9,   a0,    8,    2
+    add.d         a0,    a0,    a1
+    vstelm.d      vr9,   a0,    0,    1
+    xvstelm.d     xr9,   a0,    8,    3
+    add.d         a0,    a0,    a1
+.endr
+.endm
+
+/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_h264_8_lsx
+    PRED16X16_PLANE
+
+    slli.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    addi.d        t3,    t3,    32
+    srai.d        t3,    t3,    6
+    slli.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    addi.d        t4,    t4,    32
+    srai.d        t4,    t4,    6
+
+    PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_rv40_8_lsx
+    PRED16X16_PLANE
+
+    srai.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    srai.d        t3,    t3,    4
+    srai.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    srai.d        t4,    t4,    4
+
+    PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_svq3_8_lsx
+    PRED16X16_PLANE
+
+    li.d          t6,    4
+    li.d          t7,    5
+    li.d          t8,    16
+    div.d         t3,    t3,    t6
+    mul.d         t3,    t3,    t7
+    div.d         t3,    t3,    t8
+    div.d         t4,    t4,    t6
+    mul.d         t4,    t4,    t7
+    div.d         t4,    t4,    t8
+    move          t7,    t3
+    move          t3,    t4
+    move          t4,    t7
+
+    PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_h264_8_lasx
+    PRED16X16_PLANE
+
+    slli.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    addi.d        t3,    t3,    32
+    srai.d        t3,    t3,    6
+    slli.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    addi.d        t4,    t4,    32
+    srai.d        t4,    t4,    6
+
+    PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_rv40_8_lasx
+    PRED16X16_PLANE
+
+    srai.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    srai.d        t3,    t3,    4
+    srai.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    srai.d        t4,    t4,    4
+
+    PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_svq3_8_lasx
+    PRED16X16_PLANE
+
+    li.d          t5,    4
+    li.d          t7,    5
+    li.d          t8,    16
+    div.d         t3,    t3,    t5
+    mul.d         t3,    t3,    t7
+    div.d         t3,    t3,    t8
+    div.d         t4,    t4,    t5
+    mul.d         t4,    t4,    t7
+    div.d         t4,    t4,    t8
+    move          t7,    t3
+    move          t3,    t4
+    move          t4,    t7
+
+    PRED16X16_PLANE_END_LASX
+endfunc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264qpel_loongarch.h"
+#include "libavutil/attributes.h"
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/h264qpel.h"
+
+av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (8 == bit_depth) {
+            c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lsx;
+            c->put_h264_qpel_pixels_tab[0][1]  = ff_put_h264_qpel16_mc10_lsx;
+            c->put_h264_qpel_pixels_tab[0][2]  = ff_put_h264_qpel16_mc20_lsx;
+            c->put_h264_qpel_pixels_tab[0][3]  = ff_put_h264_qpel16_mc30_lsx;
+            c->put_h264_qpel_pixels_tab[0][4]  = ff_put_h264_qpel16_mc01_lsx;
+            c->put_h264_qpel_pixels_tab[0][5]  = ff_put_h264_qpel16_mc11_lsx;
+            c->put_h264_qpel_pixels_tab[0][6]  = ff_put_h264_qpel16_mc21_lsx;
+            c->put_h264_qpel_pixels_tab[0][7]  = ff_put_h264_qpel16_mc31_lsx;
+            c->put_h264_qpel_pixels_tab[0][8]  = ff_put_h264_qpel16_mc02_lsx;
+            c->put_h264_qpel_pixels_tab[0][9]  = ff_put_h264_qpel16_mc12_lsx;
+            c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx;
+            c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lsx;
+            c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lsx;
+            c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lsx;
+            c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lsx;
+            c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lsx;
+
+            c->avg_h264_qpel_pixels_tab[0][0]  = ff_avg_h264_qpel16_mc00_lsx;
+            c->avg_h264_qpel_pixels_tab[0][1]  = ff_avg_h264_qpel16_mc10_lsx;
+            c->avg_h264_qpel_pixels_tab[0][2]  = ff_avg_h264_qpel16_mc20_lsx;
+            c->avg_h264_qpel_pixels_tab[0][3]  = ff_avg_h264_qpel16_mc30_lsx;
+            c->avg_h264_qpel_pixels_tab[0][4]  = ff_avg_h264_qpel16_mc01_lsx;
+            c->avg_h264_qpel_pixels_tab[0][5]  = ff_avg_h264_qpel16_mc11_lsx;
+            c->avg_h264_qpel_pixels_tab[0][6]  = ff_avg_h264_qpel16_mc21_lsx;
+            c->avg_h264_qpel_pixels_tab[0][7]  = ff_avg_h264_qpel16_mc31_lsx;
+            c->avg_h264_qpel_pixels_tab[0][8]  = ff_avg_h264_qpel16_mc02_lsx;
+            c->avg_h264_qpel_pixels_tab[0][9]  = ff_avg_h264_qpel16_mc12_lsx;
+            c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lsx;
+            c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lsx;
+            c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lsx;
+            c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lsx;
+            c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lsx;
+            c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lsx;
+
+            c->put_h264_qpel_pixels_tab[1][0]  = ff_put_h264_qpel8_mc00_lsx;
+            c->put_h264_qpel_pixels_tab[1][1]  = ff_put_h264_qpel8_mc10_lsx;
+            c->put_h264_qpel_pixels_tab[1][2]  = ff_put_h264_qpel8_mc20_lsx;
+            c->put_h264_qpel_pixels_tab[1][3]  = ff_put_h264_qpel8_mc30_lsx;
+            c->put_h264_qpel_pixels_tab[1][4]  = ff_put_h264_qpel8_mc01_lsx;
+            c->put_h264_qpel_pixels_tab[1][5]  = ff_put_h264_qpel8_mc11_lsx;
+            c->put_h264_qpel_pixels_tab[1][6]  = ff_put_h264_qpel8_mc21_lsx;
+            c->put_h264_qpel_pixels_tab[1][7]  = ff_put_h264_qpel8_mc31_lsx;
+            c->put_h264_qpel_pixels_tab[1][8]  = ff_put_h264_qpel8_mc02_lsx;
+            c->put_h264_qpel_pixels_tab[1][9]  = ff_put_h264_qpel8_mc12_lsx;
+            c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lsx;
+            c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lsx;
+            c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lsx;
+            c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lsx;
+            c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lsx;
+            c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lsx;
+
+            c->avg_h264_qpel_pixels_tab[1][0]  = ff_avg_h264_qpel8_mc00_lsx;
+            c->avg_h264_qpel_pixels_tab[1][1]  = ff_avg_h264_qpel8_mc10_lsx;
+            c->avg_h264_qpel_pixels_tab[1][2]  = ff_avg_h264_qpel8_mc20_lsx;
+            c->avg_h264_qpel_pixels_tab[1][3]  = ff_avg_h264_qpel8_mc30_lsx;
+            c->avg_h264_qpel_pixels_tab[1][5]  = ff_avg_h264_qpel8_mc11_lsx;
+            c->avg_h264_qpel_pixels_tab[1][6]  = ff_avg_h264_qpel8_mc21_lsx;
+            c->avg_h264_qpel_pixels_tab[1][7]  = ff_avg_h264_qpel8_mc31_lsx;
+            c->avg_h264_qpel_pixels_tab[1][8]  = ff_avg_h264_qpel8_mc02_lsx;
+            c->avg_h264_qpel_pixels_tab[1][9]  = ff_avg_h264_qpel8_mc12_lsx;
+            c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx;
+            c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lsx;
+            c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lsx;
+            c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lsx;
+            c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lsx;
+        }
+    }
+#if HAVE_LASX
+    if (have_lasx(cpu_flags)) {
+        if (8 == bit_depth) {
+            c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lasx;
+            c->put_h264_qpel_pixels_tab[0][1]  = ff_put_h264_qpel16_mc10_lasx;
+            c->put_h264_qpel_pixels_tab[0][2]  = ff_put_h264_qpel16_mc20_lasx;
+            c->put_h264_qpel_pixels_tab[0][3]  = ff_put_h264_qpel16_mc30_lasx;
+            c->put_h264_qpel_pixels_tab[0][4]  = ff_put_h264_qpel16_mc01_lasx;
+            c->put_h264_qpel_pixels_tab[0][5]  = ff_put_h264_qpel16_mc11_lasx;
+
+            c->put_h264_qpel_pixels_tab[0][6]  = ff_put_h264_qpel16_mc21_lasx;
+            c->put_h264_qpel_pixels_tab[0][7]  = ff_put_h264_qpel16_mc31_lasx;
+            c->put_h264_qpel_pixels_tab[0][8]  = ff_put_h264_qpel16_mc02_lasx;
+            c->put_h264_qpel_pixels_tab[0][9]  = ff_put_h264_qpel16_mc12_lasx;
+            c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lasx;
+            c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lasx;
+            c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lasx;
+            c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lasx;
+            c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lasx;
+            c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lasx;
+            c->avg_h264_qpel_pixels_tab[0][0]  = ff_avg_h264_qpel16_mc00_lasx;
+            c->avg_h264_qpel_pixels_tab[0][1]  = ff_avg_h264_qpel16_mc10_lasx;
+            c->avg_h264_qpel_pixels_tab[0][2]  = ff_avg_h264_qpel16_mc20_lasx;
+            c->avg_h264_qpel_pixels_tab[0][3]  = ff_avg_h264_qpel16_mc30_lasx;
+            c->avg_h264_qpel_pixels_tab[0][4]  = ff_avg_h264_qpel16_mc01_lasx;
+            c->avg_h264_qpel_pixels_tab[0][5]  = ff_avg_h264_qpel16_mc11_lasx;
+            c->avg_h264_qpel_pixels_tab[0][6]  = ff_avg_h264_qpel16_mc21_lasx;
+            c->avg_h264_qpel_pixels_tab[0][7]  = ff_avg_h264_qpel16_mc31_lasx;
+            c->avg_h264_qpel_pixels_tab[0][8]  = ff_avg_h264_qpel16_mc02_lasx;
+            c->avg_h264_qpel_pixels_tab[0][9]  = ff_avg_h264_qpel16_mc12_lasx;
+            c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lasx;
+            c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lasx;
+            c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lasx;
+            c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lasx;
+            c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lasx;
+            c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lasx;
+
+            c->put_h264_qpel_pixels_tab[1][0]  = ff_put_h264_qpel8_mc00_lasx;
+            c->put_h264_qpel_pixels_tab[1][1]  = ff_put_h264_qpel8_mc10_lasx;
+            c->put_h264_qpel_pixels_tab[1][2]  = ff_put_h264_qpel8_mc20_lasx;
+            c->put_h264_qpel_pixels_tab[1][3]  = ff_put_h264_qpel8_mc30_lasx;
+            c->put_h264_qpel_pixels_tab[1][4]  = ff_put_h264_qpel8_mc01_lasx;
+            c->put_h264_qpel_pixels_tab[1][5]  = ff_put_h264_qpel8_mc11_lasx;
+            c->put_h264_qpel_pixels_tab[1][6]  = ff_put_h264_qpel8_mc21_lasx;
+            c->put_h264_qpel_pixels_tab[1][7]  = ff_put_h264_qpel8_mc31_lasx;
+            c->put_h264_qpel_pixels_tab[1][8]  = ff_put_h264_qpel8_mc02_lasx;
+            c->put_h264_qpel_pixels_tab[1][9]  = ff_put_h264_qpel8_mc12_lasx;
+            c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lasx;
+            c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lasx;
+            c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lasx;
+            c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lasx;
+            c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lasx;
+            c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lasx;
+            c->avg_h264_qpel_pixels_tab[1][0]  = ff_avg_h264_qpel8_mc00_lasx;
+            c->avg_h264_qpel_pixels_tab[1][1]  = ff_avg_h264_qpel8_mc10_lasx;
+            c->avg_h264_qpel_pixels_tab[1][2]  = ff_avg_h264_qpel8_mc20_lasx;
+            c->avg_h264_qpel_pixels_tab[1][3]  = ff_avg_h264_qpel8_mc30_lasx;
+            c->avg_h264_qpel_pixels_tab[1][5]  = ff_avg_h264_qpel8_mc11_lasx;
+            c->avg_h264_qpel_pixels_tab[1][6]  = ff_avg_h264_qpel8_mc21_lasx;
+            c->avg_h264_qpel_pixels_tab[1][7]  = ff_avg_h264_qpel8_mc31_lasx;
+            c->avg_h264_qpel_pixels_tab[1][8]  = ff_avg_h264_qpel8_mc02_lasx;
+            c->avg_h264_qpel_pixels_tab[1][9]  = ff_avg_h264_qpel8_mc12_lasx;
+            c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lasx;
+            c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lasx;
+            c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lasx;
+            c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lasx;
+            c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lasx;
+        }
+    }
+#endif
+}
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+#include "config.h"
+
+void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+                           ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+                          ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, int dstStride,
+                                  int srcStride);
+void avg_h264_qpel8_v_lowpass_lsx(uint8_t *dst, uint8_t *src, int dstStride,
+                                  int srcStride);
+void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+                           ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+                          ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+
+#if HAVE_LASX
+void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
+                                   int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
+                                   int alpha, int beta, int8_t *tc0);
+void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+#endif
+
+#endif  // #ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
@@ -0,0 +1,487 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264qpel_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/attributes.h"
+
+static void put_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    put_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+static void put_h264_qpel16_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           int dstStride, int srcStride)
+{
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_h264_qpel16_v_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[256];
+
+    put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[256];
+
+    put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
@@ -0,0 +1,162 @@
+/*
+ * Loongson LSX optimized add_residual functions for HEVC decoding
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * void ff_hevc_add_residual4x4_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_4x4_8
+    vldrepl.w      vr0,    a0,     0
+    add.d          t0,     a0,     a2
+    vldrepl.w      vr1,    t0,     0
+    vld            vr2,    a1,     0
+
+    vilvl.w        vr1,    vr1,    vr0
+    vsllwil.hu.bu  vr1,    vr1,    0
+    vadd.h         vr1,    vr1,    vr2
+    vssrani.bu.h   vr1,    vr1,    0
+
+    vstelm.w       vr1,    a0,     0,    0
+    vstelm.w       vr1,    t0,     0,    1
+.endm
+
+function ff_hevc_add_residual4x4_8_lsx
+    ADD_RES_LSX_4x4_8
+    alsl.d         a0,     a2,     a0,   1
+    addi.d         a1,     a1,     16
+    ADD_RES_LSX_4x4_8
+endfunc
+
+/*
+ * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_8x8_8
+    vldrepl.d      vr0,    a0,     0
+    add.d          t0,     a0,     a2
+    vldrepl.d      vr1,    t0,     0
+    add.d          t1,     t0,     a2
+    vldrepl.d      vr2,    t1,     0
+    add.d          t2,     t1,     a2
+    vldrepl.d      vr3,    t2,     0
+
+    vld            vr4,    a1,     0
+    addi.d         t3,     zero,   16
+    vldx           vr5,    a1,     t3
+    addi.d         t4,     a1,     32
+    vld            vr6,    t4,     0
+    vldx           vr7,    t4,     t3
+
+    vsllwil.hu.bu  vr0,    vr0,    0
+    vsllwil.hu.bu  vr1,    vr1,    0
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vsllwil.hu.bu  vr3,    vr3,    0
+    vadd.h         vr0,    vr0,    vr4
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+    vssrani.bu.h   vr1,    vr0,    0
+    vssrani.bu.h   vr3,    vr2,    0
+
+    vstelm.d       vr1,    a0,     0,     0
+    vstelm.d       vr1,    t0,     0,     1
+    vstelm.d       vr3,    t1,     0,     0
+    vstelm.d       vr3,    t2,     0,     1
+.endm
+
+function ff_hevc_add_residual8x8_8_lsx
+    ADD_RES_LSX_8x8_8
+    alsl.d         a0,     a2,     a0,    2
+    addi.d         a1,     a1,     64
+    ADD_RES_LSX_8x8_8
+endfunc
+
+/*
+ * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual16x16_8_lsx
+.rept 8
+    vld            vr0,    a0,     0
+    vldx           vr2,    a0,     a2
+
+    vld            vr4,    a1,     0
+    addi.d         t0,     zero,   16
+    vldx           vr5,    a1,     t0
+    addi.d         t1,     a1,     32
+    vld            vr6,    t1,     0
+    vldx           vr7,    t1,     t0
+
+    vexth.hu.bu    vr1,    vr0
+    vsllwil.hu.bu  vr0,    vr0,    0
+    vexth.hu.bu    vr3,    vr2
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vadd.h         vr0,    vr0,    vr4
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+
+    vssrani.bu.h   vr1,    vr0,    0
+    vssrani.bu.h   vr3,    vr2,    0
+
+    vst            vr1,    a0,     0
+    vstx           vr3,    a0,     a2
+
+    alsl.d         a0,     a2,     a0,   1
+    addi.d         a1,     a1,     64
+.endr
+endfunc
+
+/*
+ * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual32x32_8_lsx
+.rept 32
+    vld            vr0,    a0,     0
+    addi.w         t0,     zero,   16
+    vldx           vr2,    a0,     t0
+
+    vld            vr4,    a1,     0
+    vldx           vr5,    a1,     t0
+    addi.d         t1,     a1,     32
+    vld            vr6,    t1,     0
+    vldx           vr7,    t1,     t0
+
+    vexth.hu.bu    vr1,    vr0
+    vsllwil.hu.bu  vr0,    vr0,    0
+    vexth.hu.bu    vr3,    vr2
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vadd.h         vr0,    vr0,    vr4
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+
+    vssrani.bu.h   vr1,    vr0,    0
+    vssrani.bu.h   vr3,    vr2,    0
+
+    vst            vr1,    a0,     0
+    vstx           vr3,    a0,     t0
+
+    add.d          a0,     a0,     a2
+    addi.d         a1,     a1,     64
+.endr
+endfunc
@@ -0,0 +1,857 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro fr_store
+    addi.d        sp,       sp,       -64
+    fst.d         f24,      sp,       0
+    fst.d         f25,      sp,       8
+    fst.d         f26,      sp,       16
+    fst.d         f27,      sp,       24
+    fst.d         f28,      sp,       32
+    fst.d         f29,      sp,       40
+    fst.d         f30,      sp,       48
+    fst.d         f31,      sp,       56
+.endm
+
+.macro fr_recover
+    fld.d         f24,      sp,       0
+    fld.d         f25,      sp,       8
+    fld.d         f26,      sp,       16
+    fld.d         f27,      sp,       24
+    fld.d         f28,      sp,       32
+    fld.d         f29,      sp,       40
+    fld.d         f30,      sp,       48
+    fld.d         f31,      sp,       56
+    addi.d        sp,       sp,       64
+.endm
+
+.extern gt32x32_cnst1
+
+.extern gt32x32_cnst2
+
+.extern gt8x8_cnst
+
+.extern gt32x32_cnst0
+
+.macro idct_16x32_step1_lasx
+    xvldrepl.w    xr20,     t1,       0
+    xvldrepl.w    xr21,     t1,       4
+    xvldrepl.w    xr22,     t1,       8
+    xvldrepl.w    xr23,     t1,       12
+
+    xvmulwev.w.h  xr16,     xr8,      xr20
+    xvmaddwod.w.h xr16,     xr8,      xr20
+    xvmulwev.w.h  xr17,     xr9,      xr20
+    xvmaddwod.w.h xr17,     xr9,      xr20
+
+    xvmaddwev.w.h xr16,     xr10,     xr21
+    xvmaddwod.w.h xr16,     xr10,     xr21
+    xvmaddwev.w.h xr17,     xr11,     xr21
+    xvmaddwod.w.h xr17,     xr11,     xr21
+
+    xvmaddwev.w.h xr16,     xr12,     xr22
+    xvmaddwod.w.h xr16,     xr12,     xr22
+    xvmaddwev.w.h xr17,     xr13,     xr22
+    xvmaddwod.w.h xr17,     xr13,     xr22
+
+    xvmaddwev.w.h xr16,     xr14,     xr23
+    xvmaddwod.w.h xr16,     xr14,     xr23
+    xvmaddwev.w.h xr17,     xr15,     xr23
+    xvmaddwod.w.h xr17,     xr15,     xr23
+
+    xvld          xr0,      t2,       0
+    xvld          xr1,      t2,       32
+
+    xvadd.w       xr18,     xr0,      xr16
+    xvadd.w       xr19,     xr1,      xr17
+    xvsub.w       xr0,      xr0,      xr16
+    xvsub.w       xr1,      xr1,      xr17
+
+    xvst          xr18,     t2,       0
+    xvst          xr19,     t2,       32
+    xvst          xr0,      t3,       0
+    xvst          xr1,      t3,       32
+.endm
+
+.macro idct_16x32_step2_lasx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1
+
+    xvldrepl.w    xr20,     t1,       0
+    xvldrepl.w    xr21,     t1,       4
+    xvldrepl.w    xr22,     t1,       8
+    xvldrepl.w    xr23,     t1,       12
+
+    xvmulwev.w.h  \out0,    \in0,     xr20
+    xvmaddwod.w.h \out0,    \in0,     xr20
+    xvmulwev.w.h  \out1,    \in1,     xr20
+    xvmaddwod.w.h \out1,    \in1,     xr20
+    xvmaddwev.w.h \out0,    \in2,     xr21
+    xvmaddwod.w.h \out0,    \in2,     xr21
+    xvmaddwev.w.h \out1,    \in3,     xr21
+    xvmaddwod.w.h \out1,    \in3,     xr21
+    xvmaddwev.w.h \out0,    \in4,     xr22
+    xvmaddwod.w.h \out0,    \in4,     xr22
+    xvmaddwev.w.h \out1,    \in5,     xr22
+    xvmaddwod.w.h \out1,    \in5,     xr22
+    xvmaddwev.w.h \out0,    \in6,     xr23
+    xvmaddwod.w.h \out0,    \in6,     xr23
+    xvmaddwev.w.h \out1,    \in7,     xr23  // sum0_r
+    xvmaddwod.w.h \out1,    \in7,     xr23  // sum0_l
+.endm
+
+    /* loop for all columns of filter constants */
+.macro idct_16x32_step3_lasx round
+    xvadd.w       xr16,     xr16,     xr30
+    xvadd.w       xr17,     xr17,     xr31
+
+    xvld          xr0,      t2,       0
+    xvld          xr1,      t2,       32
+
+    xvadd.w       xr30,     xr0,      xr16
+    xvadd.w       xr31,     xr1,      xr17
+    xvsub.w       xr16,     xr0,      xr16
+    xvsub.w       xr17,     xr1,      xr17
+    xvssrarni.h.w xr31,     xr30,     \round
+    xvssrarni.h.w xr17,     xr16,     \round
+    xvst          xr31,     t4,       0
+    xvst          xr17,     t5,       0
+.endm
+
+.macro idct_16x32_lasx buf_pitch, round
+    addi.d        t2,       sp,       64
+
+    addi.d        t0,       a0,       \buf_pitch*4*2
+
+    // 4 12 20 28
+    xvld          xr0,      t0,       0
+    xvld          xr1,      t0,       \buf_pitch*8*2
+    xvld          xr2,      t0,       \buf_pitch*16*2
+    xvld          xr3,      t0,       \buf_pitch*24*2
+
+    xvilvl.h      xr10,     xr1,      xr0
+    xvilvh.h      xr11,     xr1,      xr0
+    xvilvl.h      xr12,     xr3,      xr2
+    xvilvh.h      xr13,     xr3,      xr2
+
+    la.local      t1,       gt32x32_cnst2
+
+    xvldrepl.w    xr20,     t1,       0
+    xvldrepl.w    xr21,     t1,       4
+    xvmulwev.w.h  xr14,     xr10,     xr20
+    xvmaddwod.w.h xr14,     xr10,     xr20
+    xvmulwev.w.h  xr15,     xr11,     xr20
+    xvmaddwod.w.h xr15,     xr11,     xr20
+    xvmaddwev.w.h xr14,     xr12,     xr21
+    xvmaddwod.w.h xr14,     xr12,     xr21
+    xvmaddwev.w.h xr15,     xr13,     xr21
+    xvmaddwod.w.h xr15,     xr13,     xr21
+
+    xvldrepl.w    xr20,     t1,       8
+    xvldrepl.w    xr21,     t1,       12
+    xvmulwev.w.h  xr16,     xr10,     xr20
+    xvmaddwod.w.h xr16,     xr10,     xr20
+    xvmulwev.w.h  xr17,     xr11,     xr20
+    xvmaddwod.w.h xr17,     xr11,     xr20
+    xvmaddwev.w.h xr16,     xr12,     xr21
+    xvmaddwod.w.h xr16,     xr12,     xr21
+    xvmaddwev.w.h xr17,     xr13,     xr21
+    xvmaddwod.w.h xr17,     xr13,     xr21
+
+    xvldrepl.w    xr20,     t1,       16
+    xvldrepl.w    xr21,     t1,       20
+    xvmulwev.w.h  xr18,     xr10,     xr20
+    xvmaddwod.w.h xr18,     xr10,     xr20
+    xvmulwev.w.h  xr19,     xr11,     xr20
+    xvmaddwod.w.h xr19,     xr11,     xr20
+    xvmaddwev.w.h xr18,     xr12,     xr21
+    xvmaddwod.w.h xr18,     xr12,     xr21
+    xvmaddwev.w.h xr19,     xr13,     xr21
+    xvmaddwod.w.h xr19,     xr13,     xr21
+
+    xvldrepl.w    xr20,     t1,       24
+    xvldrepl.w    xr21,     t1,       28
+    xvmulwev.w.h  xr22,     xr10,     xr20
+    xvmaddwod.w.h xr22,     xr10,     xr20
+    xvmulwev.w.h  xr23,     xr11,     xr20
+    xvmaddwod.w.h xr23,     xr11,     xr20
+    xvmaddwev.w.h xr22,     xr12,     xr21
+    xvmaddwod.w.h xr22,     xr12,     xr21
+    xvmaddwev.w.h xr23,     xr13,     xr21
+    xvmaddwod.w.h xr23,     xr13,     xr21
+
+    /* process coeff 0, 8, 16, 24 */
+    la.local      t1,       gt8x8_cnst
+
+    xvld          xr0,      a0,       0
+    xvld          xr1,      a0,       \buf_pitch*8*2
+    xvld          xr2,      a0,       \buf_pitch*16*2
+    xvld          xr3,      a0,       \buf_pitch*24*2
+
+    xvldrepl.w    xr20,     t1,       0
+    xvldrepl.w    xr21,     t1,       4
+
+    xvilvl.h      xr10,     xr2,      xr0
+    xvilvh.h      xr11,     xr2,      xr0
+    xvilvl.h      xr12,     xr3,      xr1
+    xvilvh.h      xr13,     xr3,      xr1
+
+    xvmulwev.w.h  xr4,      xr10,     xr20
+    xvmaddwod.w.h xr4,      xr10,     xr20  // sum0_r
+    xvmulwev.w.h  xr5,      xr11,     xr20
+    xvmaddwod.w.h xr5,      xr11,     xr20  // sum0_l
+    xvmulwev.w.h  xr6,      xr12,     xr21
+    xvmaddwod.w.h xr6,      xr12,     xr21  // tmp1_r
+    xvmulwev.w.h  xr7,      xr13,     xr21
+    xvmaddwod.w.h xr7,      xr13,     xr21  // tmp1_l
+
+    xvsub.w       xr0,      xr4,      xr6   // sum1_r
+    xvadd.w       xr1,      xr4,      xr6   // sum0_r
+    xvsub.w       xr2,      xr5,      xr7   // sum1_l
+    xvadd.w       xr3,      xr5,      xr7   // sum0_l
+
+    // HEVC_EVEN16_CALC
+    xvsub.w       xr24,     xr1,      xr14  // 7
+    xvsub.w       xr25,     xr3,      xr15
+    xvadd.w       xr14,     xr1,      xr14  // 0
+    xvadd.w       xr15,     xr3,      xr15
+    xvst          xr24,     t2,       7*16*4     // 448=16*28=7*16*4
+    xvst          xr25,     t2,       7*16*4+32  // 480
+    xvst          xr14,     t2,       0
+    xvst          xr15,     t2,       32
+
+    xvsub.w       xr26,     xr0,      xr22  // 4
+    xvsub.w       xr27,     xr2,      xr23
+    xvadd.w       xr22,     xr0,      xr22  // 3
+    xvadd.w       xr23,     xr2,      xr23
+    xvst          xr26,     t2,       4*16*4     // 256=4*16*4
+    xvst          xr27,     t2,       4*16*4+32  // 288
+    xvst          xr22,     t2,       3*16*4     // 192=3*16*4
+    xvst          xr23,     t2,       3*16*4+32  // 224
+
+    xvldrepl.w    xr20,     t1,       16
+    xvldrepl.w    xr21,     t1,       20
+
+    xvmulwev.w.h  xr4,      xr10,     xr20
+    xvmaddwod.w.h xr4,      xr10,     xr20
+    xvmulwev.w.h  xr5,      xr11,     xr20
+    xvmaddwod.w.h xr5,      xr11,     xr20
+    xvmulwev.w.h  xr6,      xr12,     xr21
+    xvmaddwod.w.h xr6,      xr12,     xr21
+    xvmulwev.w.h  xr7,      xr13,     xr21
+    xvmaddwod.w.h xr7,      xr13,     xr21
+
+    xvsub.w       xr0,      xr4,      xr6   // sum1_r
+    xvadd.w       xr1,      xr4,      xr6   // sum0_r
+    xvsub.w       xr2,      xr5,      xr7   // sum1_l
+    xvadd.w       xr3,      xr5,      xr7   // sum0_l
+
+    // HEVC_EVEN16_CALC
+    xvsub.w       xr24,     xr1,      xr16  // 6
+    xvsub.w       xr25,     xr3,      xr17
+    xvadd.w       xr16,     xr1,      xr16  // 1
+    xvadd.w       xr17,     xr3,      xr17
+    xvst          xr24,     t2,       6*16*4      // 384=6*16*4
+    xvst          xr25,     t2,       6*16*4+32   // 416
+    xvst          xr16,     t2,       1*16*4      // 64=1*16*4
+    xvst          xr17,     t2,       1*16*4+32   // 96
+
+    xvsub.w       xr26,     xr0,      xr18  // 5
+    xvsub.w       xr27,     xr2,      xr19
+    xvadd.w       xr18,     xr0,      xr18  // 2
+    xvadd.w       xr19,     xr2,      xr19
+    xvst          xr26,     t2,       5*16*4      // 320=5*16*4
+    xvst          xr27,     t2,       5*16*4+32   // 352
+    xvst          xr18,     t2,       2*16*4      // 128=2*16*4
+    xvst          xr19,     t2,       2*16*4+32   // 160
+
+    /* process coeff 2 6 10 14 18 22 26 30 */
+    addi.d        t0,       a0,       \buf_pitch*2*2
+
+    xvld          xr0,      t0,       0
+    xvld          xr1,      t0,       \buf_pitch*4*2
+    xvld          xr2,      t0,       \buf_pitch*8*2
+    xvld          xr3,      t0,       \buf_pitch*12*2
+
+    xvld          xr4,      t0,       \buf_pitch*16*2
+    xvld          xr5,      t0,       \buf_pitch*20*2
+    xvld          xr6,      t0,       \buf_pitch*24*2
+    xvld          xr7,      t0,       \buf_pitch*28*2
+
+    xvilvl.h      xr8,      xr1,      xr0
+    xvilvh.h      xr9,      xr1,      xr0
+    xvilvl.h      xr10,     xr3,      xr2
+    xvilvh.h      xr11,     xr3,      xr2
+    xvilvl.h      xr12,     xr5,      xr4
+    xvilvh.h      xr13,     xr5,      xr4
+    xvilvl.h      xr14,     xr7,      xr6
+    xvilvh.h      xr15,     xr7,      xr6
+
+    la.local      t1,       gt32x32_cnst1
+
+    addi.d        t2,       sp,       64
+    addi.d        t3,       sp,       64+960   // 30*32
+
+    idct_16x32_step1_lasx
+
+.rept 7
+    addi.d        t1,       t1,       16
+    addi.d        t2,       t2,       64
+    addi.d        t3,       t3,       -64
+    idct_16x32_step1_lasx
+.endr
+
+    addi.d        t0,       a0,       \buf_pitch*2
+
+    xvld          xr0,      t0,       0
+    xvld          xr1,      t0,       \buf_pitch*2*2
+    xvld          xr2,      t0,       \buf_pitch*4*2
+    xvld          xr3,      t0,       \buf_pitch*6*2
+    xvld          xr4,      t0,       \buf_pitch*8*2
+    xvld          xr5,      t0,       \buf_pitch*10*2
+    xvld          xr6,      t0,       \buf_pitch*12*2
+    xvld          xr7,      t0,       \buf_pitch*14*2
+
+    xvilvl.h      xr8,      xr1,      xr0
+    xvilvh.h      xr9,      xr1,      xr0
+    xvilvl.h      xr10,     xr3,      xr2
+    xvilvh.h      xr11,     xr3,      xr2
+    xvilvl.h      xr12,     xr5,      xr4
+    xvilvh.h      xr13,     xr5,      xr4
+    xvilvl.h      xr14,     xr7,      xr6
+    xvilvh.h      xr15,     xr7,      xr6
+
+    la.local      t1,       gt32x32_cnst0
+
+    idct_16x32_step2_lasx xr8, xr9, xr10, xr11, xr12, xr13, \
+                          xr14, xr15, xr16, xr17
+
+    addi.d        t0,       a0,       \buf_pitch*16*2+\buf_pitch*2
+
+    xvld          xr0,      t0,       0
+    xvld          xr1,      t0,       \buf_pitch*2*2
+    xvld          xr2,      t0,       \buf_pitch*4*2
+    xvld          xr3,      t0,       \buf_pitch*6*2
+    xvld          xr4,      t0,       \buf_pitch*8*2
+    xvld          xr5,      t0,       \buf_pitch*10*2
+    xvld          xr6,      t0,       \buf_pitch*12*2
+    xvld          xr7,      t0,       \buf_pitch*14*2
+
+    xvilvl.h      xr18,     xr1,      xr0
+    xvilvh.h      xr19,     xr1,      xr0
+    xvilvl.h      xr24,     xr3,      xr2
+    xvilvh.h      xr25,     xr3,      xr2
+    xvilvl.h      xr26,     xr5,      xr4
+    xvilvh.h      xr27,     xr5,      xr4
+    xvilvl.h      xr28,     xr7,      xr6
+    xvilvh.h      xr29,     xr7,      xr6
+
+    addi.d        t1,       t1,       16
+    idct_16x32_step2_lasx xr18, xr19, xr24, xr25, xr26, xr27, \
+                          xr28, xr29, xr30, xr31
+
+    addi.d        t4,       a0,       0
+    addi.d        t5,       a0,       \buf_pitch*31*2
+    addi.d        t2,       sp,       64
+
+    idct_16x32_step3_lasx \round
+
+.rept 15
+
+    addi.d        t1,       t1,       16
+    idct_16x32_step2_lasx xr8, xr9, xr10, xr11, xr12, xr13, \
+                          xr14, xr15, xr16, xr17
+
+    addi.d        t1,       t1,       16
+    idct_16x32_step2_lasx xr18, xr19, xr24, xr25, xr26, xr27, \
+                          xr28, xr29, xr30, xr31
+
+    addi.d        t2,       t2,       64
+    addi.d        t4,       t4,       \buf_pitch*2
+    addi.d        t5,       t5,       -\buf_pitch*2
+
+    idct_16x32_step3_lasx \round
+.endr
+
+.endm
+
+function hevc_idct_16x32_column_step1_lasx
+    addi.d        sp,       sp,      -1600        // 64+512*3
+    fr_store
+
+    idct_16x32_lasx 32, 7
+
+    fr_recover
+    addi.d        sp,       sp,      1600
+endfunc
+
+function hevc_idct_16x32_column_step2_lasx
+    addi.d        sp,       sp,      -1600        // 64+512*3
+    fr_store
+
+    idct_16x32_lasx 16, 12
+
+    fr_recover
+    addi.d        sp,       sp,      1600
+endfunc
+
+function hevc_idct_transpose_32x16_to_16x32_lasx
+    fr_store
+
+    xvld          xr0,      a0,       0
+    xvld          xr1,      a0,       64
+    xvld          xr2,      a0,       128
+    xvld          xr3,      a0,       192
+    xvld          xr4,      a0,       256
+    xvld          xr5,      a0,       320
+    xvld          xr6,      a0,       384
+    xvld          xr7,      a0,       448
+
+    xvpermi.q     xr8,      xr0,      0x01
+    xvpermi.q     xr9,      xr1,      0x01
+    xvpermi.q     xr10,     xr2,      0x01
+    xvpermi.q     xr11,     xr3,      0x01
+    xvpermi.q     xr12,     xr4,      0x01
+    xvpermi.q     xr13,     xr5,      0x01
+    xvpermi.q     xr14,     xr6,      0x01
+    xvpermi.q     xr15,     xr7,      0x01
+
+    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    addi.d        a0,       a0,       512
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       64
+    vld           vr26,     a0,       128
+    vld           vr27,     a0,       192
+    vld           vr28,     a0,       256
+    vld           vr29,     a0,       320
+    vld           vr30,     a0,       384
+    vld           vr31,     a0,       448
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr0,      xr24,     0x02
+    xvpermi.q     xr1,      xr25,     0x02
+    xvpermi.q     xr2,      xr26,     0x02
+    xvpermi.q     xr3,      xr27,     0x02
+    xvpermi.q     xr4,      xr28,     0x02
+    xvpermi.q     xr5,      xr29,     0x02
+    xvpermi.q     xr6,      xr30,     0x02
+    xvpermi.q     xr7,      xr31,     0x02
+
+    xvst          xr0,      a1,       0
+    xvst          xr1,      a1,       32
+    xvst          xr2,      a1,       64
+    xvst          xr3,      a1,       96
+    xvst          xr4,      a1,       128
+    xvst          xr5,      a1,       160
+    xvst          xr6,      a1,       192
+    xvst          xr7,      a1,       224
+
+    addi.d        a1,       a1,       256
+    addi.d        a0,       a0,       16
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       64
+    vld           vr26,     a0,       128
+    vld           vr27,     a0,       192
+    vld           vr28,     a0,       256
+    vld           vr29,     a0,       320
+    vld           vr30,     a0,       384
+    vld           vr31,     a0,       448
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr8,      xr24,     0x02
+    xvpermi.q     xr9,      xr25,     0x02
+    xvpermi.q     xr10,     xr26,     0x02
+    xvpermi.q     xr11,     xr27,     0x02
+    xvpermi.q     xr12,     xr28,     0x02
+    xvpermi.q     xr13,     xr29,     0x02
+    xvpermi.q     xr14,     xr30,     0x02
+    xvpermi.q     xr15,     xr31,     0x02
+
+    xvst          xr8,      a1,       0
+    xvst          xr9,      a1,       32
+    xvst          xr10,     a1,       64
+    xvst          xr11,     a1,       96
+    xvst          xr12,     a1,       128
+    xvst          xr13,     a1,       160
+    xvst          xr14,     a1,       192
+    xvst          xr15,     a1,       224
+
+    // second
+    addi.d        a0,       a0,       32-512-16
+
+    xvld          xr0,      a0,       0
+    xvld          xr1,      a0,       64
+    xvld          xr2,      a0,       128
+    xvld          xr3,      a0,       192
+    xvld          xr4,      a0,       256
+    xvld          xr5,      a0,       320
+    xvld          xr6,      a0,       384
+    xvld          xr7,      a0,       448
+
+    xvpermi.q     xr8,      xr0,      0x01
+    xvpermi.q     xr9,      xr1,      0x01
+    xvpermi.q     xr10,     xr2,      0x01
+    xvpermi.q     xr11,     xr3,      0x01
+    xvpermi.q     xr12,     xr4,      0x01
+    xvpermi.q     xr13,     xr5,      0x01
+    xvpermi.q     xr14,     xr6,      0x01
+    xvpermi.q     xr15,     xr7,      0x01
+
+    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    addi.d        a0,       a0,       512
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       64
+    vld           vr26,     a0,       128
+    vld           vr27,     a0,       192
+    vld           vr28,     a0,       256
+    vld           vr29,     a0,       320
+    vld           vr30,     a0,       384
+    vld           vr31,     a0,       448
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr0,      xr24,     0x02
+    xvpermi.q     xr1,      xr25,     0x02
+    xvpermi.q     xr2,      xr26,     0x02
+    xvpermi.q     xr3,      xr27,     0x02
+    xvpermi.q     xr4,      xr28,     0x02
+    xvpermi.q     xr5,      xr29,     0x02
+    xvpermi.q     xr6,      xr30,     0x02
+    xvpermi.q     xr7,      xr31,     0x02
+
+    addi.d        a1,       a1,       256
+    xvst          xr0,      a1,       0
+    xvst          xr1,      a1,       32
+    xvst          xr2,      a1,       64
+    xvst          xr3,      a1,       96
+    xvst          xr4,      a1,       128
+    xvst          xr5,      a1,       160
+    xvst          xr6,      a1,       192
+    xvst          xr7,      a1,       224
+
+    addi.d        a1,       a1,       256
+    addi.d        a0,       a0,       16
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       64
+    vld           vr26,     a0,       128
+    vld           vr27,     a0,       192
+    vld           vr28,     a0,       256
+    vld           vr29,     a0,       320
+    vld           vr30,     a0,       384
+    vld           vr31,     a0,       448
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr8,      xr24,     0x02
+    xvpermi.q     xr9,      xr25,     0x02
+    xvpermi.q     xr10,     xr26,     0x02
+    xvpermi.q     xr11,     xr27,     0x02
+    xvpermi.q     xr12,     xr28,     0x02
+    xvpermi.q     xr13,     xr29,     0x02
+    xvpermi.q     xr14,     xr30,     0x02
+    xvpermi.q     xr15,     xr31,     0x02
+
+    xvst          xr8,      a1,       0
+    xvst          xr9,      a1,       32
+    xvst          xr10,     a1,       64
+    xvst          xr11,     a1,       96
+    xvst          xr12,     a1,       128
+    xvst          xr13,     a1,       160
+    xvst          xr14,     a1,       192
+    xvst          xr15,     a1,       224
+
+    fr_recover
+endfunc
+
+function hevc_idct_transpose_16x32_to_32x16_lasx
+    fr_store
+
+    xvld          xr0,      a0,       0
+    xvld          xr1,      a0,       32
+    xvld          xr2,      a0,       64
+    xvld          xr3,      a0,       96
+    xvld          xr4,      a0,       128
+    xvld          xr5,      a0,       160
+    xvld          xr6,      a0,       192
+    xvld          xr7,      a0,       224
+
+    xvpermi.q     xr8,      xr0,      0x01
+    xvpermi.q     xr9,      xr1,      0x01
+    xvpermi.q     xr10,     xr2,      0x01
+    xvpermi.q     xr11,     xr3,      0x01
+    xvpermi.q     xr12,     xr4,      0x01
+    xvpermi.q     xr13,     xr5,      0x01
+    xvpermi.q     xr14,     xr6,      0x01
+    xvpermi.q     xr15,     xr7,      0x01
+
+    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    addi.d        a0,       a0,       256
+
+    vld          vr24,      a0,       0
+    vld          vr25,      a0,       32
+    vld          vr26,      a0,       64
+    vld          vr27,      a0,       96
+    vld          vr28,      a0,       128
+    vld          vr29,      a0,       160
+    vld          vr30,      a0,       192
+    vld          vr31,      a0,       224
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr0,      xr24,     0x02
+    xvpermi.q     xr1,      xr25,     0x02
+    xvpermi.q     xr2,      xr26,     0x02
+    xvpermi.q     xr3,      xr27,     0x02
+    xvpermi.q     xr4,      xr28,     0x02
+    xvpermi.q     xr5,      xr29,     0x02
+    xvpermi.q     xr6,      xr30,     0x02
+    xvpermi.q     xr7,      xr31,     0x02
+
+    xvst          xr0,      a1,       0
+    xvst          xr1,      a1,       64
+    xvst          xr2,      a1,       128
+    xvst          xr3,      a1,       192
+    xvst          xr4,      a1,       256
+    xvst          xr5,      a1,       320
+    xvst          xr6,      a1,       384
+    xvst          xr7,      a1,       448
+
+    addi.d        a1,       a1,       512
+    addi.d        a0,       a0,       16
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       32
+    vld           vr26,     a0,       64
+    vld           vr27,     a0,       96
+    vld           vr28,     a0,       128
+    vld           vr29,     a0,       160
+    vld           vr30,     a0,       192
+    vld           vr31,     a0,       224
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr8,      xr24,     0x02
+    xvpermi.q     xr9,      xr25,     0x02
+    xvpermi.q     xr10,     xr26,     0x02
+    xvpermi.q     xr11,     xr27,     0x02
+    xvpermi.q     xr12,     xr28,     0x02
+    xvpermi.q     xr13,     xr29,     0x02
+    xvpermi.q     xr14,     xr30,     0x02
+    xvpermi.q     xr15,     xr31,     0x02
+
+    xvst          xr8,      a1,       0
+    xvst          xr9,      a1,       64
+    xvst          xr10,     a1,       128
+    xvst          xr11,     a1,       192
+    xvst          xr12,     a1,       256
+    xvst          xr13,     a1,       320
+    xvst          xr14,     a1,       384
+    xvst          xr15,     a1,       448
+
+    // second
+    addi.d        a0,       a0,       256-16
+
+    xvld          xr0,      a0,       0
+    xvld          xr1,      a0,       32
+    xvld          xr2,      a0,       64
+    xvld          xr3,      a0,       96
+    xvld          xr4,      a0,       128
+    xvld          xr5,      a0,       160
+    xvld          xr6,      a0,       192
+    xvld          xr7,      a0,       224
+
+    xvpermi.q     xr8,      xr0,      0x01
+    xvpermi.q     xr9,      xr1,      0x01
+    xvpermi.q     xr10,     xr2,      0x01
+    xvpermi.q     xr11,     xr3,      0x01
+    xvpermi.q     xr12,     xr4,      0x01
+    xvpermi.q     xr13,     xr5,      0x01
+    xvpermi.q     xr14,     xr6,      0x01
+    xvpermi.q     xr15,     xr7,      0x01
+
+    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    addi.d        a0,       a0,       256
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       32
+    vld           vr26,     a0,       64
+    vld           vr27,     a0,       96
+    vld           vr28,     a0,       128
+    vld           vr29,     a0,       160
+    vld           vr30,     a0,       192
+    vld           vr31,     a0,       224
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr0,      xr24,     0x02
+    xvpermi.q     xr1,      xr25,     0x02
+    xvpermi.q     xr2,      xr26,     0x02
+    xvpermi.q     xr3,      xr27,     0x02
+    xvpermi.q     xr4,      xr28,     0x02
+    xvpermi.q     xr5,      xr29,     0x02
+    xvpermi.q     xr6,      xr30,     0x02
+    xvpermi.q     xr7,      xr31,     0x02
+
+    addi.d        a1,       a1,       -512+32
+
+    xvst          xr0,      a1,       0
+    xvst          xr1,      a1,       64
+    xvst          xr2,      a1,       128
+    xvst          xr3,      a1,       192
+    xvst          xr4,      a1,       256
+    xvst          xr5,      a1,       320
+    xvst          xr6,      a1,       384
+    xvst          xr7,      a1,       448
+
+    addi.d        a1,       a1,       512
+    addi.d        a0,       a0,       16
+
+    vld           vr24,     a0,       0
+    vld           vr25,     a0,       32
+    vld           vr26,     a0,       64
+    vld           vr27,     a0,       96
+    vld           vr28,     a0,       128
+    vld           vr29,     a0,       160
+    vld           vr30,     a0,       192
+    vld           vr31,     a0,       224
+
+    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
+                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+
+    xvpermi.q     xr8,      xr24,     0x02
+    xvpermi.q     xr9,      xr25,     0x02
+    xvpermi.q     xr10,     xr26,     0x02
+    xvpermi.q     xr11,     xr27,     0x02
+    xvpermi.q     xr12,     xr28,     0x02
+    xvpermi.q     xr13,     xr29,     0x02
+    xvpermi.q     xr14,     xr30,     0x02
+    xvpermi.q     xr15,     xr31,     0x02
+
+    xvst          xr8,      a1,       0
+    xvst          xr9,      a1,       64
+    xvst          xr10,     a1,       128
+    xvst          xr11,     a1,       192
+    xvst          xr12,     a1,       256
+    xvst          xr13,     a1,       320
+    xvst          xr14,     a1,       384
+    xvst          xr15,     a1,       448
+
+    fr_recover
+endfunc
+
+function ff_hevc_idct_32x32_lasx
+
+    addi.d        t7,       a0,       0
+    addi.d        t6,       a1,       0
+
+    addi.d        sp,       sp,       -8
+    st.d          ra,       sp,        0
+
+    bl hevc_idct_16x32_column_step1_lasx
+
+    addi.d        a0,       a0,       32
+
+    bl hevc_idct_16x32_column_step1_lasx
+
+    addi.d        sp,       sp,       -1086      // (16*32+31)*2
+    fr_store
+
+    addi.d        t8,       sp,       64+31*2    // tmp_buf_ptr
+
+    addi.d        a0,       t7,       0
+    addi.d        a1,       t8,       0
+    bl hevc_idct_transpose_32x16_to_16x32_lasx
+
+    addi.d        a0,       t8,       0
+    bl hevc_idct_16x32_column_step2_lasx
+
+    addi.d        a0,       t8,       0
+    addi.d        a1,       t7,       0
+    bl hevc_idct_transpose_16x32_to_32x16_lasx
+
+    // second
+    addi.d        a0,       t7,       32*8*2*2
+    addi.d        a1,       t8,       0
+    bl hevc_idct_transpose_32x16_to_16x32_lasx
+
+    addi.d        a0,       t8,       0
+    bl hevc_idct_16x32_column_step2_lasx
+
+    addi.d        a0,       t8,       0
+    addi.d        a1,       t7,       32*8*2*2
+    bl hevc_idct_transpose_16x32_to_32x16_lasx
+
+    fr_recover
+    addi.d        sp,       sp,       1086       // (16*32+31)*2
+
+    ld.d          ra,       sp,       0
+    addi.d        sp,       sp,       8
+
+endfunc
@@ -0,0 +1,842 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,          \
+                         sum0, sum1, sum2, sum3, shift)       \
+{                                                             \
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5;               \
+    __m128i cnst64 = __lsx_vldi(0x0840);                      \
+    __m128i cnst83 = __lsx_vldi(0x0853);                      \
+    __m128i cnst36 = __lsx_vldi(0x0824);                      \
+                                                              \
+    vec0 = __lsx_vdp2_w_h(in_r0, cnst64);                     \
+    vec1 = __lsx_vdp2_w_h(in_l0, cnst83);                     \
+    vec2 = __lsx_vdp2_w_h(in_r1, cnst64);                     \
+    vec3 = __lsx_vdp2_w_h(in_l1, cnst36);                     \
+    vec4 = __lsx_vdp2_w_h(in_l0, cnst36);                     \
+    vec5 = __lsx_vdp2_w_h(in_l1, cnst83);                     \
+                                                              \
+    sum0 = __lsx_vadd_w(vec0, vec2);                          \
+    sum1 = __lsx_vsub_w(vec0, vec2);                          \
+    vec1 = __lsx_vadd_w(vec1, vec3);                          \
+    vec4 = __lsx_vsub_w(vec4, vec5);                          \
+    sum2 = __lsx_vsub_w(sum1, vec4);                          \
+    sum3 = __lsx_vsub_w(sum0, vec1);                          \
+    sum0 = __lsx_vadd_w(sum0, vec1);                          \
+    sum1 = __lsx_vadd_w(sum1, vec4);                          \
+                                                              \
+    sum0 = __lsx_vsrari_w(sum0, shift);                       \
+    sum1 = __lsx_vsrari_w(sum1, shift);                       \
+    sum2 = __lsx_vsrari_w(sum2, shift);                       \
+    sum3 = __lsx_vsrari_w(sum3, shift);                       \
+    sum0 = __lsx_vsat_w(sum0, 15);                            \
+    sum1 = __lsx_vsat_w(sum1, 15);                            \
+    sum2 = __lsx_vsat_w(sum2, 15);                            \
+    sum3 = __lsx_vsat_w(sum3, 15);                            \
+}
+
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
+{                                                                        \
+    __m128i src0_r, src1_r, src2_r, src3_r;                              \
+    __m128i src0_l, src1_l, src2_l, src3_l;                              \
+    __m128i filter0, filter1, filter2, filter3;                          \
+    __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;        \
+    __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;        \
+    __m128i sum0_r, sum1_r, sum2_r, sum3_r;                              \
+    __m128i sum0_l, sum1_l, sum2_l, sum3_l;                              \
+                                                                         \
+    DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7,     \
+              src0_r, src1_r, src2_r, src3_r);                           \
+    DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7,     \
+              src0_l, src1_l, src2_l, src3_l);                           \
+                                                                         \
+    DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8,          \
+              filter, 12, filter0, filter1, filter2, filter3);           \
+    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
+              src1_r, filter1, src1_l, filter1,  temp0_r, temp0_l,       \
+              temp1_r, temp1_l);                                         \
+                                                                         \
+    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+                      sum1_l, sum1_r);                                   \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2,          \
+              src3_r, filter3, src3_l, filter3,  temp2_r, temp2_l,       \
+              temp3_r, temp3_l);                                         \
+    temp2_r = __lsx_vadd_w(temp2_r, temp3_r);                            \
+    temp2_l = __lsx_vadd_w(temp2_l, temp3_l);                            \
+    sum0_r  = __lsx_vadd_w(sum0_r, temp2_r);                             \
+    sum0_l  = __lsx_vadd_w(sum0_l, temp2_l);                             \
+    sum3_r  = __lsx_vsub_w(sum3_r, temp2_r);                             \
+    sum3_l  = __lsx_vsub_w(sum3_l, temp2_l);                             \
+                                                                         \
+    in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift);                     \
+    in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift);                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3,          \
+              src3_r, filter2, src3_l, filter2,  temp4_r, temp4_l,       \
+              temp5_r, temp5_l);                                         \
+    temp4_r = __lsx_vsub_w(temp4_r, temp5_r);                            \
+    temp4_l = __lsx_vsub_w(temp4_l, temp5_l);                            \
+    sum1_r  = __lsx_vadd_w(sum1_r, temp4_r);                             \
+    sum1_l  = __lsx_vadd_w(sum1_l, temp4_l);                             \
+    sum2_r  = __lsx_vsub_w(sum2_r, temp4_r);                             \
+    sum2_l  = __lsx_vsub_w(sum2_l, temp4_l);                             \
+                                                                         \
+    in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift);                     \
+    in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift);                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24,       \
+              filter, 28, filter0, filter1, filter2, filter3);           \
+    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
+              src1_r, filter1, src1_l, filter1,  temp0_r, temp0_l,       \
+              temp1_r, temp1_l);                                         \
+                                                                         \
+    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+                      sum1_l, sum1_r);                                   \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2,          \
+              src3_r, filter3, src3_l, filter3,  temp2_r, temp2_l,       \
+              temp3_r, temp3_l);                                         \
+    temp2_r = __lsx_vadd_w(temp2_r, temp3_r);                            \
+    temp2_l = __lsx_vadd_w(temp2_l, temp3_l);                            \
+    sum0_r  = __lsx_vadd_w(sum0_r, temp2_r);                             \
+    sum0_l  = __lsx_vadd_w(sum0_l, temp2_l);                             \
+    sum3_r  = __lsx_vsub_w(sum3_r, temp2_r);                             \
+    sum3_l  = __lsx_vsub_w(sum3_l, temp2_l);                             \
+                                                                         \
+    in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift);                     \
+    in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift);                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3,          \
+              src3_r, filter2, src3_l, filter2,  temp4_r, temp4_l,       \
+              temp5_r, temp5_l);                                         \
+    temp4_r = __lsx_vsub_w(temp4_r, temp5_r);                            \
+    temp4_l = __lsx_vsub_w(temp4_l, temp5_l);                            \
+    sum1_r  = __lsx_vsub_w(sum1_r, temp4_r);                             \
+    sum1_l  = __lsx_vsub_w(sum1_l, temp4_l);                             \
+    sum2_r  = __lsx_vadd_w(sum2_r, temp4_r);                             \
+    sum2_l  = __lsx_vadd_w(sum2_l, temp4_l);                             \
+                                                                         \
+    in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift);                     \
+    in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift);                     \
+}
+
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                   \
+                           src4_r, src5_r, src6_r, src7_r,                   \
+                           src0_l, src1_l, src2_l, src3_l,                   \
+                           src4_l, src5_l, src6_l, src7_l, shift)            \
+{                                                                            \
+    int16_t *ptr0, *ptr1;                                                    \
+    __m128i dst0, dst1;                                                      \
+    __m128i filter0, filter1, filter2, filter3;                              \
+    __m128i temp0_r, temp1_r, temp0_l, temp1_l;                              \
+    __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;          \
+    __m128i sum3_l, res0_r, res1_r, res0_l, res1_l;                          \
+                                                                             \
+    ptr0 = (buf_ptr + 112);                                                  \
+    ptr1 = (buf_ptr + 128);                                                  \
+    k = -1;                                                                  \
+                                                                             \
+    for (j = 0; j < 4; j++)                                                  \
+    {                                                                        \
+        DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16,         \
+                  filter, 20, filter0, filter1, filter2, filter3);           \
+        DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
+                  src4_r, filter2, src4_l, filter2,  sum0_r, sum0_l,         \
+                  sum2_r, sum2_l);                                           \
+        DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2,          \
+                  sum3_r, sum3_l);                                           \
+        DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l,        \
+                  src1_l, filter1, sum2_r, src5_r, filter3, sum2_l,          \
+                  src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l);          \
+        DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l,        \
+                  src6_l, filter3, sum3_r, sum3_l);                          \
+                                                                             \
+        sum1_r = sum0_r;                                                     \
+        sum1_l = sum0_l;                                                     \
+                                                                             \
+        DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24,        \
+                  filter, 28, filter0, filter1, filter2, filter3);           \
+        filter += 16;                                                        \
+        DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,          \
+                  temp0_r, temp0_l);                                         \
+        DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l,        \
+                  src6_l, filter2, sum2_r, sum2_l);                          \
+        DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2,          \
+                  temp1_r, temp1_l);                                         \
+                                                                             \
+        sum0_r = __lsx_vadd_w(sum0_r, temp0_r);                              \
+        sum0_l = __lsx_vadd_w(sum0_l, temp0_l);                              \
+        sum1_r = __lsx_vsub_w(sum1_r, temp0_r);                              \
+        sum1_l = __lsx_vsub_w(sum1_l, temp0_l);                              \
+        sum3_r = __lsx_vsub_w(temp1_r, sum3_r);                              \
+        sum3_l = __lsx_vsub_w(temp1_l, sum3_l);                              \
+                                                                             \
+        DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1,          \
+                  temp0_r, temp0_l);                                         \
+        DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l,        \
+                  src7_l, filter3, sum3_r, src4_r, filter3, sum3_l,          \
+                  src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l);          \
+                                                                             \
+        sum0_r = __lsx_vadd_w(sum0_r, temp0_r);                              \
+        sum0_l = __lsx_vadd_w(sum0_l, temp0_l);                              \
+        sum1_r = __lsx_vsub_w(sum1_r, temp0_r);                              \
+        sum1_l = __lsx_vsub_w(sum1_l, temp0_l);                              \
+                                                                             \
+        LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,    \
+                          res1_l, res1_r);                                   \
+        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift);                    \
+        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift);                    \
+        __lsx_vst(dst0, buf_ptr, 0);                                         \
+        __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0);               \
+                                                                             \
+        LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,    \
+                          res1_l, res1_r);                                   \
+                                                                             \
+        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift);                    \
+        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift);                    \
+        __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0);        \
+        __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0);        \
+                                                                             \
+        k *= -1;                                                             \
+        buf_ptr += 16;                                                       \
+    }                                                                        \
+}
+
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
+{                                                                     \
+    tmp0_r = __lsx_vld(input + load_idx * 8, 0);                      \
+    tmp0_l = __lsx_vld(input + load_idx * 8, 16);                     \
+    tmp1_r = sum0_r;                                                  \
+    tmp1_l = sum0_l;                                                  \
+    sum0_r = __lsx_vadd_w(sum0_r, tmp0_r);                            \
+    sum0_l = __lsx_vadd_w(sum0_l, tmp0_l);                            \
+    __lsx_vst(sum0_r, (input + load_idx * 8), 0);                     \
+    __lsx_vst(sum0_l, (input + load_idx * 8), 16);                    \
+    tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r);                            \
+    tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l);                            \
+    __lsx_vst(tmp1_r, (input + store_idx * 8), 0);                    \
+    __lsx_vst(tmp1_l, (input + store_idx * 8), 16);                   \
+}
+
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,     \
+                              res0, res1, res2, res3, shift)  \
+{                                                             \
+    __m128i vec0, vec1, vec2, vec3;                           \
+    __m128i cnst74 = __lsx_vldi(0x84a);                       \
+    __m128i cnst55 = __lsx_vldi(0x837);                       \
+    __m128i cnst29 = __lsx_vldi(0x81d);                       \
+                                                              \
+    vec0 = __lsx_vadd_w(in_r0, in_r1);                        \
+    vec2 = __lsx_vsub_w(in_r0, in_l1);                        \
+    res0 = __lsx_vmul_w(vec0, cnst29);                        \
+    res1 = __lsx_vmul_w(vec2, cnst55);                        \
+    res2 = __lsx_vsub_w(in_r0, in_r1);                        \
+    vec1 = __lsx_vadd_w(in_r1, in_l1);                        \
+    res2 = __lsx_vadd_w(res2, in_l1);                         \
+    vec3 = __lsx_vmul_w(in_l0, cnst74);                       \
+    res3 = __lsx_vmul_w(vec0, cnst55);                        \
+                                                              \
+    res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55));    \
+    res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29));    \
+    res2 = __lsx_vmul_w(res2, cnst74);                        \
+    res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29));    \
+                                                              \
+    res0 = __lsx_vadd_w(res0, vec3);                          \
+    res1 = __lsx_vadd_w(res1, vec3);                          \
+    res3 = __lsx_vsub_w(res3, vec3);                          \
+                                                              \
+    res0 = __lsx_vsrari_w(res0, shift);                       \
+    res1 = __lsx_vsrari_w(res1, shift);                       \
+    res2 = __lsx_vsrari_w(res2, shift);                       \
+    res3 = __lsx_vsrari_w(res3, shift);                       \
+    res0 = __lsx_vsat_w(res0, 15);                            \
+    res1 = __lsx_vsat_w(res1, 15);                            \
+    res2 = __lsx_vsat_w(res2, 15);                            \
+    res3 = __lsx_vsat_w(res3, 15);                            \
+}
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
+{
+    __m128i in0, in1;
+    __m128i in_r0, in_l0, in_r1, in_l1;
+    __m128i sum0, sum1, sum2, sum3;
+    __m128i zero = __lsx_vldi(0x00);
+
+    in0   = __lsx_vld(coeffs, 0);
+    in1   = __lsx_vld(coeffs, 16);
+    in_r0 = __lsx_vilvl_h(zero, in0);
+    in_l0 = __lsx_vilvh_h(zero, in0);
+    in_r1 = __lsx_vilvl_h(zero, in1);
+    in_l1 = __lsx_vilvh_h(zero, in1);
+
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+    LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+
+    /* Pack and transpose */
+    in0  = __lsx_vpickev_h(sum2, sum0);
+    in1  = __lsx_vpickev_h(sum3, sum1);
+    sum0 = __lsx_vilvl_h(in1, in0);
+    sum1 = __lsx_vilvh_h(in1, in0);
+    in0  = __lsx_vilvl_w(sum1, sum0);
+    in1  = __lsx_vilvh_w(sum1, sum0);
+
+    __lsx_vst(in0, coeffs, 0);
+    __lsx_vst(in1, coeffs, 16);
+}
+
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
+{
+    const int16_t *filter = &gt8x8_cnst[0];
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
+              coeffs, 48, in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
+              coeffs, 112, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    __lsx_vst(in0, coeffs, 0);
+    __lsx_vst(in1, coeffs, 16);
+    __lsx_vst(in2, coeffs, 32);
+    __lsx_vst(in3, coeffs, 48);
+    __lsx_vst(in4, coeffs, 64);
+    __lsx_vst(in5, coeffs, 80);
+    __lsx_vst(in6, coeffs, 96);
+    __lsx_vst(in7, coeffs, 112);
+}
+
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
+{
+    int16_t i, j, k;
+    int16_t buf[256];
+    int16_t *buf_ptr = &buf[0];
+    int16_t *src = coeffs;
+    const int16_t *filter = &gt16x16_cnst[0];
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+    for (i = 2; i--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+                  in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+                  in4, in5, in6, in7);
+        DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
+                  in8, in9, in10, in11);
+        DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
+                  in12, in13, in14, in15);
+
+        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_r, src1_r, src2_r, src3_r);
+        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_r, src5_r, src6_r, src7_r);
+        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_l, src1_l, src2_l, src3_l);
+        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_l, src5_l, src6_l, src7_l);
+
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 7);
+
+        src += 8;
+        buf_ptr = (&buf[0] + 8);
+        filter = &gt16x16_cnst[0];
+    }
+
+    src = &buf[0];
+    buf_ptr = coeffs;
+    filter = &gt16x16_cnst[0];
+
+    for (i = 2; i--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+                  in0, in8, in1, in9);
+        DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
+                  in2, in10, in3, in11);
+        DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
+                  in4, in12, in5, in13);
+        DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
+                  in6, in14, in7, in15);
+        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+                           in8, in9, in10, in11, in12, in13, in14, in15);
+        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_r, src1_r, src2_r, src3_r);
+        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_r, src5_r, src6_r, src7_r);
+        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_l, src1_l, src2_l, src3_l);
+        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 12);
+
+        src += 128;
+        buf_ptr = coeffs + 8;
+        filter = &gt16x16_cnst[0];
+    }
+
+    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
+              in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
+              in4, in5, in6, in7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    __lsx_vst(vec0, coeffs, 0);
+    __lsx_vst(vec1, coeffs, 32);
+    __lsx_vst(vec2, coeffs, 64);
+    __lsx_vst(vec3, coeffs, 96);
+    __lsx_vst(vec4, coeffs, 128);
+    __lsx_vst(vec5, coeffs, 160);
+    __lsx_vst(vec6, coeffs, 192);
+    __lsx_vst(vec7, coeffs, 224);
+
+    src = coeffs + 8;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+              in4, in5, in6, in7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    src = coeffs + 128;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+              in8, in9, in10, in11);
+    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+              in12, in13, in14, in15);
+
+    __lsx_vst(vec0, src, 0);
+    __lsx_vst(vec1, src, 32);
+    __lsx_vst(vec2, src, 64);
+    __lsx_vst(vec3, src, 96);
+    __lsx_vst(vec4, src, 128);
+    __lsx_vst(vec5, src, 160);
+    __lsx_vst(vec6, src, 192);
+    __lsx_vst(vec7, src, 224);
+    LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    src = coeffs + 8;
+    __lsx_vst(vec0, src, 0);
+    __lsx_vst(vec1, src, 32);
+    __lsx_vst(vec2, src, 64);
+    __lsx_vst(vec3, src, 96);
+    __lsx_vst(vec4, src, 128);
+    __lsx_vst(vec5, src, 160);
+    __lsx_vst(vec6, src, 192);
+    __lsx_vst(vec7, src, 224);
+
+    src = coeffs + 136;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+              in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+              in4, in5, in6, in7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    __lsx_vst(vec0, src, 0);
+    __lsx_vst(vec1, src, 32);
+    __lsx_vst(vec2, src, 64);
+    __lsx_vst(vec3, src, 96);
+    __lsx_vst(vec4, src, 128);
+    __lsx_vst(vec5, src, 160);
+    __lsx_vst(vec6, src, 192);
+    __lsx_vst(vec7, src, 224);
+}
+
+static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
+                                      uint8_t round)
+{
+    uint8_t i;
+    int32_t buf_pitch_2  = buf_pitch << 1;
+    int32_t buf_pitch_4  = buf_pitch << 2;
+    int32_t buf_pitch_8  = buf_pitch << 3;
+    int32_t buf_pitch_16 = buf_pitch << 4;
+
+    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
+    int16_t *src0 = (coeffs + buf_pitch);
+    int16_t *src1 = (coeffs + buf_pitch_2);
+    int16_t *src2 = (coeffs + buf_pitch_4);
+    int16_t *src3 = (coeffs);
+    int32_t tmp_buf[8 * 32 + 15];
+    int32_t *tmp_buf_ptr = tmp_buf + 15;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+    __m128i filter0, filter1, filter2, filter3;
+    __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+    /* Align pointer to 64 byte boundary */
+    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+    /* process coeff 4, 12, 20, 28 */
+    in0 = __lsx_vld(src2, 0);
+    in1 = __lsx_vld(src2 + buf_pitch_8, 0);
+    in2 = __lsx_vld(src2 + buf_pitch_16, 0);
+    in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
+    in4 = __lsx_vld(src3, 0);
+    in5 = __lsx_vld(src3 + buf_pitch_8, 0);
+    in6 = __lsx_vld(src3 + buf_pitch_16, 0);
+    in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
+              src0_r, src1_r, src2_r, src3_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
+              src0_l, src1_l, src2_l, src3_l);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 0);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 16);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 32);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 48);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 64);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 80);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 96);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 112);
+
+    /* process coeff 0, 8, 16, 24 */
+    filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
+    filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
+    filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
+
+    /* process coeff 2 6 10 14 18 22 26 30 */
+    in0 = __lsx_vld(src1, 0);
+    in1 = __lsx_vld(src1 + buf_pitch_4, 0);
+    in2 = __lsx_vld(src1 + buf_pitch_8, 0);
+    in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
+    in4 = __lsx_vld(src1 + buf_pitch_16, 0);
+    in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
+    in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
+    in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);
+
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_r, src1_r, src2_r, src3_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_l, src1_l, src2_l, src3_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 8; i++) {
+        /* processing single column of constants */
+        filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
+        filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
+        filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
+        filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
+        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+
+        tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
+        tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+        __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
+        __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
+        __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
+        __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
+
+        filter_ptr1 += 8;
+    }
+
+    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+    in0 = __lsx_vld(src0, 0);
+    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+    src0 += 16 * buf_pitch;
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_r, src1_r, src2_r, src3_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_l, src1_l, src2_l, src3_l);
+    in0 = __lsx_vld(src0, 0);
+    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src4_r, src5_r, src6_r, src7_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src4_l, src5_l, src6_l, src7_l);
+
+    /* loop for all columns of filter constants */
+    for (i = 0; i < 16; i++) {
+        /* processing single column of constants */
+        filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
+        filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
+        filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
+        filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
+        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+        tmp1_r = sum0_r;
+        tmp1_l = sum0_l;
+
+        filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
+        filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
+        filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
+        filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
+        sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
+        sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
+        sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+        sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+        tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
+        tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+        sum1_r = __lsx_vreplgr2vr_w(round);
+        tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
+        tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
+        in0    = __lsx_vpackev_d(tmp0_l, tmp0_r);
+        __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
+        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+        tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
+        tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
+        in0    = __lsx_vpackev_d(tmp1_l, tmp1_r);
+        __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
+
+        filter_ptr0 += 16;
+    }
+}
+
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+    uint8_t i;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
+                  coeffs, 192, in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
+                  coeffs, 448, in4, in5, in6, in7);
+        coeffs += 8;
+        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        __lsx_vst(in0, tmp_buf, 0);
+        __lsx_vst(in1, tmp_buf, 16);
+        __lsx_vst(in2, tmp_buf, 32);
+        __lsx_vst(in3, tmp_buf, 48);
+        __lsx_vst(in4, tmp_buf, 64);
+        __lsx_vst(in5, tmp_buf, 80);
+        __lsx_vst(in6, tmp_buf, 96);
+        __lsx_vst(in7, tmp_buf, 112);
+        tmp_buf += 64;
+    }
+}
+
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+    uint8_t i;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
+                  tmp_buf, 48, in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
+                  tmp_buf, 112, in4, in5, in6, in7);
+        tmp_buf += 64;
+        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        __lsx_vst(in0, coeffs, 0);
+        __lsx_vst(in1, coeffs, 64);
+        __lsx_vst(in2, coeffs, 128);
+        __lsx_vst(in3, coeffs, 192);
+        __lsx_vst(in4, coeffs, 256);
+        __lsx_vst(in5, coeffs, 320);
+        __lsx_vst(in6, coeffs, 384);
+        __lsx_vst(in7, coeffs, 448);
+        coeffs += 8;
+    }
+}
+
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
+{
+    uint8_t row_cnt, col_cnt;
+    int16_t *src = coeffs;
+    int16_t tmp_buf[8 * 32 + 31];
+    int16_t *tmp_buf_ptr = tmp_buf + 31;
+    uint8_t round;
+    int32_t buf_pitch;
+
+    /* Align pointer to 64 byte boundary */
+    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+    /* column transform */
+    round = 7;
+    buf_pitch = 32;
+    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+        /* process 8x32 blocks */
+        hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
+    }
+
+    /* row transform */
+    round = 12;
+    buf_pitch = 8;
+    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+        /* process 32x8 blocks */
+        src = (coeffs + 32 * 8 * row_cnt);
+
+        hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+        hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
+        hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+    }
+}
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i mask1, mask2, mask3;
+    __m128i filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+    __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+    __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+    __m128i weight_vec, offset_vec, rnd_vec;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= (src_stride_3x + 3);
+    weight_vec = __lsx_vreplgr2vr_w(weight);
+    offset_vec = __lsx_vreplgr2vr_w(offset);
+    rnd_vec = __lsx_vreplgr2vr_w(rnd_val);
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+              filter_x, 6, filt0, filt1, filt2, filt3);
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+        src4 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src5, src6);
+        src_tmp += src_stride_3x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+                  vec12, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+                  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+                  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+        DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            src7 = __lsx_vld(src_tmp, 0);
+            src8 = __lsx_vldx(src_tmp, src_stride);
+            src_tmp += src_stride_2x;
+            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+                      filt2, dst7, dst7);
+            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+            dst76_r = __lsx_vilvl_h(dst7, dst6);
+            dst76_l = __lsx_vilvh_h(dst7, dst6);
+            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+                      dst0_r, dst0_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+                      dst76_l, filt_h3, dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+            /* row 8 */
+            DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+                      src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+            dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+                      filt2, dst8, dst8);
+            dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+            dst87_r = __lsx_vilvl_h(dst8, dst7);
+            dst87_l = __lsx_vilvh_h(dst8, dst7);
+            DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+                      dst1_r, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+                      dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+                      dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+                      dst87_l, filt_h3, dst1_r, dst1_l);
+            DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+
+            DUP2_ARG2(__lsx_vmul_w, dst0_r, weight_vec, dst0_l, weight_vec,
+                      dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vmul_w, dst1_r, weight_vec, dst1_l, weight_vec,
+                      dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrar_w, dst0_r, rnd_vec, dst1_r, rnd_vec, dst0_l,
+                     rnd_vec, dst1_l, rnd_vec, dst0_r, dst1_r, dst0_l, dst1_l);
+
+            DUP2_ARG2(__lsx_vadd_w, dst0_r, offset_vec, dst0_l, offset_vec,
+                      dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vadd_w, dst1_r, offset_vec, dst1_l, offset_vec,
+                      dst1_r, dst1_l);
+            DUP4_ARG1(__lsx_vclip255_w, dst0_r, dst1_r, dst0_l, dst1_l, dst0_r,
+                      dst1_r, dst0_l, dst1_l);
+            DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+                      dst0_r, dst1_r);
+            dst0_r = __lsx_vpickev_b(dst1_r, dst0_r);
+
+            __lsx_vstelm_d(dst0_r, dst_tmp, 0, 0);
+            __lsx_vstelm_d(dst0_r, dst_tmp + dst_stride, 0, 1);
+            dst_tmp += dst_stride_2x;
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static
+void hevc_hv_8t_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                       int32_t dst_stride, const int8_t *filter_x,
+                       const int8_t *filter_y, int32_t height, int32_t weight,
+                       int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 8);
+}
+
+static
+void hevc_hv_8t_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 16);
+}
+
+static
+void hevc_hv_8t_24w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 24);
+}
+
+static
+void hevc_hv_8t_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 32);
+}
+
+static
+void hevc_hv_8t_48w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 48);
+}
+
+static
+void hevc_hv_8t_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 64);
+}
+
+
+#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                           \
+void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,            \
+                                                      ptrdiff_t dst_stride,    \
+                                                      const uint8_t *src,      \
+                                                      ptrdiff_t src_stride,    \
+                                                      int height,              \
+                                                      int denom,               \
+                                                      int weight,              \
+                                                      int offset,              \
+                                                      intptr_t mx,             \
+                                                      intptr_t my,             \
+                                                      int width)               \
+{                                                                              \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx];                      \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my];                      \
+    int shift = denom + 14 - 8;                                                \
+                                                                               \
+    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, filter_x,\
+                                    filter_y,  height, weight, offset, shift); \
+}
+
+UNI_W_MC_HV(qpel, 8, 8);
+UNI_W_MC_HV(qpel, 16, 8);
+UNI_W_MC_HV(qpel, 24, 8);
+UNI_W_MC_HV(qpel, 32, 8);
+UNI_W_MC_HV(qpel, 48, 8);
+UNI_W_MC_HV(qpel, 64, 8);
+
+#undef UNI_W_MC_HV
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
+
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+            c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+            c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+            c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+            c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+            c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+            c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+            c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_lsx;
+            c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_lsx;
+
+            c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+            c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+            c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+            c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+            c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+            c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+            c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+
+            c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_lsx;
+            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_lsx;
+            c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_lsx;
+            c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_lsx;
+            c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_lsx;
+            c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_lsx;
+            c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_lsx;
+            c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_lsx;
+
+            c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_lsx;
+            c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_lsx;
+            c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_lsx;
+            c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_lsx;
+            c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_lsx;
+            c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_lsx;
+            c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_lsx;
+            c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_lsx;
+
+            c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_lsx;
+            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_lsx;
+            c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_lsx;
+            c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_lsx;
+            c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_lsx;
+            c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_lsx;
+            c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_lsx;
+            c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_lsx;
+
+            c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_lsx;
+
+            c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_lsx;
+            c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_lsx;
+            c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_lsx;
+
+            c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_lsx;
+            c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_lsx;
+            c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
+            c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
+            c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+
+            c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+            c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+            c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+            c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+            c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+            c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+            c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_lsx;
+            c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_lsx;
+
+            c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+            c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_lsx;
+            c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+            c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+            c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+            c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+            c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+
+            c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_lsx;
+            c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_lsx;
+            c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_lsx;
+            c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_lsx;
+            c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_lsx;
+            c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_lsx;
+
+            c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_lsx;
+            c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_lsx;
+            c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_lsx;
+            c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_lsx;
+            c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_lsx;
+            c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_lsx;
+
+            c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_lsx;
+            c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_lsx;
+            c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_lsx;
+            c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_lsx;
+            c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_lsx;
+
+            c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_lsx;
+            c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_lsx;
+            c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_lsx;
+            c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_lsx;
+            c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_lsx;
+            c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_lsx;
+            c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lsx;
+            c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lsx;
+            c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lsx;
+
+            c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_lsx;
+            c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_lsx;
+            c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_lsx;
+            c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_lsx;
+
+            c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_lsx;
+            c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_lsx;
+            c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_lsx;
+            c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
+            c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+
+            c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_lsx;
+            c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_uni_qpel_h6_8_lsx;
+            c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_lsx;
+            c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_lsx;
+            c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_lsx;
+            c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_lsx;
+            c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_lsx;
+            c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_lsx;
+            c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lsx;
+
+            c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_lsx;
+            c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_lsx;
+            c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_lsx;
+            c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_lsx;
+
+            c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_lsx;
+            c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_lsx;
+            c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_lsx;
+            c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_lsx;
+            c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_lsx;
+            c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_lsx;
+
+            c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_lsx;
+            c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_lsx;
+
+            c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_lsx;
+            c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_lsx;
+            c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_lsx;
+            c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
+            c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
+
+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx;
+            c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx;
+            c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx;
+            c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx;
+            c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx;
+            c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx;
+            c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx;
+            c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx;
+            c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_epel_uni_w_h4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_epel_uni_w_h6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_epel_uni_w_h8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_epel_uni_w_h12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_epel_uni_w_h16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_epel_uni_w_h24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_epel_uni_w_h32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][1] = ff_hevc_put_hevc_epel_uni_w_h48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][1] = ff_hevc_put_hevc_epel_uni_w_h64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_epel_uni_w_v4_8_lsx;
+            c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_epel_uni_w_v6_8_lsx;
+            c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_epel_uni_w_v8_8_lsx;
+            c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_epel_uni_w_v12_8_lsx;
+            c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_epel_uni_w_v16_8_lsx;
+            c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_epel_uni_w_v24_8_lsx;
+            c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_epel_uni_w_v32_8_lsx;
+            c->put_hevc_epel_uni_w[8][1][0] = ff_hevc_put_hevc_epel_uni_w_v48_8_lsx;
+            c->put_hevc_epel_uni_w[9][1][0] = ff_hevc_put_hevc_epel_uni_w_v64_8_lsx;
+
+            c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
+            c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv64_8_lsx;
+
+            c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][1][0] = ff_hevc_put_hevc_qpel_uni_w_v6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_qpel_uni_w_v24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_qpel_uni_w_v32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_qpel_uni_w_v48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_qpel_uni_w_v64_8_lsx;
+
+            c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][1] = ff_hevc_put_hevc_qpel_uni_w_h6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_qpel_uni_w_h8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_qpel_uni_w_h12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_qpel_uni_w_h16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_qpel_uni_w_h24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_qpel_uni_w_h32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_qpel_uni_w_h48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_qpel_uni_w_h64_8_lsx;
+
+            c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_lsx;
+            c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_8_lsx;
+            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_8_lsx;
+            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_8_lsx;
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
+
+            c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
+            c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_lsx;
+
+            c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_lsx;
+            c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_lsx;
+
+            c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_lsx;
+            c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+            c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_lsx;
+            c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+            c->idct[0] = ff_hevc_idct_4x4_lsx;
+            c->idct[1] = ff_hevc_idct_8x8_lsx;
+            c->idct[2] = ff_hevc_idct_16x16_lsx;
+            c->idct[3] = ff_hevc_idct_32x32_lsx;
+
+            c->add_residual[0] = ff_hevc_add_residual4x4_8_lsx;
+            c->add_residual[1] = ff_hevc_add_residual8x8_8_lsx;
+            c->add_residual[2] = ff_hevc_add_residual16x16_8_lsx;
+            c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
+        }
+    }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx;
+            c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx;
+            c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx;
+            c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx;
+            c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx;
+            c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx;
+            c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx;
+            c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_epel_uni_w_h6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_epel_uni_w_h8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_epel_uni_w_h12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_epel_uni_w_h16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_epel_uni_w_h24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_epel_uni_w_h32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][1] = ff_hevc_put_hevc_epel_uni_w_h48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][1] = ff_hevc_put_hevc_epel_uni_w_h64_8_lasx;
+
+            c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_qpel_uni_w_v24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_qpel_uni_w_v32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_qpel_uni_w_v48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_qpel_uni_w_v64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_epel_uni_w_v6_8_lasx;
+            c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_epel_uni_w_v8_8_lasx;
+            c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_epel_uni_w_v12_8_lasx;
+            c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_epel_uni_w_v16_8_lasx;
+            c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_epel_uni_w_v24_8_lasx;
+            c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_epel_uni_w_v32_8_lasx;
+            c->put_hevc_epel_uni_w[8][1][0] = ff_hevc_put_hevc_epel_uni_w_v48_8_lasx;
+            c->put_hevc_epel_uni_w[9][1][0] = ff_hevc_put_hevc_epel_uni_w_v64_8_lasx;
+
+            c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_qpel_uni_w_h4_8_lasx;
+            c->put_hevc_qpel_uni_w[2][0][1] = ff_hevc_put_hevc_qpel_uni_w_h6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_qpel_uni_w_h8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_qpel_uni_w_h12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_qpel_uni_w_h16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_qpel_uni_w_h24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_qpel_uni_w_h32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_qpel_uni_w_h48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx;
+
+            c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_lasx;
+            c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_lasx;
+            c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_lasx;
+            c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_lasx;
+            c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_lasx;
+            c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lasx;
+
+            c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_lasx;
+            c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_lasx;
+            c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lasx;
+            c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lasx;
+            c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lasx;
+
+            c->idct[3] = ff_hevc_idct_32x32_lasx;
+        }
+    }
+}
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevc/dsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,  \
+                                                          ptrdiff_t      \
+                                                          dst_stride,    \
+                                                          const uint8_t *src,  \
+                                                          ptrdiff_t      \
+                                                          src_stride,    \
+                                                          int height,    \
+                                                          int denom,     \
+                                                          int wx,        \
+                                                          int ox,        \
+                                                          intptr_t mx,   \
+                                                          intptr_t my,   \
+                                                          int width)
+
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+PEL_UNI_W(qpel, v, 8);
+PEL_UNI_W(qpel, v, 12);
+PEL_UNI_W(qpel, v, 16);
+PEL_UNI_W(qpel, v, 24);
+PEL_UNI_W(qpel, v, 32);
+PEL_UNI_W(qpel, v, 48);
+PEL_UNI_W(qpel, v, 64);
+
+PEL_UNI_W(qpel, h, 4);
+PEL_UNI_W(qpel, h, 6);
+PEL_UNI_W(qpel, h, 8);
+PEL_UNI_W(qpel, h, 12);
+PEL_UNI_W(qpel, h, 16);
+PEL_UNI_W(qpel, h, 24);
+PEL_UNI_W(qpel, h, 32);
+PEL_UNI_W(qpel, h, 48);
+PEL_UNI_W(qpel, h, 64);
+
+PEL_UNI_W(epel, hv, 6);
+PEL_UNI_W(epel, hv, 8);
+PEL_UNI_W(epel, hv, 12);
+PEL_UNI_W(epel, hv, 16);
+PEL_UNI_W(epel, hv, 24);
+PEL_UNI_W(epel, hv, 32);
+PEL_UNI_W(epel, hv, 48);
+PEL_UNI_W(epel, hv, 64);
+
+PEL_UNI_W(epel, v, 6);
+PEL_UNI_W(epel, v, 8);
+PEL_UNI_W(epel, v, 12);
+PEL_UNI_W(epel, v, 16);
+PEL_UNI_W(epel, v, 24);
+PEL_UNI_W(epel, v, 32);
+PEL_UNI_W(epel, v, 48);
+PEL_UNI_W(epel, v, 64);
+
+PEL_UNI_W(epel, h, 6);
+PEL_UNI_W(epel, h, 8);
+PEL_UNI_W(epel, h, 12);
+PEL_UNI_W(epel, h, 16);
+PEL_UNI_W(epel, h, 24);
+PEL_UNI_W(epel, h, 32);
+PEL_UNI_W(epel, h, 48);
+PEL_UNI_W(epel, h, 64);
+
+#undef PEL_UNI_W
+
+#define UNI_MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lasx(uint8_t *dst,         \
+                                                        ptrdiff_t dst_stride, \
+                                                        const uint8_t *src,   \
+                                                        ptrdiff_t src_stride, \
+                                                        int height,           \
+                                                        intptr_t mx,          \
+                                                        intptr_t my,          \
+                                                        int width)
+UNI_MC(qpel, h, 12);
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
+UNI_MC(qpel, h, 64);
+
+#undef UNI_MC
+
+#define BI_MC(PEL, DIR, WIDTH)                                                \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lasx(uint8_t *dst,          \
+                                                       ptrdiff_t dst_stride,  \
+                                                       const uint8_t *src,    \
+                                                       ptrdiff_t src_stride,  \
+                                                       const int16_t *src_16bit, \
+                                                       int height,            \
+                                                       intptr_t mx,           \
+                                                       intptr_t my,           \
+                                                       int width)
+BI_MC(epel, h, 12);
+BI_MC(epel, h, 16);
+BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
+
+#undef BI_MC
+
+void ff_hevc_idct_32x32_lasx(int16_t *coeffs, int col_limit);
+
+#endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+
+#include "libavcodec/hevc/dsp.h"
+
+#define MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst,          \
+                                                   const uint8_t *src,    \
+                                                   ptrdiff_t src_stride,  \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)
+
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 32);
+
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+
+#undef MC
+
+#define BI_MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      const uint8_t *src,    \
+                                                      ptrdiff_t src_stride,  \
+                                                      const int16_t *src_16bit, \
+                                                      int height,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 4);
+BI_MC(epel, h, 6);
+BI_MC(epel, h, 8);
+BI_MC(epel, h, 12);
+BI_MC(epel, h, 16);
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
+
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+
+#undef BI_MC
+
+#define UNI_MC(PEL, DIR, WIDTH)                                              \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,         \
+                                                       ptrdiff_t dst_stride, \
+                                                       const uint8_t *src,   \
+                                                       ptrdiff_t src_stride, \
+                                                       int height,           \
+                                                       intptr_t mx,          \
+                                                       intptr_t my,          \
+                                                       int width)
+UNI_MC(qpel, h, 4);
+UNI_MC(qpel, h, 6);
+UNI_MC(qpel, h, 8);
+UNI_MC(qpel, h, 12);
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+
+#undef UNI_MC
+
+#define UNI_W_MC(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         const uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int weight,    \
+                                                         int offset,    \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+#undef UNI_W_MC
+
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, const int32_t *tc,
+                                      const uint8_t *p_is_pcm, const uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, const int32_t *tc,
+                                      const uint8_t *p_is_pcm, const uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        const int32_t *tc, const uint8_t *p_is_pcm,
+                                        const uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        const int32_t *tc, const uint8_t *p_is_pcm,
+                                        const uint8_t *q_is_pcm);
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   const int16_t *sao_offset_val,
+                                   int eo, int width, int height);
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+
+void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                      \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         const uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int wx,        \
+                                                         int ox,        \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+PEL_UNI_W(qpel, v, 4);
+PEL_UNI_W(qpel, v, 6);
+PEL_UNI_W(qpel, v, 8);
+PEL_UNI_W(qpel, v, 12);
+PEL_UNI_W(qpel, v, 16);
+PEL_UNI_W(qpel, v, 24);
+PEL_UNI_W(qpel, v, 32);
+PEL_UNI_W(qpel, v, 48);
+PEL_UNI_W(qpel, v, 64);
+
+PEL_UNI_W(qpel, h, 4);
+PEL_UNI_W(qpel, h, 6);
+PEL_UNI_W(qpel, h, 8);
+PEL_UNI_W(qpel, h, 12);
+PEL_UNI_W(qpel, h, 16);
+PEL_UNI_W(qpel, h, 24);
+PEL_UNI_W(qpel, h, 32);
+PEL_UNI_W(qpel, h, 48);
+PEL_UNI_W(qpel, h, 64);
+
+PEL_UNI_W(epel, hv, 4);
+PEL_UNI_W(epel, hv, 6);
+PEL_UNI_W(epel, hv, 8);
+PEL_UNI_W(epel, hv, 12);
+PEL_UNI_W(epel, hv, 16);
+PEL_UNI_W(epel, hv, 24);
+PEL_UNI_W(epel, hv, 32);
+PEL_UNI_W(epel, hv, 48);
+PEL_UNI_W(epel, hv, 64);
+
+PEL_UNI_W(epel, h, 4);
+PEL_UNI_W(epel, h, 6);
+PEL_UNI_W(epel, h, 8);
+PEL_UNI_W(epel, h, 12);
+PEL_UNI_W(epel, h, 16);
+PEL_UNI_W(epel, h, 24);
+PEL_UNI_W(epel, h, 32);
+PEL_UNI_W(epel, h, 48);
+PEL_UNI_W(epel, h, 64);
+
+PEL_UNI_W(epel, v, 4);
+PEL_UNI_W(epel, v, 6);
+PEL_UNI_W(epel, v, 8);
+PEL_UNI_W(epel, v, 12);
+PEL_UNI_W(epel, v, 16);
+PEL_UNI_W(epel, v, 24);
+PEL_UNI_W(epel, v, 32);
+PEL_UNI_W(epel, v, 48);
+PEL_UNI_W(epel, v, 64);
+
+#undef PEL_UNI_W
+
+#endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/hpeldsp.h"
+#include "libavcodec/loongarch/hpeldsp_lasx.h"
+
+void ff_hpeldsp_init_loongarch(HpelDSPContext *c, int flags)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lasx(cpu_flags)) {
+        c->put_pixels_tab[0][0] = ff_put_pixels16_8_lsx;
+        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_8_lasx;
+        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_8_lasx;
+        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_8_lasx;
+
+        c->put_pixels_tab[1][0] = ff_put_pixels8_8_lasx;
+        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_8_lasx;
+        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_8_lasx;
+        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_8_lasx;
+        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_8_lsx;
+        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_8_lasx;
+        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_8_lasx;
+        c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_8_lasx;
+
+        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_8_lasx;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_8_lasx;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_8_lasx;
+        c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_8_lasx;
+    }
+}
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HPELDSP_LASX_H
+#define AVCODEC_LOONGARCH_HPELDSP_LASX_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavutil/attributes.h"
+
+void ff_put_pixels8_8_lasx(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_8_lsx(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_xy2_8_lasx(uint8_t *block,
+                                       const uint8_t *pixels,
+                                       ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
+                                ptrdiff_t line_size, int h);
+#endif
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "idctdsp_loongarch.h"
+#include "libavcodec/xvididct.h"
+
+av_cold void ff_idctdsp_init_loongarch(IDCTDSPContext *c, AVCodecContext *avctx,
+                                       unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lasx(cpu_flags)) {
+        if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
+            (avctx->bits_per_raw_sample != 10) &&
+            (avctx->bits_per_raw_sample != 12) &&
+            (avctx->idct_algo == FF_IDCT_AUTO)) {
+                    c->idct_put = ff_simple_idct_put_lasx;
+                    c->idct_add = ff_simple_idct_add_lasx;
+                    c->idct = ff_simple_idct_lasx;
+                    c->perm_type = FF_IDCT_PERM_NONE;
+        }
+        c->put_pixels_clamped = ff_put_pixels_clamped_lasx;
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_lasx;
+        c->add_pixels_clamped = ff_add_pixels_clamped_lasx;
+    }
+}
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void ff_put_pixels_clamped_lasx(const int16_t *block,
+                                uint8_t *restrict pixels,
+                                ptrdiff_t stride)
+{
+    __m256i b0, b1, b2, b3;
+    __m256i temp0, temp1;
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_4x = stride << 2;
+    ptrdiff_t stride_3x = stride_2x + stride;
+
+    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
+              b0, b1, b2, b3);
+    DUP4_ARG1(__lasx_xvclip255_h, b0, b1, b2, b3, b0, b1, b2, b3);
+    DUP2_ARG2(__lasx_xvpickev_b, b1, b0, b3, b2, temp0, temp1);
+    __lasx_xvstelm_d(temp0, pixels, 0, 0);
+    __lasx_xvstelm_d(temp0, pixels + stride, 0, 2);
+    __lasx_xvstelm_d(temp0, pixels + stride_2x, 0, 1);
+    __lasx_xvstelm_d(temp0, pixels + stride_3x, 0, 3);
+    pixels += stride_4x;
+    __lasx_xvstelm_d(temp1, pixels, 0, 0);
+    __lasx_xvstelm_d(temp1, pixels + stride, 0, 2);
+    __lasx_xvstelm_d(temp1, pixels + stride_2x, 0, 1);
+    __lasx_xvstelm_d(temp1, pixels + stride_3x, 0, 3);
+}
+
+void ff_put_signed_pixels_clamped_lasx(const int16_t *block,
+                                       uint8_t *restrict pixels,
+                                       ptrdiff_t stride)
+{
+    __m256i b0, b1, b2, b3;
+    __m256i temp0, temp1;
+    __m256i const_128 = {0x0080008000800080, 0x0080008000800080,
+                         0x0080008000800080, 0x0080008000800080};
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_4x = stride << 2;
+    ptrdiff_t stride_3x = stride_2x + stride;
+
+    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
+              b0, b1, b2, b3);
+    DUP4_ARG2(__lasx_xvadd_h, b0, const_128, b1, const_128, b2, const_128,
+              b3, const_128, b0, b1, b2, b3);
+    DUP4_ARG1(__lasx_xvclip255_h, b0, b1, b2, b3, b0, b1, b2, b3);
+    DUP2_ARG2(__lasx_xvpickev_b, b1, b0, b3, b2, temp0, temp1);
+    __lasx_xvstelm_d(temp0, pixels, 0, 0);
+    __lasx_xvstelm_d(temp0, pixels + stride, 0, 2);
+    __lasx_xvstelm_d(temp0, pixels + stride_2x, 0, 1);
+    __lasx_xvstelm_d(temp0, pixels + stride_3x, 0, 3);
+    pixels += stride_4x;
+    __lasx_xvstelm_d(temp1, pixels, 0, 0);
+    __lasx_xvstelm_d(temp1, pixels + stride, 0, 2);
+    __lasx_xvstelm_d(temp1, pixels + stride_2x, 0, 1);
+    __lasx_xvstelm_d(temp1, pixels + stride_3x, 0, 3);
+}
+
+void ff_add_pixels_clamped_lasx(const int16_t *block,
+                                uint8_t *restrict pixels,
+                                ptrdiff_t stride)
+{
+    __m256i b0, b1, b2, b3;
+    __m256i p0, p1, p2, p3, p4, p5, p6, p7;
+    __m256i temp0, temp1, temp2, temp3;
+    uint8_t *pix = pixels;
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_4x = stride << 2;
+    ptrdiff_t stride_3x = stride_2x + stride;
+
+    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
+              b0, b1, b2, b3);
+    p0   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p1   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p2   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p3   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p4   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p5   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p6   = __lasx_xvldrepl_d(pix, 0);
+    pix += stride;
+    p7   = __lasx_xvldrepl_d(pix, 0);
+    DUP4_ARG3(__lasx_xvpermi_q, p1, p0, 0x20, p3, p2, 0x20, p5, p4, 0x20,
+              p7, p6, 0x20, temp0, temp1, temp2, temp3);
+    DUP4_ARG2(__lasx_xvaddw_h_h_bu, b0, temp0, b1, temp1, b2, temp2, b3, temp3,
+              temp0, temp1, temp2, temp3);
+    DUP4_ARG1(__lasx_xvclip255_h, temp0, temp1, temp2, temp3,
+              temp0, temp1, temp2, temp3);
+    DUP2_ARG2(__lasx_xvpickev_b, temp1, temp0, temp3, temp2, temp0, temp1);
+    __lasx_xvstelm_d(temp0, pixels, 0, 0);
+    __lasx_xvstelm_d(temp0, pixels + stride, 0, 2);
+    __lasx_xvstelm_d(temp0, pixels + stride_2x, 0, 1);
+    __lasx_xvstelm_d(temp0, pixels + stride_3x, 0, 3);
+    pixels += stride_4x;
+    __lasx_xvstelm_d(temp1, pixels, 0, 0);
+    __lasx_xvstelm_d(temp1, pixels + stride, 0, 2);
+    __lasx_xvstelm_d(temp1, pixels + stride_2x, 0, 1);
+    __lasx_xvstelm_d(temp1, pixels + stride_3x, 0, 3);
+}
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_IDCTDSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_IDCTDSP_LOONGARCH_H
+
+#include <stdint.h>
+#include "libavcodec/mpegvideo.h"
+
+void ff_simple_idct_lasx(int16_t *block);
+void ff_simple_idct_put_lasx(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+void ff_simple_idct_add_lasx(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+void ff_put_pixels_clamped_lasx(const int16_t *block,
+                                uint8_t *restrict pixels,
+                                ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_lasx(const int16_t *block,
+                                       uint8_t *restrict pixels,
+                                       ptrdiff_t line_size);
+void ff_add_pixels_clamped_lasx(const int16_t *block,
+                                uint8_t *restrict pixels,
+                                ptrdiff_t line_size);
+
+#endif /* AVCODEC_LOONGARCH_IDCTDSP_LOONGARCH_H */
@@ -0,0 +1,945 @@
+/*
+ * Loongson asm helper.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ *                Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 2
+#define LML_VERSION_MICRO 0
+
+/*
+ *============================================================================
+ * macros for specific projetc, set them as needed.
+ * Following LoongML macros for your reference.
+ *============================================================================
+ */
+#define ASM_PREF
+#define DEFAULT_ALIGN    5
+
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+    jirl    $r0, $r1, 0x0
+    .size ASM_PREF\name, . - ASM_PREF\name
+    .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type  ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
+
+/**
+ *  Attention: If align is not zero, the macro will use
+ *  t7 until the end of function
+ */
+.macro alloc_stack size, align=0
+.if \align
+    .macro clean_stack
+        add.d   sp, sp, t7
+    .endm
+    addi.d  sp, sp, - \size
+    andi.d  t7, sp, \align - 1
+    sub.d   sp, sp, t7
+    addi.d  t7, t7, \size
+.else
+    .macro clean_stack
+        addi.d  sp, sp, \size
+    .endm
+    addi.d  sp, sp, - \size
+.endif
+.endm
+
+.macro  const name, align=DEFAULT_ALIGN
+    .macro endconst
+    .size  \name, . - \name
+    .purgem endconst
+    .endm
+.section .rodata
+.align   \align
+\name:
+.endm
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp   $sp
+#define ra   $ra
+
+#define f0  $f0
+#define f1  $f1
+#define f2  $f2
+#define f3  $f3
+#define f4  $f4
+#define f5  $f5
+#define f6  $f6
+#define f7  $f7
+#define f8  $f8
+#define f9  $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd
+ *               Return Type - halfword
+ */
+.macro vdp2.h.bu vd, vj, vk
+    vmulwev.h.bu      \vd,    \vj,    \vk
+    vmaddwod.h.bu     \vd,    \vj,    \vk
+.endm
+
+.macro vdp2.h.bu.b vd, vj, vk
+    vmulwev.h.bu.b    \vd,    \vj,    \vk
+    vmaddwod.h.bu.b   \vd,    \vj,    \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+    vmulwev.w.h       \vd,    \vj,    \vk
+    vmaddwod.w.h      \vd,    \vj,    \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+    xvmulwev.h.bu    \xd,    \xj,    \xk
+    xvmaddwod.h.bu   \xd,    \xj,    \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+    xvmulwev.h.bu.b    \xd,  \xj,    \xk
+    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+    xvmulwev.w.h       \xd,  \xj,    \xk
+    xvmaddwod.w.h      \xd,  \xj,    \xk
+.endm
+
+/*
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd
+ *               Return Type - twice size of input
+ */
+.macro vdp2add.h.bu vd, vj, vk
+    vmaddwev.h.bu     \vd,    \vj,    \vk
+    vmaddwod.h.bu     \vd,    \vj,    \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+    vmaddwev.h.bu.b   \vd,    \vj,    \vk
+    vmaddwod.h.bu.b   \vd,    \vj,    \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+    vmaddwev.w.h      \vd,    \vj,    \vk
+    vmaddwod.w.h      \vd,    \vj,    \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+    xvmaddwev.h.bu.b   \xd,  \xj,    \xk
+    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+    xvmaddwev.w.h      \xd,  \xj,    \xk
+    xvmaddwod.w.h      \xd,  \xj,    \xk
+.endm
+
+/*
+ * Description : Range each element of vector
+ * clip: vj > vk ? vj : vk && vj < va ? vj : va
+ * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
+ */
+.macro vclip.h  vd,  vj, vk, va
+    vmax.h    \vd,  \vj,   \vk
+    vmin.h    \vd,  \vd,   \va
+.endm
+
+.macro vclip255.w  vd, vj
+    vmaxi.w   \vd,   \vj,  0
+    vsat.wu   \vd,   \vd,  7
+.endm
+
+.macro vclip255.h  vd, vj
+    vmaxi.h   \vd,   \vj,  0
+    vsat.hu   \vd,   \vd,  7
+.endm
+
+.macro xvclip.h  xd,  xj, xk, xa
+    xvmax.h    \xd,  \xj,   \xk
+    xvmin.h    \xd,  \xd,   \xa
+.endm
+
+.macro xvclip255.h  xd, xj
+    xvmaxi.h   \xd,   \xj,  0
+    xvsat.hu   \xd,   \xd,  7
+.endm
+
+.macro xvclip255.w  xd, xj
+    xvmaxi.w   \xd,   \xj,  0
+    xvsat.wu   \xd,   \xd,  7
+.endm
+
+/*
+ * Description : Store elements of vector
+ * vd : Data vector to be stroed
+ * rk : Address of data storage
+ * ra : Offset of address
+ * si : Index of data in vd
+ */
+.macro vstelmx.b vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.b   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.h   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.w   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.d  vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.d   \vd,  \rk,  0, \si
+.endm
+
+.macro vmov xd, xj
+    vor.v  \xd,  \xj,  \xj
+.endm
+
+.macro xmov xd, xj
+    xvor.v  \xd,  \xj,  \xj
+.endm
+
+.macro xvstelmx.d  xd, rk, ra, si
+    add.d      \rk, \rk,  \ra
+    xvstelm.d  \xd, \rk,  0, \si
+.endm
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 float, double, V128, v256 elements with stride.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    fld.s     \out0,    \src,    0
+    fldx.s    \out1,    \src,    \stride
+    fldx.s    \out2,    \src,    \stride2
+    fldx.s    \out3,    \src,    \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    fld.d     \out0,    \src,    0
+    fldx.d    \out1,    \src,    \stride
+    fldx.d    \out2,    \src,    \stride2
+    fldx.d    \out3,    \src,    \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    vld     \out0,    \src,    0
+    vldx    \out1,    \src,    \stride
+    vldx    \out2,    \src,    \stride2
+    vldx    \out3,    \src,    \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    xvld    \out0,    \src,    0
+    xvldx   \out1,    \src,    \stride
+    xvldx   \out2,    \src,    \stride2
+    xvldx   \out3,    \src,    \stride3
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                          tmp0, tmp1
+    vilvl.h   \tmp0,  \in1,   \in0
+    vilvl.h   \tmp1,  \in3,   \in2
+    vilvl.w   \out0,  \tmp1,  \tmp0
+    vilvh.w   \out2,  \tmp1,  \tmp0
+    vilvh.d   \out1,  \out0,  \out0
+    vilvh.d   \out3,  \out0,  \out2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                          _tmp0, _tmp1
+
+    vilvl.w    \_tmp0,   \_in1,    \_in0
+    vilvh.w    \_out1,   \_in1,    \_in0
+    vilvl.w    \_tmp1,   \_in3,    \_in2
+    vilvh.w    \_out3,   \_in3,    \_in2
+
+    vilvl.d    \_out0,   \_tmp1,   \_tmp0
+    vilvl.d    \_out2,   \_out3,   \_out1
+    vilvh.d    \_out3,   \_out3,   \_out1
+    vilvh.d    \_out1,   \_tmp1,   \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
+                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+                          tmp3, tmp4, tmp5, tmp6, tmp7
+    vilvl.h      \tmp0,    \in6,   \in4
+    vilvl.h      \tmp1,    \in7,   \in5
+    vilvl.h      \tmp2,    \in2,   \in0
+    vilvl.h      \tmp3,    \in3,   \in1
+
+    vilvl.h      \tmp4,    \tmp1,  \tmp0
+    vilvh.h      \tmp5,    \tmp1,  \tmp0
+    vilvl.h      \tmp6,    \tmp3,  \tmp2
+    vilvh.h      \tmp7,    \tmp3,  \tmp2
+
+    vilvh.h      \tmp0,    \in6,   \in4
+    vilvh.h      \tmp1,    \in7,   \in5
+    vilvh.h      \tmp2,    \in2,   \in0
+    vilvh.h      \tmp3,    \in3,   \in1
+
+    vpickev.d    \out0,    \tmp4,  \tmp6
+    vpickod.d    \out1,    \tmp4,  \tmp6
+    vpickev.d    \out2,    \tmp5,  \tmp7
+    vpickod.d    \out3,    \tmp5,  \tmp7
+
+    vilvl.h      \tmp4,    \tmp1,  \tmp0
+    vilvh.h      \tmp5,    \tmp1,  \tmp0
+    vilvl.h      \tmp6,    \tmp3,  \tmp2
+    vilvh.h      \tmp7,    \tmp3,  \tmp2
+
+    vpickev.d    \out4,    \tmp4,  \tmp6
+    vpickod.d    \out5,    \tmp4,  \tmp6
+    vpickev.d    \out6,    \tmp5,  \tmp7
+    vpickod.d    \out7,    \tmp5,  \tmp7
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3, out4, out5, out6, out7,\
+                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    xvilvl.b   \tmp0,    \in2,     \in0
+    xvilvl.b   \tmp1,    \in3,     \in1
+    xvilvl.b   \tmp2,    \in6,     \in4
+    xvilvl.b   \tmp3,    \in7,     \in5
+    xvilvl.b   \tmp4,    \in10,    \in8
+    xvilvl.b   \tmp5,    \in11,    \in9
+    xvilvl.b   \tmp6,    \in14,    \in12
+    xvilvl.b   \tmp7,    \in15,    \in13
+    xvilvl.b   \out0,    \tmp1,    \tmp0
+    xvilvh.b   \out1,    \tmp1,    \tmp0
+    xvilvl.b   \out2,    \tmp3,    \tmp2
+    xvilvh.b   \out3,    \tmp3,    \tmp2
+    xvilvl.b   \out4,    \tmp5,    \tmp4
+    xvilvh.b   \out5,    \tmp5,    \tmp4
+    xvilvl.b   \out6,    \tmp7,    \tmp6
+    xvilvh.b   \out7,    \tmp7,    \tmp6
+    xvilvl.w   \tmp0,    \out2,    \out0
+    xvilvh.w   \tmp2,    \out2,    \out0
+    xvilvl.w   \tmp4,    \out3,    \out1
+    xvilvh.w   \tmp6,    \out3,    \out1
+    xvilvl.w   \tmp1,    \out6,    \out4
+    xvilvh.w   \tmp3,    \out6,    \out4
+    xvilvl.w   \tmp5,    \out7,    \out5
+    xvilvh.w   \tmp7,    \out7,    \out5
+    xvilvl.d   \out0,    \tmp1,    \tmp0
+    xvilvh.d   \out1,    \tmp1,    \tmp0
+    xvilvl.d   \out2,    \tmp3,    \tmp2
+    xvilvh.d   \out3,    \tmp3,    \tmp2
+    xvilvl.d   \out4,    \tmp5,    \tmp4
+    xvilvh.d   \out5,    \tmp5,    \tmp4
+    xvilvl.d   \out6,    \tmp7,    \tmp6
+    xvilvh.d   \out7,    \tmp7,    \tmp6
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
+                           in8, in9, in10, in11, in12, in13, in14, in15,  \
+                           out0, out1, out2, out3, out4, out5, out6, out7,\
+                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    vilvl.b   \tmp0,    \in2,     \in0
+    vilvl.b   \tmp1,    \in3,     \in1
+    vilvl.b   \tmp2,    \in6,     \in4
+    vilvl.b   \tmp3,    \in7,     \in5
+    vilvl.b   \tmp4,    \in10,    \in8
+    vilvl.b   \tmp5,    \in11,    \in9
+    vilvl.b   \tmp6,    \in14,    \in12
+    vilvl.b   \tmp7,    \in15,    \in13
+
+    vilvl.b   \out0,    \tmp1,    \tmp0
+    vilvh.b   \out1,    \tmp1,    \tmp0
+    vilvl.b   \out2,    \tmp3,    \tmp2
+    vilvh.b   \out3,    \tmp3,    \tmp2
+    vilvl.b   \out4,    \tmp5,    \tmp4
+    vilvh.b   \out5,    \tmp5,    \tmp4
+    vilvl.b   \out6,    \tmp7,    \tmp6
+    vilvh.b   \out7,    \tmp7,    \tmp6
+    vilvl.w   \tmp0,    \out2,    \out0
+    vilvh.w   \tmp2,    \out2,    \out0
+    vilvl.w   \tmp4,    \out3,    \out1
+    vilvh.w   \tmp6,    \out3,    \out1
+    vilvl.w   \tmp1,    \out6,    \out4
+    vilvh.w   \tmp3,    \out6,    \out4
+    vilvl.w   \tmp5,    \out7,    \out5
+    vilvh.w   \tmp7,    \out7,    \out5
+    vilvl.d   \out0,    \tmp1,    \tmp0
+    vilvh.d   \out1,    \tmp1,    \tmp0
+    vilvl.d   \out2,    \tmp3,    \tmp2
+    vilvh.d   \out3,    \tmp3,    \tmp2
+    vilvl.d   \out4,    \tmp5,    \tmp4
+    vilvh.d   \out5,    \tmp5,    \tmp4
+    vilvl.d   \out6,    \tmp7,    \tmp6
+    vilvh.d   \out7,    \tmp7,    \tmp6
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                           tmp0, tmp1
+    xvilvl.h   \tmp0,  \in1,   \in0
+    xvilvl.h   \tmp1,  \in3,   \in2
+    xvilvl.w   \out0,  \tmp1,  \tmp0
+    xvilvh.w   \out2,  \tmp1,  \tmp0
+    xvilvh.d   \out1,  \out0,  \out0
+    xvilvh.d   \out3,  \out0,  \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                           tmp0, tmp1
+    xvilvl.h      \tmp0,    \in2,   \in0
+    xvilvl.h      \tmp1,    \in3,   \in1
+    xvilvl.h      \out2,    \tmp1,  \tmp0
+    xvilvh.h      \out3,    \tmp1,  \tmp0
+
+    xvilvl.d      \out0,    \out2,  \out2
+    xvilvh.d      \out1,    \out2,  \out2
+    xvilvl.d      \out2,    \out3,  \out3
+    xvilvh.d      \out3,    \out3,  \out3
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
+                           out0, out1, out2, out3, out4, out5, out6, out7, \
+                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    xvilvl.h     \tmp0,   \in6,     \in4
+    xvilvl.h     \tmp1,   \in7,     \in5
+    xvilvl.h     \tmp2,   \in2,     \in0
+    xvilvl.h     \tmp3,   \in3,     \in1
+
+    xvilvl.h     \tmp4,   \tmp1,    \tmp0
+    xvilvh.h     \tmp5,   \tmp1,    \tmp0
+    xvilvl.h     \tmp6,   \tmp3,    \tmp2
+    xvilvh.h     \tmp7,   \tmp3,    \tmp2
+
+    xvilvh.h     \tmp0,   \in6,     \in4
+    xvilvh.h     \tmp1,   \in7,     \in5
+    xvilvh.h     \tmp2,   \in2,     \in0
+    xvilvh.h     \tmp3,   \in3,     \in1
+
+    xvpickev.d   \out0,   \tmp4,    \tmp6
+    xvpickod.d   \out1,   \tmp4,    \tmp6
+    xvpickev.d   \out2,   \tmp5,    \tmp7
+    xvpickod.d   \out3,   \tmp5,    \tmp7
+
+    xvilvl.h     \tmp4,   \tmp1,    \tmp0
+    xvilvh.h     \tmp5,   \tmp1,    \tmp0
+    xvilvl.h     \tmp6,   \tmp3,    \tmp2
+    xvilvh.h     \tmp7,   \tmp3,    \tmp2
+
+    xvpickev.d   \out4,   \tmp4,    \tmp6
+    xvpickod.d   \out5,   \tmp4,    \tmp6
+    xvpickev.d   \out6,   \tmp5,    \tmp7
+    xvpickod.d   \out7,   \tmp5,    \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                             tmp0, tmp1, tmp2
+    xvilvh.h   \tmp1,    \in0,     \in1
+    xvilvl.h   \out1,    \in0,     \in1
+    xvilvh.h   \tmp0,    \in2,     \in3
+    xvilvl.h   \out3,    \in2,     \in3
+
+    xvilvh.w   \tmp2,    \out3,    \out1
+    xvilvl.w   \out3,    \out3,    \out1
+
+    xvilvl.w   \out2,    \tmp0,    \tmp1
+    xvilvh.w   \tmp1,    \tmp0,    \tmp1
+
+    xvilvh.d   \out0,    \out2,    \out3
+    xvilvl.d   \out2,    \out2,    \out3
+    xvilvh.d   \out1,    \tmp1,    \tmp2
+    xvilvl.d   \out3,    \tmp1,    \tmp2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13, 1,5, 9,13
+ *               5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14, 2,6,10,14
+ *               9,10,11,12,  9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ *              13,14,15,16, 13,14,15,16        4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                           _tmp0, _tmp1
+
+    xvilvl.w    \_tmp0,   \_in1,    \_in0
+    xvilvh.w    \_out1,   \_in1,    \_in0
+    xvilvl.w    \_tmp1,   \_in3,    \_in2
+    xvilvh.w    \_out3,   \_in3,    \_in2
+
+    xvilvl.d    \_out0,   \_tmp1,   \_tmp0
+    xvilvl.d    \_out2,   \_out3,   \_out1
+    xvilvh.d    \_out3,   \_out3,   \_out1
+    xvilvh.d    \_out1,   \_tmp1,   \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
+                           _tmp0, _tmp1, _tmp2, _tmp3
+    xvilvl.w    \_tmp0,   \_in2,    \_in0
+    xvilvl.w    \_tmp1,   \_in3,    \_in1
+    xvilvh.w    \_tmp2,   \_in2,    \_in0
+    xvilvh.w    \_tmp3,   \_in3,    \_in1
+    xvilvl.w    \_out0,   \_tmp1,   \_tmp0
+    xvilvh.w    \_out1,   \_tmp1,   \_tmp0
+    xvilvl.w    \_out2,   \_tmp3,   \_tmp2
+    xvilvh.w    \_out3,   \_tmp3,   \_tmp2
+
+    xvilvl.w    \_tmp0,   \_in6,    \_in4
+    xvilvl.w    \_tmp1,   \_in7,    \_in5
+    xvilvh.w    \_tmp2,   \_in6,    \_in4
+    xvilvh.w    \_tmp3,   \_in7,    \_in5
+    xvilvl.w    \_out4,   \_tmp1,   \_tmp0
+    xvilvh.w    \_out5,   \_tmp1,   \_tmp0
+    xvilvl.w    \_out6,   \_tmp3,   \_tmp2
+    xvilvh.w    \_out7,   \_tmp3,   \_tmp2
+
+    xmov        \_tmp0,   \_out0
+    xmov        \_tmp1,   \_out1
+    xmov        \_tmp2,   \_out2
+    xmov        \_tmp3,   \_out3
+    xvpermi.q   \_out0,   \_out4,   0x02
+    xvpermi.q   \_out1,   \_out5,   0x02
+    xvpermi.q   \_out2,   \_out6,   0x02
+    xvpermi.q   \_out3,   \_out7,   0x02
+    xvpermi.q   \_out4,   \_tmp0,   0x31
+    xvpermi.q   \_out5,   \_tmp1,   0x31
+    xvpermi.q   \_out6,   \_tmp2,   0x31
+    xvpermi.q   \_out7,   \_tmp3,   0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                           _tmp0, _tmp1
+    xvilvl.d    \_tmp0,   \_in1,    \_in0
+    xvilvh.d    \_out1,   \_in1,    \_in0
+    xvilvh.d    \_tmp1,   \_in3,    \_in2
+    xvilvl.d    \_out2,   \_in3,    \_in2
+
+    xvor.v      \_out0,   \_tmp0,   \_tmp0
+    xvor.v      \_out3,   \_tmp1,   \_tmp1
+
+    xvpermi.q   \_out0,   \_out2,   0x02
+    xvpermi.q   \_out2,   \_tmp0,   0x31
+    xvpermi.q   \_out3,   \_out1,   0x31
+    xvpermi.q   \_out1,   \_tmp1,   0x02
+.endm
+
+/*
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LSX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ */
+.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.b   \_out0,   \_in0,   \_in3
+    vadd.b   \_out1,   \_in1,   \_in2
+    vsub.b   \_out2,   \_in1,   \_in2
+    vsub.b   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.h   \_out0,   \_in0,   \_in3
+    vadd.h   \_out1,   \_in1,   \_in2
+    vsub.h   \_out2,   \_in1,   \_in2
+    vsub.h   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.w   \_out0,   \_in0,   \_in3
+    vadd.w   \_out1,   \_in1,   \_in2
+    vsub.w   \_out2,   \_in1,   \_in2
+    vsub.w   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.d   \_out0,   \_in0,   \_in3
+    vadd.d   \_out1,   \_in1,   \_in2
+    vsub.d   \_out2,   \_in1,   \_in2
+    vsub.d   \_out3,   \_in0,   \_in3
+.endm
+
+.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.b   \_out0,   \_in0,   \_in3
+    xvadd.b   \_out1,   \_in1,   \_in2
+    xvsub.b   \_out2,   \_in1,   \_in2
+    xvsub.b   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.h   \_out0,   \_in0,   \_in3
+    xvadd.h   \_out1,   \_in1,   \_in2
+    xvsub.h   \_out2,   \_in1,   \_in2
+    xvsub.h   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.w   \_out0,   \_in0,   \_in3
+    xvadd.w   \_out1,   \_in1,   \_in2
+    xvsub.w   \_out2,   \_in1,   \_in2
+    xvsub.w   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.d   \_out0,   \_in0,   \_in3
+    xvadd.d   \_out1,   \_in1,   \_in2
+    xvsub.d   \_out2,   \_in1,   \_in2
+    xvsub.d   \_out3,   \_in0,   \_in3
+.endm
+
+/*
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ */
+.macro LSX_BUTTERFLY_8_B _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.b    \_out0,    \_in0,    \_in7
+    vadd.b    \_out1,    \_in1,    \_in6
+    vadd.b    \_out2,    \_in2,    \_in5
+    vadd.b    \_out3,    \_in3,    \_in4
+    vsub.b    \_out4,    \_in3,    \_in4
+    vsub.b    \_out5,    \_in2,    \_in5
+    vsub.b    \_out6,    \_in1,    \_in6
+    vsub.b    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_H _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.h    \_out0,    \_in0,    \_in7
+    vadd.h    \_out1,    \_in1,    \_in6
+    vadd.h    \_out2,    \_in2,    \_in5
+    vadd.h    \_out3,    \_in3,    \_in4
+    vsub.h    \_out4,    \_in3,    \_in4
+    vsub.h    \_out5,    \_in2,    \_in5
+    vsub.h    \_out6,    \_in1,    \_in6
+    vsub.h    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_W _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.w    \_out0,    \_in0,    \_in7
+    vadd.w    \_out1,    \_in1,    \_in6
+    vadd.w    \_out2,    \_in2,    \_in5
+    vadd.w    \_out3,    \_in3,    \_in4
+    vsub.w    \_out4,    \_in3,    \_in4
+    vsub.w    \_out5,    \_in2,    \_in5
+    vsub.w    \_out6,    \_in1,    \_in6
+    vsub.w    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_D _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.d    \_out0,    \_in0,    \_in7
+    vadd.d    \_out1,    \_in1,    \_in6
+    vadd.d    \_out2,    \_in2,    \_in5
+    vadd.d    \_out3,    \_in3,    \_in4
+    vsub.d    \_out4,    \_in3,    \_in4
+    vsub.d    \_out5,    \_in2,    \_in5
+    vsub.d    \_out6,    \_in1,    \_in6
+    vsub.d    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_B _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.b    \_out0,    \_in0,    \_in7
+    xvadd.b    \_out1,    \_in1,    \_in6
+    xvadd.b    \_out2,    \_in2,    \_in5
+    xvadd.b    \_out3,    \_in3,    \_in4
+    xvsub.b    \_out4,    \_in3,    \_in4
+    xvsub.b    \_out5,    \_in2,    \_in5
+    xvsub.b    \_out6,    \_in1,    \_in6
+    xvsub.b    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_H _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.h    \_out0,    \_in0,    \_in7
+    xvadd.h    \_out1,    \_in1,    \_in6
+    xvadd.h    \_out2,    \_in2,    \_in5
+    xvadd.h    \_out3,    \_in3,    \_in4
+    xvsub.h    \_out4,    \_in3,    \_in4
+    xvsub.h    \_out5,    \_in2,    \_in5
+    xvsub.h    \_out6,    \_in1,    \_in6
+    xvsub.h    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_W _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.w    \_out0,    \_in0,    \_in7
+    xvadd.w    \_out1,    \_in1,    \_in6
+    xvadd.w    \_out2,    \_in2,    \_in5
+    xvadd.w    \_out3,    \_in3,    \_in4
+    xvsub.w    \_out4,    \_in3,    \_in4
+    xvsub.w    \_out5,    \_in2,    \_in5
+    xvsub.w    \_out6,    \_in1,    \_in6
+    xvsub.w    \_out7,    \_in0,    \_in7
+.endm
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "idctdsp_loongarch.h"
+
+#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
+{                                                                              \
+    __m256i temp_0, temp_1, temp_2, temp_3;                                    \
+    __m256i temp_4, temp_5, temp_6, temp_7;                                    \
+    DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
+              0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3);         \
+    DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
+    DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
+    DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2);  \
+    DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3);  \
+}
+
+#define LASX_IDCTROWCONDDC                                                     \
+    const_val  = 16383 * ((1 << 19) / 16383);                                  \
+    const_val1 = __lasx_xvreplgr2vr_w(const_val);                              \
+    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,          \
+              in0, in1, in2, in3);                                             \
+    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
+    a0 = __lasx_xvpermi_d(in0, 0xD8);                                          \
+    a0 = __lasx_vext2xv_w_h(a0);                                               \
+    temp  = __lasx_xvslli_w(a0, 3);                                            \
+    a1 = __lasx_xvpermi_d(in0, 0x8D);                                          \
+    a1 = __lasx_vext2xv_w_h(a1);                                               \
+    a2 = __lasx_xvpermi_d(in1, 0xD8);                                          \
+    a2 = __lasx_vext2xv_w_h(a2);                                               \
+    a3 = __lasx_xvpermi_d(in1, 0x8D);                                          \
+    a3 = __lasx_vext2xv_w_h(a3);                                               \
+    b0 = __lasx_xvpermi_d(in2, 0xD8);                                          \
+    b0 = __lasx_vext2xv_w_h(b0);                                               \
+    b1 = __lasx_xvpermi_d(in2, 0x8D);                                          \
+    b1 = __lasx_vext2xv_w_h(b1);                                               \
+    b2 = __lasx_xvpermi_d(in3, 0xD8);                                          \
+    b2 = __lasx_vext2xv_w_h(b2);                                               \
+    b3 = __lasx_xvpermi_d(in3, 0x8D);                                          \
+    b3 = __lasx_vext2xv_w_h(b3);                                               \
+    select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3;                        \
+    select_vec = __lasx_xvslti_wu(select_vec, 1);                              \
+                                                                               \
+    DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5,               \
+              w2, w3, w4, w5);                                                 \
+    DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7);                    \
+    w1 = __lasx_xvrepl128vei_h(w1, 1);                                         \
+                                                                               \
+    /* part of FUNC6(idctRowCondDC) */                                         \
+    temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4);                          \
+    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
+    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
+    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
+    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
+    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
+                                                                               \
+    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
+    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
+    temp1 = __lasx_xvneg_h(w7);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
+    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
+    temp1 = __lasx_xvneg_h(w1);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
+    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
+    temp1 = __lasx_xvneg_h(w5);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
+    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
+                                                                               \
+    /* if (AV_RAN64A(row + 4)) */                                              \
+    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
+    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
+    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
+    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
+    temp1 = __lasx_xvneg_h(w4);                                                \
+    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
+    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
+    temp1 = __lasx_xvneg_h(w6);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
+    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
+                                                                               \
+    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
+    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
+    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
+    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
+    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
+    temp1 = __lasx_xvneg_h(w1);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
+    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
+                                                                               \
+    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
+              temp0, temp1, temp2, temp3);                                     \
+    DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
+              a0, a1, a2, a3);                                                 \
+    DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11,     \
+              temp0, temp1, temp2, temp3);                                     \
+    DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
+    DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp,         \
+              select_vec, temp2, temp, select_vec, temp3, temp, select_vec,    \
+              in0, in1, in2, in3);                                             \
+    DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp,               \
+              select_vec, a2, temp, select_vec, a3, temp, select_vec,          \
+              a0, a1, a2, a3);                                                 \
+    DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1,           \
+              in0, in1, in2, in3);                                             \
+    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,    \
+              in0, in1, in2, in3);                                             \
+
+#define LASX_IDCTCOLS                                                          \
+    /* part of FUNC6(idctSparaseCol) */                                        \
+    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
+    temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4);                          \
+    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
+    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
+    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
+    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
+    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
+                                                                               \
+    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
+    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
+    temp1 = __lasx_xvneg_h(w7);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
+    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
+    temp1 = __lasx_xvneg_h(w1);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
+    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
+    temp1 = __lasx_xvneg_h(w5);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
+    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
+                                                                               \
+    /* if (AV_RAN64A(row + 4)) */                                              \
+    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
+    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
+    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
+    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
+    temp1 = __lasx_xvneg_h(w4);                                                \
+    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
+    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
+    temp1 = __lasx_xvneg_h(w6);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
+    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
+                                                                               \
+    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
+    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
+    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
+    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
+    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
+    temp1 = __lasx_xvneg_h(w1);                                                \
+    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
+    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
+                                                                               \
+    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
+              temp0, temp1, temp2, temp3);                                     \
+    DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0,                  \
+              a3, a2, a1, a0);                                                 \
+    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3,  \
+              20, a0, a1, 20, in0, in1, in2, in3);                             \
+
+void ff_simple_idct_lasx(int16_t *block)
+{
+    int32_t const_val = 1 << 10;
+    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
+                  0x4B42539F58C50000, 0x11A822A332493FFF};
+    __m256i in0, in1, in2, in3;
+    __m256i w2, w3, w4, w5, w6, w7;
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i temp0, temp1, temp2, temp3;
+    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
+    __m256i const_val1, select_vec, temp;
+
+    LASX_IDCTROWCONDDC
+    LASX_IDCTCOLS
+    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
+              in0, in1, in2, in3);
+    __lasx_xvst(in0, block, 0);
+    __lasx_xvst(in1, block, 32);
+    __lasx_xvst(in2, block, 64);
+    __lasx_xvst(in3, block, 96);
+}
+
+void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
+                             int16_t *block)
+{
+    int32_t const_val = 1 << 10;
+    ptrdiff_t dst_stride_2x = dst_stride << 1;
+    ptrdiff_t dst_stride_4x = dst_stride << 2;
+    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
+    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
+                  0x4B42539F58C50000, 0x11A822A332493FFF};
+    __m256i in0, in1, in2, in3;
+    __m256i w2, w3, w4, w5, w6, w7;
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i temp0, temp1, temp2, temp3;
+    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
+    __m256i const_val1, select_vec, temp;
+
+    LASX_IDCTROWCONDDC
+    LASX_IDCTCOLS
+    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
+              in0, in1, in2, in3);
+    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
+    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
+    __lasx_xvstelm_d(in0, dst, 0, 0);
+    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
+    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
+    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
+    dst += dst_stride_4x;
+    __lasx_xvstelm_d(in1, dst, 0, 0);
+    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
+    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
+    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
+}
+
+void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
+                             int16_t *block)
+{
+    int32_t const_val = 1 << 10;
+    uint8_t *dst1 = dst;
+    ptrdiff_t dst_stride_2x = dst_stride << 1;
+    ptrdiff_t dst_stride_4x = dst_stride << 2;
+    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
+                  0x4B42539F58C50000, 0x11A822A332493FFF};
+    __m256i sh = {0x0003000200010000, 0x000B000A00090008,
+                  0x0007000600050004, 0x000F000E000D000C};
+    __m256i in0, in1, in2, in3;
+    __m256i w2, w3, w4, w5, w6, w7;
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i temp0, temp1, temp2, temp3;
+    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
+    __m256i const_val1, select_vec, temp;
+
+    LASX_IDCTROWCONDDC
+    LASX_IDCTCOLS
+    a0    = __lasx_xvldrepl_d(dst1, 0);
+    a0    = __lasx_vext2xv_hu_bu(a0);
+    dst1 += dst_stride;
+    a1    = __lasx_xvldrepl_d(dst1, 0);
+    a1    = __lasx_vext2xv_hu_bu(a1);
+    dst1 += dst_stride;
+    a2    = __lasx_xvldrepl_d(dst1, 0);
+    a2    = __lasx_vext2xv_hu_bu(a2);
+    dst1 += dst_stride;
+    a3    = __lasx_xvldrepl_d(dst1, 0);
+    a3    = __lasx_vext2xv_hu_bu(a3);
+    dst1 += dst_stride;
+    b0    = __lasx_xvldrepl_d(dst1, 0);
+    b0    = __lasx_vext2xv_hu_bu(b0);
+    dst1 += dst_stride;
+    b1    = __lasx_xvldrepl_d(dst1, 0);
+    b1    = __lasx_vext2xv_hu_bu(b1);
+    dst1 += dst_stride;
+    b2    = __lasx_xvldrepl_d(dst1, 0);
+    b2    = __lasx_vext2xv_hu_bu(b2);
+    dst1 += dst_stride;
+    b3    = __lasx_xvldrepl_d(dst1, 0);
+    b3    = __lasx_vext2xv_hu_bu(b3);
+    DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
+              temp0, temp1, temp2, temp3);
+    DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
+              in0, in1, in2, in3);
+    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
+              in0, in1, in2, in3);
+    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
+    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
+    __lasx_xvstelm_d(in0, dst, 0, 0);
+    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
+    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
+    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
+    dst += dst_stride_4x;
+    __lasx_xvstelm_d(in1, dst, 0, 0);
+    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
+    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
+    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
+}
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/vc1dsp.h"
+#include "vc1dsp_loongarch.h"
+
+#define FN_ASSIGN(OP, X, Y, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \
+    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
+
+#define FN_ASSIGN_V(OP, Y, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[0][4*Y] = ff_##OP##vc1_mspel_mc0##Y##_16##INSN
+
+#define FN_ASSIGN_H(OP, X, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[0][X] = ff_##OP##vc1_mspel_mc##X##0_16##INSN
+
+av_cold void ff_vc1dsp_init_loongarch(VC1DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lasx(cpu_flags)) {
+        dsp->vc1_inv_trans_8x8    = ff_vc1_inv_trans_8x8_lasx;
+        dsp->vc1_inv_trans_4x8    = ff_vc1_inv_trans_4x8_lasx;
+        dsp->vc1_inv_trans_8x4    = ff_vc1_inv_trans_8x4_lasx;
+        dsp->vc1_inv_trans_4x4    = ff_vc1_inv_trans_4x4_lasx;
+        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_lasx;
+        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_lasx;
+        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_lasx;
+        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_lasx;
+        FN_ASSIGN(put_, 1, 1, _lasx);
+        FN_ASSIGN(put_, 1, 2, _lasx);
+        FN_ASSIGN(put_, 1, 3, _lasx);
+        FN_ASSIGN(put_, 2, 1, _lasx);
+        FN_ASSIGN(put_, 2, 2, _lasx);
+        FN_ASSIGN(put_, 2, 3, _lasx);
+        FN_ASSIGN(put_, 3, 1, _lasx);
+        FN_ASSIGN(put_, 3, 2, _lasx);
+        FN_ASSIGN(put_, 3, 3, _lasx);
+        FN_ASSIGN_V(put_, 1, _lasx);
+        FN_ASSIGN_V(put_, 2, _lasx);
+        FN_ASSIGN_V(put_, 3, _lasx);
+        FN_ASSIGN_H(put_, 1, _lasx);
+        FN_ASSIGN_H(put_, 2, _lasx);
+        FN_ASSIGN_H(put_, 3, _lasx);
+        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_no_rnd_vc1_chroma_mc8_lasx;
+    }
+}
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_VC1DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_VC1DSP_LOONGARCH_H
+
+#include "libavcodec/vc1dsp.h"
+#include "libavutil/avassert.h"
+
+void ff_vc1_inv_trans_8x8_lasx(int16_t block[64]);
+void ff_vc1_inv_trans_8x8_dc_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *blokc);
+void ff_vc1_inv_trans_4x4_dc_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+#define FF_PUT_VC1_MSPEL_MC_LASX(hmode, vmode)                                \
+void ff_put_vc1_mspel_mc ## hmode ## vmode ## _lasx(uint8_t *dst,             \
+                                                  const uint8_t *src,         \
+                                                  ptrdiff_t stride, int rnd); \
+void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_lasx(uint8_t *dst,          \
+                                                  const uint8_t *src,         \
+                                                  ptrdiff_t stride, int rnd);
+
+FF_PUT_VC1_MSPEL_MC_LASX(1, 1);
+FF_PUT_VC1_MSPEL_MC_LASX(1, 2);
+FF_PUT_VC1_MSPEL_MC_LASX(1, 3);
+
+FF_PUT_VC1_MSPEL_MC_LASX(2, 1);
+FF_PUT_VC1_MSPEL_MC_LASX(2, 2);
+FF_PUT_VC1_MSPEL_MC_LASX(2, 3);
+
+FF_PUT_VC1_MSPEL_MC_LASX(3, 1);
+FF_PUT_VC1_MSPEL_MC_LASX(3, 2);
+FF_PUT_VC1_MSPEL_MC_LASX(3, 3);
+
+#define FF_PUT_VC1_MSPEL_MC_V_LASX(vmode)                                 \
+void ff_put_vc1_mspel_mc0 ## vmode ## _16_lasx(uint8_t *dst,              \
+                                               const uint8_t *src,        \
+                                               ptrdiff_t stride, int rnd);
+
+FF_PUT_VC1_MSPEL_MC_V_LASX(1);
+FF_PUT_VC1_MSPEL_MC_V_LASX(2);
+FF_PUT_VC1_MSPEL_MC_V_LASX(3);
+
+#define FF_PUT_VC1_MSPEL_MC_H_LASX(hmode)                                 \
+void ff_put_vc1_mspel_mc ## hmode ## 0_16_lasx(uint8_t *dst,              \
+                                               const uint8_t *src,        \
+                                               ptrdiff_t stride, int rnd);
+
+FF_PUT_VC1_MSPEL_MC_H_LASX(1);
+FF_PUT_VC1_MSPEL_MC_H_LASX(2);
+FF_PUT_VC1_MSPEL_MC_H_LASX(3);
+
+void ff_put_no_rnd_vc1_chroma_mc8_lasx(uint8_t *dst /* align 8 */,
+                                       const uint8_t *src /* align 1 */,
+                                       ptrdiff_t stride, int h, int x, int y);
+
+#endif /* AVCODEC_LOONGARCH_VC1DSP_LOONGARCH_H */
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/videodsp.h"
+#include "libavutil/attributes.h"
+
+static void prefetch_loongarch(const uint8_t *mem, ptrdiff_t stride, int h)
+{
+    register const uint8_t *p = mem;
+
+    __asm__ volatile (
+        "1:                                     \n\t"
+        "preld      0,     %[p],     0          \n\t"
+        "preld      0,     %[p],     32         \n\t"
+        "addi.d     %[h],  %[h],     -1         \n\t"
+        "add.d      %[p],  %[p],     %[stride]  \n\t"
+
+        "blt        $r0,   %[h],     1b         \n\t"
+        : [p] "+r" (p), [h] "+r" (h)
+        : [stride] "r" (stride)
+    );
+}
+
+av_cold void ff_videodsp_init_loongarch(VideoDSPContext *ctx, int bpc)
+{
+    ctx->prefetch = prefetch_loongarch;
+}
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    __m128i p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;               \
+    __m128i filt, filt1, filt2, cnst4b, cnst3b;                         \
+    __m128i q0_sub_p0_l, q0_sub_p0_h, filt_h, filt_l, cnst3h;           \
+                                                                        \
+    p1_m = __lsx_vxori_b(p1_in_out, 0x80);                              \
+    p0_m = __lsx_vxori_b(p0_in_out, 0x80);                              \
+    q0_m = __lsx_vxori_b(q0_in_out, 0x80);                              \
+    q1_m = __lsx_vxori_b(q1_in_out, 0x80);                              \
+    filt = __lsx_vssub_b(p1_m, q1_m);                                   \
+    filt = filt & hev_in;                                               \
+                                                                        \
+    q0_sub_p0 = __lsx_vsub_b(q0_m, p0_m);                               \
+    filt_sign = __lsx_vslti_b(filt, 0);                                 \
+                                                                        \
+    cnst3h = __lsx_vreplgr2vr_h(3);                                     \
+    q0_sub_p0_l = __lsx_vilvl_b(q0_sub_p0, q0_sub_p0);                  \
+    q0_sub_p0_l = __lsx_vdp2_h_b(q0_sub_p0_l, cnst3h);                  \
+    filt_l = __lsx_vilvl_b(filt_sign, filt);                            \
+    filt_l = __lsx_vadd_h(filt_l, q0_sub_p0_l);                         \
+    filt_l = __lsx_vsat_h(filt_l, 7);                                   \
+                                                                        \
+    q0_sub_p0_h = __lsx_vilvh_b(q0_sub_p0, q0_sub_p0);                  \
+    q0_sub_p0_h = __lsx_vdp2_h_b(q0_sub_p0_h, cnst3h);                  \
+    filt_h = __lsx_vilvh_b(filt_sign, filt);                            \
+    filt_h = __lsx_vadd_h(filt_h, q0_sub_p0_h);                         \
+    filt_h = __lsx_vsat_h(filt_h, 7);                                   \
+                                                                        \
+    filt = __lsx_vpickev_b(filt_h, filt_l);                             \
+    filt = filt & mask_in;                                              \
+    cnst4b = __lsx_vreplgr2vr_b(4);                                     \
+    filt1 = __lsx_vsadd_b(filt, cnst4b);                                \
+    filt1 = __lsx_vsrai_b(filt1, 3);                                    \
+                                                                        \
+    cnst3b = __lsx_vreplgr2vr_b(3);                                     \
+    filt2 = __lsx_vsadd_b(filt, cnst3b);                                \
+    filt2 = __lsx_vsrai_b(filt2, 3);                                    \
+                                                                        \
+    q0_m = __lsx_vssub_b(q0_m, filt1);                                  \
+    q0_in_out = __lsx_vxori_b(q0_m, 0x80);                              \
+    p0_m = __lsx_vsadd_b(p0_m, filt2);                                  \
+    p0_in_out = __lsx_vxori_b(p0_m, 0x80);                              \
+                                                                        \
+    filt = __lsx_vsrari_b(filt1, 1);                                    \
+    hev_in = __lsx_vxori_b(hev_in, 0xff);                               \
+    filt = filt & hev_in;                                               \
+                                                                        \
+    q1_m = __lsx_vssub_b(q1_m, filt);                                   \
+    q1_in_out = __lsx_vxori_b(q1_m, 0x80);                              \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                                   \
+    p1_in_out = __lsx_vxori_b(p1_m, 0x80);                              \
+}
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)             \
+{                                                                   \
+    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                     \
+    __m128i filt, q0_sub_p0, cnst4b, cnst3b;                        \
+    __m128i u, filt1, filt2, filt_sign, q0_sub_p0_sign;             \
+    __m128i q0_sub_p0_l, q0_sub_p0_h, filt_l, u_l, u_h, filt_h;     \
+    __m128i cnst3h, cnst27h, cnst18h, cnst63h;                      \
+                                                                    \
+    cnst3h = __lsx_vreplgr2vr_h(3);                                 \
+                                                                    \
+    p2_m = __lsx_vxori_b(p2, 0x80);                                 \
+    p1_m = __lsx_vxori_b(p1, 0x80);                                 \
+    p0_m = __lsx_vxori_b(p0, 0x80);                                 \
+    q0_m = __lsx_vxori_b(q0, 0x80);                                 \
+    q1_m = __lsx_vxori_b(q1, 0x80);                                 \
+    q2_m = __lsx_vxori_b(q2, 0x80);                                 \
+                                                                    \
+    filt = __lsx_vssub_b(p1_m, q1_m);                               \
+    q0_sub_p0 = __lsx_vsub_b(q0_m, p0_m);                           \
+    q0_sub_p0_sign = __lsx_vslti_b(q0_sub_p0, 0);                   \
+    filt_sign = __lsx_vslti_b(filt, 0);                             \
+                                                                    \
+    /* right part */                                                \
+    q0_sub_p0_l = __lsx_vilvl_b(q0_sub_p0_sign, q0_sub_p0);         \
+    q0_sub_p0_l = __lsx_vmul_h(q0_sub_p0_l, cnst3h);                \
+    filt_l = __lsx_vilvl_b(filt_sign, filt);                        \
+    filt_l = __lsx_vadd_h(filt_l, q0_sub_p0_l);                     \
+    filt_l = __lsx_vsat_h(filt_l, 7);                               \
+                                                                    \
+    /* left part */                                                 \
+    q0_sub_p0_h = __lsx_vilvh_b(q0_sub_p0_sign, q0_sub_p0);         \
+    q0_sub_p0_h = __lsx_vmul_h(q0_sub_p0_h, cnst3h);                \
+    filt_h = __lsx_vilvh_b(filt_sign, filt);                        \
+    filt_h = __lsx_vadd_h(filt_h,  q0_sub_p0_h);                    \
+    filt_h = __lsx_vsat_h(filt_h, 7);                               \
+                                                                    \
+    /* combine left and right part */                               \
+    filt = __lsx_vpickev_b(filt_h, filt_l);                         \
+    filt = filt & mask;                                             \
+    filt2 = filt & hev;                                             \
+    /* filt_val &= ~hev */                                          \
+    hev = __lsx_vxori_b(hev, 0xff);                                 \
+    filt = filt & hev;                                              \
+    cnst4b = __lsx_vreplgr2vr_b(4);                                 \
+    filt1 = __lsx_vsadd_b(filt2, cnst4b);                           \
+    filt1 = __lsx_vsrai_b(filt1, 3);                                \
+    cnst3b = __lsx_vreplgr2vr_b(3);                                 \
+    filt2 = __lsx_vsadd_b(filt2, cnst3b);                           \
+    filt2 = __lsx_vsrai_b(filt2, 3);                                \
+    q0_m = __lsx_vssub_b(q0_m, filt1);                              \
+    p0_m = __lsx_vsadd_b(p0_m, filt2);                              \
+                                                                    \
+    filt_sign = __lsx_vslti_b(filt, 0);                             \
+    filt_l = __lsx_vilvl_b(filt_sign, filt);                        \
+    filt_h = __lsx_vilvh_b(filt_sign, filt);                        \
+                                                                    \
+    cnst27h = __lsx_vreplgr2vr_h(27);                               \
+    cnst63h = __lsx_vreplgr2vr_h(63);                               \
+                                                                    \
+    /* right part */                                                \
+    u_l = __lsx_vmul_h(filt_l, cnst27h);                            \
+    u_l = __lsx_vadd_h(u_l, cnst63h);                               \
+    u_l = __lsx_vsrai_h(u_l, 7);                                    \
+    u_l = __lsx_vsat_h(u_l, 7);                                     \
+    /* left part */                                                 \
+    u_h = __lsx_vmul_h(filt_h, cnst27h);                            \
+    u_h = __lsx_vadd_h(u_h, cnst63h);                               \
+    u_h = __lsx_vsrai_h(u_h, 7);                                    \
+    u_h = __lsx_vsat_h(u_h, 7);                                     \
+    /* combine left and right part */                               \
+    u = __lsx_vpickev_b(u_h, u_l);                                  \
+    q0_m = __lsx_vssub_b(q0_m, u);                                  \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                                 \
+    p0_m = __lsx_vsadd_b(p0_m, u);                                  \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                                 \
+    cnst18h = __lsx_vreplgr2vr_h(18);                               \
+    u_l = __lsx_vmul_h(filt_l, cnst18h);                            \
+    u_l = __lsx_vadd_h(u_l, cnst63h);                               \
+    u_l = __lsx_vsrai_h(u_l, 7);                                    \
+    u_l = __lsx_vsat_h(u_l, 7);                                     \
+                                                                    \
+    /* left part */                                                 \
+    u_h = __lsx_vmul_h(filt_h, cnst18h);                            \
+    u_h = __lsx_vadd_h(u_h, cnst63h);                               \
+    u_h = __lsx_vsrai_h(u_h, 7);                                    \
+    u_h = __lsx_vsat_h(u_h, 7);                                     \
+    /* combine left and right part */                               \
+    u = __lsx_vpickev_b(u_h, u_l);                                  \
+    q1_m = __lsx_vssub_b(q1_m, u);                                  \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                                 \
+    p1_m = __lsx_vsadd_b(p1_m, u);                                  \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                                 \
+    u_l = __lsx_vslli_h(filt_l, 3);                                 \
+    u_l = __lsx_vadd_h(u_l, filt_l);                                \
+    u_l = __lsx_vadd_h(u_l, cnst63h);                               \
+    u_l = __lsx_vsrai_h(u_l, 7);                                    \
+    u_l = __lsx_vsat_h(u_l, 7);                                     \
+                                                                    \
+    /* left part */                                                 \
+    u_h = __lsx_vslli_h(filt_h, 3);                                 \
+    u_h = __lsx_vadd_h(u_h, filt_h);                                \
+    u_h = __lsx_vadd_h(u_h, cnst63h);                               \
+    u_h = __lsx_vsrai_h(u_h, 7);                                    \
+    u_h = __lsx_vsat_h(u_h, 7);                                     \
+    /* combine left and right part */                               \
+    u = __lsx_vpickev_b(u_h, u_l);                                  \
+    q2_m = __lsx_vssub_b(q2_m, u);                                  \
+    q2 = __lsx_vxori_b(q2_m, 0x80);                                 \
+    p2_m = __lsx_vsadd_b(p2_m, u);                                  \
+    p2 = __lsx_vxori_b(p2_m, 0x80);                                 \
+}
+
+#define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src,                \
+                     q0_src, q1_src, q2_src, q3_src,                \
+                     limit_src, b_limit_src, thresh_src,            \
+                     hev_dst, mask_dst, flat_dst)                   \
+{                                                                   \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+                                                                    \
+    /* absolute subtraction of pixel values */                      \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_src, p2_src);                  \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_src, p1_src);                  \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_src, p0_src);                  \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_src, q0_src);                  \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_src, q1_src);                  \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_src, q2_src);                  \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_src, q0_src);                  \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_src, q1_src);                  \
+                                                                    \
+    /* calculation of hev */                                        \
+    flat_dst = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);           \
+    hev_dst = __lsx_vslt_bu(thresh_src, flat_dst);                  \
+    /* calculation of mask */                                       \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);      \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                  \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);      \
+    mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_m);            \
+    mask_dst = __lsx_vmax_bu(flat_dst, mask_dst);                   \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);       \
+    mask_dst = __lsx_vmax_bu(p3_asub_p2_m, mask_dst);               \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);       \
+    mask_dst = __lsx_vmax_bu(q2_asub_q1_m, mask_dst);               \
+    mask_dst = __lsx_vslt_bu(limit_src, mask_dst);                  \
+    mask_dst = __lsx_vxori_b(mask_dst, 0xff);                       \
+}
+
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)      \
+{                                                                   \
+    __lsx_vstelm_w(in0, pdst, 0, in0_idx);                          \
+    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx);                 \
+}
+
+#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)     \
+{                                                           \
+    __lsx_vstelm_w(in, pdst, 0, idx0);                      \
+    pdst += stride;                                         \
+    __lsx_vstelm_w(in, pdst, 0, idx1);                      \
+    pdst += stride;                                         \
+    __lsx_vstelm_w(in, pdst, 0, idx2);                      \
+    pdst += stride;                                         \
+    __lsx_vstelm_w(in, pdst, 0, idx3);                      \
+    pdst += stride;                                         \
+}
+
+void ff_vp8_v_loop_filter16_lsx(uint8_t *dst, ptrdiff_t stride, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+    __m128i mask, hev, flat, thresh, limit, b_limit;
+
+    ptrdiff_t stride2 = stride << 1;
+    ptrdiff_t stride3 = stride2 + stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+    limit = __lsx_vreplgr2vr_b(limit_in);
+    thresh = __lsx_vreplgr2vr_b(thresh_in);
+
+    /*load vector elements*/
+    DUP4_ARG2(__lsx_vld, dst - stride4, 0, dst - stride3, 0, dst - stride2, 0,
+              dst - stride, 0, p3, p2, p1, p0);
+    DUP4_ARG2(__lsx_vld, dst, 0, dst + stride, 0, dst + stride2, 0, dst + stride3, 0,
+              q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    /*store vector elements*/
+    __lsx_vst(p2, dst - stride3, 0);
+    __lsx_vst(p1, dst - stride2, 0);
+    __lsx_vst(p0, dst - stride,  0);
+    __lsx_vst(q0, dst,           0);
+
+    __lsx_vst(q1, dst + stride,  0);
+    __lsx_vst(q2, dst + stride2, 0);
+}
+
+void ff_vp8_v_loop_filter8uv_lsx(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride, int b_limit_in,
+                                 int limit_in, int thresh_in)
+{
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+    __m128i mask, hev, flat, thresh, limit, b_limit;
+    __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    ptrdiff_t stride2 = stride << 1;
+    ptrdiff_t stride3 = stride2 + stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+    limit = __lsx_vreplgr2vr_b(limit_in);
+    thresh = __lsx_vreplgr2vr_b(thresh_in);
+
+    DUP4_ARG2(__lsx_vld, dst_u - stride4, 0, dst_u - stride3, 0, dst_u - stride2, 0,
+              dst_u - stride, 0, p3_u, p2_u, p1_u, p0_u);
+    DUP4_ARG2(__lsx_vld, dst_u, 0, dst_u + stride, 0, dst_u + stride2, 0,
+              dst_u + stride3, 0, q0_u, q1_u, q2_u, q3_u);
+
+    DUP4_ARG2(__lsx_vld, dst_v - stride4, 0, dst_v - stride3, 0, dst_v - stride2, 0,
+              dst_v - stride, 0, p3_v, p2_v, p1_v, p0_v);
+    DUP4_ARG2(__lsx_vld, dst_v, 0, dst_v + stride, 0, dst_v + stride2, 0,
+              dst_v + stride3, 0, q0_v, q1_v, q2_v, q3_v);
+
+    /* rht 8 element of p3 are u pixel and left 8 element of p3 are v pixei */
+    DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    __lsx_vstelm_d(p2, dst_u - stride3, 0, 0);
+    __lsx_vstelm_d(p1, dst_u - stride2, 0, 0);
+    __lsx_vstelm_d(p0, dst_u - stride , 0, 0);
+    __lsx_vstelm_d(q0, dst_u,           0, 0);
+
+    __lsx_vstelm_d(q1, dst_u + stride,  0, 0);
+    __lsx_vstelm_d(q2, dst_u + stride2, 0, 0);
+
+    __lsx_vstelm_d(p2, dst_v - stride3, 0, 1);
+    __lsx_vstelm_d(p1, dst_v - stride2, 0, 1);
+    __lsx_vstelm_d(p0, dst_v - stride , 0, 1);
+    __lsx_vstelm_d(q0, dst_v,           0, 1);
+
+    __lsx_vstelm_d(q1, dst_v + stride,  0, 1);
+    __lsx_vstelm_d(q2, dst_v + stride2, 0, 1);
+}
+
+void ff_vp8_h_loop_filter16_lsx(uint8_t *dst, ptrdiff_t stride, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+    __m128i mask, hev, flat, thresh, limit, b_limit;
+    __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    __m128i row9, row10, row11, row12, row13, row14, row15;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    ptrdiff_t stride2 = stride << 1;
+    ptrdiff_t stride3 = stride2 + stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+    limit = __lsx_vreplgr2vr_b(limit_in);
+    thresh = __lsx_vreplgr2vr_b(thresh_in);
+
+    temp_src = dst - 4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row0, row1, row2, row3);
+    temp_src += stride4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row4, row5, row6, row7);
+
+    temp_src += stride4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row8, row9, row10, row11);
+    temp_src += stride4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row12, row13, row14, row15);
+    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10,
+                        row11, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    tmp0 = __lsx_vilvl_b(p1, p2);
+    tmp1 = __lsx_vilvl_b(q0, p0);
+
+    tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+    tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+
+    tmp0 = __lsx_vilvh_b(p1, p2);
+    tmp1 = __lsx_vilvh_b(q0, p0);
+
+    tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+    tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+
+    tmp2 = __lsx_vilvl_b(q2, q1);
+    tmp5 = __lsx_vilvh_b(q2, q1);
+
+    temp_src = dst - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += stride;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+void ff_vp8_h_loop_filter8uv_lsx(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride, int b_limit_in,
+                                 int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+    __m128i mask, hev, flat, thresh, limit, b_limit;
+    __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    __m128i row9, row10, row11, row12, row13, row14, row15;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    ptrdiff_t stride2 = stride << 1;
+    ptrdiff_t stride3 = stride2 + stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+    limit = __lsx_vreplgr2vr_b(limit_in);
+    thresh = __lsx_vreplgr2vr_b(thresh_in);
+
+    temp_src = dst_u - 4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row0, row1, row2, row3);
+    temp_src += stride4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row4, row5, row6, row7);
+
+    temp_src = dst_v - 4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row8, row9, row10, row11);
+    temp_src += stride4;
+    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
+              temp_src + stride3, 0, row12, row13, row14, row15);
+
+    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    tmp0 = __lsx_vilvl_b(p1, p2);
+    tmp1 = __lsx_vilvl_b(q0, p0);
+
+    tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+    tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+
+    tmp0 = __lsx_vilvh_b(p1, p2);
+    tmp1 = __lsx_vilvh_b(q0, p0);
+
+    tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+    tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+
+    tmp2 = __lsx_vilvl_b(q2, q1);
+    tmp5 = __lsx_vilvh_b(q2, q1);
+
+    dst_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, dst_u, 4);
+    dst_u += stride;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, dst_u, 4);
+
+    dst_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, dst_v, 4);
+    dst_v += stride;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, dst_v, 4);
+}
+
+void ff_vp8_v_loop_filter16_inner_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h)
+{
+    __m128i mask, hev, flat;
+    __m128i thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+
+    ptrdiff_t stride2 = stride << 1;
+    ptrdiff_t stride3 = stride2 + stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    /* load vector elements */
+    src -= stride4;
+    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
+              src + stride3, 0, p3, p2, p1, p0);
+    src += stride4;
+    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
+              src + stride3, 0, q0, q1, q2, q3);
+    thresh = __lsx_vreplgr2vr_b(h);
+    b_limit = __lsx_vreplgr2vr_b(e);
+    limit = __lsx_vreplgr2vr_b(i);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    __lsx_vst(p1, src - stride2, 0);
+    __lsx_vst(p0, src - stride,  0);
+    __lsx_vst(q0, src,           0);
+    __lsx_vst(q1, src + stride,  0);
+}
+
+void ff_vp8_h_loop_filter16_inner_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h)
+{
+    __m128i mask, hev, flat;
+    __m128i thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+
+    ptrdiff_t stride2 = stride << 1;
+    ptrdiff_t stride3 = stride2 + stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    src -= 4;
+    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
+              src + stride3, 0, tmp0, tmp1, tmp2, tmp3);
+    src += stride4;
+    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
+              src + stride3, 0, tmp4, tmp5, tmp6, tmp7);
+    src += stride4;
+    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
+              src + stride3, 0, tmp8, tmp9, tmp10, tmp11);
+    src += stride4;
+    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
+              src + stride3, 0, tmp12, tmp13, tmp14, tmp15);
+    src -= 3 * stride4;
+
+    LSX_TRANSPOSE16x8_B(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                        tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = __lsx_vreplgr2vr_b(h);
+    b_limit = __lsx_vreplgr2vr_b(e);
+    limit = __lsx_vreplgr2vr_b(i);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+    tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+    tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+
+    src += 2;
+    ST_W4(tmp2, 0, 1, 2, 3, src, stride);
+    ST_W4(tmp3, 0, 1, 2, 3, src, stride);
+
+    DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+    tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+    tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+
+    ST_W4(tmp2, 0, 1, 2, 3, src, stride);
+    ST_W4(tmp3, 0, 1, 2, 3, src, stride);
+    src -= 4 * stride4;
+}
@@ -0,0 +1,952 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "vp8dsp_loongarch.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t subpel_filters_lsx[7][8] = {
+    {-6, 123, 12, -1, 0, 0, 0, 0},
+    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-9, 93, 50, -6, 0, 0, 0, 0},
+    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
+    {-6, 50, 93, -9, 0, 0, 0, 0},
+    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-1, 12, 123, -6, 0, 0, 0, 0},
+};
+
+#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
+( {                                                                 \
+    __m128i out0_m;                                                 \
+                                                                    \
+    out0_m = __lsx_vdp2_h_b(in0, coeff0);                           \
+    out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);                \
+    out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);                \
+                                                                    \
+    out0_m;                                                         \
+} )
+
+#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
+                out0, out1, out2)                                      \
+{                                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, in1, in0, mask0, in3, in2, mask1,         \
+              out0, out1);                                             \
+    out2 = __lsx_vshuf_b(in5, in4, mask2);                             \
+}
+
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
+                        filt_h0, filt_h1, filt_h2)                       \
+( {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m;                                      \
+    __m128i hz_out_m;                                                    \
+                                                                         \
+    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
+               vec0_m, vec1_m, vec2_m);                                  \
+    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
+                            filt_h0, filt_h1, filt_h2);                  \
+                                                                         \
+    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                              \
+    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                                \
+                                                                         \
+    hz_out_m;                                                            \
+} )
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                            \
+                                   mask0, mask1, mask2,                               \
+                                   filt0, filt1, filt2,                               \
+                                   out0, out1, out2, out3)                            \
+{                                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;           \
+                                                                                      \
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,        \
+              mask0, src3, src3, mask0, vec0_m, vec1_m, vec2_m, vec3_m);              \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,            \
+              vec3_m, filt0, out0, out1, out2, out3);                                 \
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,        \
+              mask1, src3, src3, mask1, vec0_m, vec1_m, vec2_m, vec3_m);              \
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,        \
+              mask2, src3, src3, mask2, vec4_m, vec5_m, vec6_m, vec7_m);              \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,            \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, out3);      \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,            \
+              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, out3);      \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
+( {                                                             \
+    __m128i tmp0;                                               \
+                                                                \
+    tmp0 = __lsx_vdp2_h_b(vec0, filt0);                         \
+    tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1);                \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
+( {                                                                    \
+    __m128i vec0_m, vec1_m;                                            \
+    __m128i hz_out_m;                                                  \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1,     \
+              vec0_m, vec1_m);                                         \
+    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
+                                                                       \
+    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                            \
+    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                              \
+                                                                       \
+    hz_out_m;                                                          \
+} )
+
+void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_lsx[mx - 1];
+    __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+    __m128i mask0, mask1, mask2;
+    __m128i out0, out1, out2, out3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+    src -= 2;
+
+    /* rearranging filter */
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+    filt2 = __lsx_vldrepl_h(filter, 4);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+              src + src_stride3, 0, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
+              src0, src1, src2, src3);
+    src += src_stride4;
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                  src + src_stride3, 0, src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
+                  src0, src1, src2, src3);
+        src += src_stride4;
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(out0, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(out1, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(out1, dst, 0, 1);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                              const uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_lsx[mx - 1];
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1;
+    __m128i filt2, mask0, mask1, mask2;
+    __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+    src -= 2;
+    /* rearranging filter */
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+    filt2 = __lsx_vldrepl_h(filter, 4);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
+                  0, src + src_stride3, 0, src0 ,src2, src4, src6);
+        DUP4_ARG2(__lsx_vld, src, 8, src + src_stride, 8, src + src_stride2,
+                  8, src + src_stride3, 8, src1, src3, src5, src7);
+
+        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
+                  src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128,
+                  src4, src5, src6, src7);
+        src += src_stride4;
+
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out4, out5, out6, out7);
+        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+        __lsx_vst(out0, dst, 0);
+        dst += dst_stride;
+        __lsx_vst(out1, dst, 0);
+        dst += dst_stride;
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, out5, out4, 7, out7, out6, 7, out4, out5);
+        DUP2_ARG2(__lsx_vxori_b, out4, 128, out5, 128, out4, out5);
+        __lsx_vst(out4, dst, 0);
+        dst += dst_stride;
+        __lsx_vst(out5, dst, 0);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    __m128i src10_l, src32_l, src76_l, src98_l, src21_l, src43_l, src87_l;
+    __m128i src109_l, filt0, filt1, filt2;
+    __m128i out0_l, out1_l, out2_l, out3_l;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    src -= src_stride2;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+    filt2 = __lsx_vldrepl_h(filter, 4);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+              src + src_stride3, 0, src0, src1, src2, src3);
+    src += src_stride4;
+    src4 = __lsx_vld(src, 0);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
+              src0, src1, src2, src3);
+    src4 = __lsx_vxori_b(src4, 128);
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4,
+              src3, src10_l, src32_l, src21_l, src43_l);
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
+                  0, src + src_stride3, 0, src7, src8, src9, src10);
+        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
+                  128, src7, src8, src9, src10);
+        src += src_stride4;
+
+        DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10,
+                  src9, src76_l, src87_l, src98_l, src109_l);
+
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src76_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src87_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src76_l, src98_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src87_l, src109_l, filt0, filt1, filt2);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, out1_l, out0_l, 7, out3_l, out2_l, 7,
+                  out0_l, out1_l);
+        DUP2_ARG2(__lsx_vxori_b, out0_l, 128, out1_l, 128, out0_l, out1_l);
+
+        __lsx_vstelm_d(out0_l, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(out0_l, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(out1_l, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(out1_l, dst, 0, 1);
+        dst += dst_stride;
+
+        src10_l = src76_l;
+        src32_l = src98_l;
+        src21_l = src87_l;
+        src43_l = src109_l;
+        src4 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                              const uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l, src65_l, src87_l;
+    __m128i src10_h, src32_h, src54_h, src76_h, src21_h, src43_h, src65_h, src87_h;
+    __m128i filt0, filt1, filt2;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+    filt2 = __lsx_vldrepl_h(filter, 4);
+
+    DUP4_ARG2(__lsx_vld, src - src_stride2, 0, src - src_stride, 0, src, 0,
+              src + src_stride, 0, src0, src1, src2, src3);
+    src4 = __lsx_vld(src + src_stride2, 0);
+    src += src_stride3;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    src4 = __lsx_vxori_b(src4, 128);
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
+              src10_l, src32_l, src43_l, src21_l);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
+              src10_h, src32_h, src43_h, src21_h);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                  src + src_stride3, 0, src5, src6, src7, src8);
+        src += src_stride4;
+        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
+                  src5, src6, src7, src8);
+
+        DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+                  src54_l, src65_l, src76_l, src87_l);
+        DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
+                  src54_h, src65_h, src76_h, src87_h);
+
+        tmp0 = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        tmp1 = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        tmp2 = DPADD_SH3_SH(src10_h, src32_h, src54_h, filt0, filt1, filt2);
+        tmp3 = DPADD_SH3_SH(src21_h, src43_h, src65_h, filt0, filt1, filt2);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+        __lsx_vst(tmp0, dst, 0);
+        dst += dst_stride;
+        __lsx_vst(tmp1, dst, 0);
+        dst += dst_stride;
+
+        tmp0 = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        tmp1 = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        tmp2 = DPADD_SH3_SH(src32_h, src54_h, src76_h, filt0, filt1, filt2);
+        tmp3 = DPADD_SH3_SH(src43_h, src65_h, src87_h, filt0, filt1, filt2);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+        __lsx_vst(tmp0, dst, 0);
+        dst += dst_stride;
+        __lsx_vst(tmp1, dst, 0);
+        dst += dst_stride;
+
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src10_h = src54_h;
+        src32_h = src76_h;
+        src21_h = src65_h;
+        src43_h = src87_h;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                               const uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
+    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt_hz0, filt_hz1, filt_hz2;
+    __m128i mask0, mask1, mask2, filt_vt0, filt_vt1, filt_vt2;
+    __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+    src -= (2 + src_stride2);
+
+    /* rearranging filter */
+    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
+    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+              src + src_stride3, 0, src0, src1, src2, src3);
+    src += src_stride4;
+    src4 = __lsx_vld(src, 0);
+    src +=  src_stride;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
+              src0 ,src1, src2, src3);
+    src4 = __lsx_vxori_b(src4, 128);
+
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
+    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                  src + src_stride3, 0, src5, src6, src7, src8);
+        src += src_stride4;
+
+        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
+                  src5, src6, src7, src8);
+
+        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2,filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+
+        out7 = __lsx_vpackev_b(hz_out7, hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out6 = __lsx_vpackev_b(hz_out8, hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+        __lsx_vstelm_d(tmp0, dst, 0, 0);
+
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp0, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 1);
+        dst += dst_stride;
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out7;
+        out3 = out5;
+        out4 = out6;
+    }
+}
+
+void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                                const uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h6v6_lsx(dst, dst_stride, src, src_stride, height, mx, my);
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src7, src8, src9, src10;
+    __m128i src10_l, src72_l, src98_l, src21_l, src87_l, src109_l, filt0, filt1;
+    __m128i out0, out1, out2, out3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    src -= src_stride;
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
+    src2 = __lsx_vld(src + src_stride2, 0);
+    src += src_stride3;
+
+    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+    src2 = __lsx_vxori_b(src2, 128);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                  src + src_stride3, 0, src7, src8, src9, src10);
+        src += src_stride4;
+
+        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                  src7, src8, src9, src10);
+        DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
+                  src72_l, src87_l, src98_l, src109_l);
+
+        out0 = FILT_4TAP_DPADD_S_H(src10_l, src72_l, filt0, filt1);
+        out1 = FILT_4TAP_DPADD_S_H(src21_l, src87_l, filt0, filt1);
+        out2 = FILT_4TAP_DPADD_S_H(src72_l, src98_l, filt0, filt1);
+        out3 = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
+        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(out0, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(out1, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(out1, dst, 0, 1);
+        dst += dst_stride;
+
+        src10_l = src98_l;
+        src21_l = src109_l;
+        src2 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                              const uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l, src10_h;
+    __m128i src32_h, src54_h, src21_h, src43_h, src65_h, filt0, filt1;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    src -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
+    src2 = __lsx_vld(src + src_stride2, 0);
+    src += src_stride3;
+
+    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+    src2 = __lsx_vxori_b(src2, 128);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_h, src21_h);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
+                  0, src + src_stride3, 0, src3, src4, src5, src6);
+        src += src_stride4;
+
+        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
+                  src3, src4, src5, src6);
+        DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6,
+                  src5, src32_l, src43_l, src54_l, src65_l);
+        DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6,
+                  src5, src32_h, src43_h, src54_h, src65_h);
+
+        tmp0 = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        tmp1 = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        tmp2 = FILT_4TAP_DPADD_S_H(src10_h, src32_h, filt0, filt1);
+        tmp3 = FILT_4TAP_DPADD_S_H(src21_h, src43_h, filt0, filt1);
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+        __lsx_vst(tmp0, dst, 0);
+        dst += dst_stride;
+        __lsx_vst(tmp1, dst, 0);
+        dst += dst_stride;
+
+        tmp0 = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        tmp1 = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        tmp2 = FILT_4TAP_DPADD_S_H(src32_h, src54_h, filt0, filt1);
+        tmp3 = FILT_4TAP_DPADD_S_H(src43_h, src65_h, filt0, filt1);
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+        __lsx_vst(tmp0, dst, 0);
+        dst += dst_stride;
+        __lsx_vst(tmp1, dst, 0);
+        dst += dst_stride;
+
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src10_h = src54_h;
+        src21_h = src65_h;
+        src2 = src6;
+    }
+}
+
+void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                               const uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
+    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+    __m128i filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+    __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+    src -= (2 + src_stride);
+
+    /* rearranging filter */
+    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
+    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
+    src2 = __lsx_vld(src + src_stride2, 0);
+    src += src_stride3;
+
+    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+    src2 = __lsx_vxori_b(src2, 128);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                  src + src_stride3, 0, src3, src4, src5, src6);
+        src += src_stride4;
+
+        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
+                  src3, src4, src5, src6);
+
+        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+        __lsx_vstelm_d(tmp0, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp0, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 1);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                                const uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h6v4_lsx(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                               const uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
+    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt_hz0, filt_hz1, mask0, mask1;
+    __m128i filt_vt0, filt_vt1, filt_vt2;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+    src -= (1 + src_stride2);
+
+    /* rearranging filter */
+    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+              src + src_stride3, 0, src0, src1, src2, src3);
+    src += src_stride4;
+    src4 = __lsx_vld(src, 0);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
+              src0, src1, src2, src3);
+    src4 = __lsx_vxori_b(src4, 128);
+
+    tmp0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    tmp1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    tmp2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    tmp3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    tmp4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+
+    DUP4_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp2, tmp1,
+              tmp4, tmp3, out0, out1, out3, out4);
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
+    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                  src + src_stride3, 0, src5, src6, src7, src8);
+        src += src_stride4;
+
+        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
+                  src5, src6, src7, src8);
+
+        tmp5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        out2 = __lsx_vpackev_b(tmp5, tmp4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        tmp6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        out5 = __lsx_vpackev_b(tmp6, tmp5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        tmp7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+        out6 = __lsx_vpackev_b(tmp7, tmp6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        tmp8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+        out7 = __lsx_vpackev_b(tmp8, tmp7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+        __lsx_vstelm_d(tmp0, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp0, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 1);
+        dst += dst_stride;
+
+        tmp4 = tmp8;
+        out0 = out2;
+        out1 = out6;
+        out3 = out5;
+        out4 = out7;
+    }
+}
+
+void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                                const uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h4v6_lsx(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                            const uint8_t *src, ptrdiff_t src_stride,
+                            int height, int mx, int my)
+{
+    int32_t cnt;
+    __m128i src0, src1, src2, src3;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                      src + src_stride3, 0, src0, src1, src2, src3);
+            src += src_stride4;
+
+            __lsx_vstelm_d(src0, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src1, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src2, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src3, dst, 0, 0);
+            dst += dst_stride;
+
+            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                      src + src_stride3, 0, src0, src1, src2, src3);
+            src += src_stride4;
+
+            __lsx_vstelm_d(src0, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src1, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src2, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src3, dst, 0, 0);
+            dst += dst_stride;
+        }
+    } else if( 0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                      src + src_stride3, 0, src0, src1, src2, src3);
+            src += src_stride4;
+
+            __lsx_vstelm_d(src0, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src1, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src2, dst, 0, 0);
+            dst += dst_stride;
+            __lsx_vstelm_d(src3, dst, 0, 0);
+            dst += dst_stride;
+        }
+    }
+}
+
+void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    int32_t width = 16;
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+
+    ptrdiff_t src_stride2 = src_stride << 1;
+    ptrdiff_t src_stride3 = src_stride2 + src_stride;
+    ptrdiff_t src_stride4 = src_stride2 << 1;
+
+    ptrdiff_t dst_stride2 = dst_stride << 1;
+    ptrdiff_t dst_stride3 = dst_stride2 + dst_stride;
+    ptrdiff_t dst_stride4 = dst_stride2 << 1;
+
+    if (0 == height % 8) {
+        for (cnt = (width >> 4); cnt--;) {
+            src_tmp = src;
+            dst_tmp = dst;
+            for (loop_cnt = (height >> 3); loop_cnt--;) {
+                DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
+                          src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
+                          src4, src5, src6, src7);
+                src_tmp += src_stride4;
+
+                __lsx_vst(src4, dst_tmp,               0);
+                __lsx_vst(src5, dst_tmp + dst_stride,  0);
+                __lsx_vst(src6, dst_tmp + dst_stride2, 0);
+                __lsx_vst(src7, dst_tmp + dst_stride3, 0);
+                dst_tmp += dst_stride4;
+
+                DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
+                          src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
+                          src4, src5, src6, src7);
+                src_tmp += src_stride4;
+
+                __lsx_vst(src4, dst_tmp,               0);
+                __lsx_vst(src5, dst_tmp + dst_stride,  0);
+                __lsx_vst(src6, dst_tmp + dst_stride2, 0);
+                __lsx_vst(src7, dst_tmp + dst_stride3, 0);
+                dst_tmp += dst_stride4;
+            }
+            src += 16;
+            dst += 16;
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
+                      src + src_stride3, 0, src0, src1, src2, src3);
+            src += 4 * src_stride4;
+
+            __lsx_vst(src0, dst,               0);
+            __lsx_vst(src1, dst + dst_stride,  0);
+            __lsx_vst(src2, dst + dst_stride2, 0);
+            __lsx_vst(src3, dst + dst_stride3, 0);
+            dst += dst_stride4;
+       }
+    }
+}
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * VP8 compatible video decoder
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/attributes.h"
+#include "vp8dsp_loongarch.h"
+
+#define VP8_MC_LOONGARCH_FUNC(IDX, SIZE)                                          \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel##SIZE##_h6_lsx;     \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel##SIZE##_v4_lsx;     \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel##SIZE##_h6v4_lsx;   \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel##SIZE##_v6_lsx;     \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel##SIZE##_h4v6_lsx;   \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel##SIZE##_h6v6_lsx;
+
+#define VP8_MC_LOONGARCH_COPY(IDX, SIZE)                                          \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][0] = ff_put_vp8_pixels##SIZE##_lsx;      \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] = ff_put_vp8_pixels##SIZE##_lsx;
+
+av_cold void ff_vp8dsp_init_loongarch(VP8DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        VP8_MC_LOONGARCH_FUNC(0, 16);
+        VP8_MC_LOONGARCH_FUNC(1, 8);
+
+        VP8_MC_LOONGARCH_COPY(0, 16);
+        VP8_MC_LOONGARCH_COPY(1, 8);
+
+        dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_lsx;
+        dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_lsx;
+        dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_lsx;
+        dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_lsx;
+
+        dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_lsx;
+        dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_lsx;
+    }
+}
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_VP8DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_VP8DSP_LOONGARCH_H
+
+#include "libavcodec/vp8dsp.h"
+
+void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                            const uint8_t *src, ptrdiff_t src_stride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int h, int x, int y);
+
+void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                              const uint8_t *src, ptrdiff_t src_stride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                              const uint8_t *src, ptrdiff_t src_stride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                              const uint8_t *src, ptrdiff_t src_stride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                                const uint8_t *src, ptrdiff_t src_stride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                                const uint8_t *src, ptrdiff_t src_stride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                                const uint8_t *src, ptrdiff_t src_stride,
+                                int h, int mx, int my);
+
+void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                               const uint8_t *src, ptrdiff_t src_stride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                               const uint8_t *src, ptrdiff_t src_stride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                               const uint8_t *src, ptrdiff_t src_stride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             int h, int mx, int my);
+
+/* loop filter */
+void ff_vp8_v_loop_filter16_inner_lsx(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_h_loop_filter16_inner_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+
+void ff_vp8_v_loop_filter16_lsx(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter16_lsx(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_lsx(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_lsx(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+
+#endif  // #ifndef AVCODEC_LOONGARCH_VP8DSP_LOONGARCH_H
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "vp9dsp_loongarch.h"
+
+#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4,   \
+                 _dst5, _dst6, _dst7, _dst, _stride,  \
+                 _stride2, _stride3, _stride4)        \
+{                                                     \
+    __lsx_vst(_dst0, _dst, 0);                        \
+    __lsx_vstx(_dst1, _dst, _stride);                 \
+    __lsx_vstx(_dst2, _dst, _stride2);                \
+    __lsx_vstx(_dst3, _dst, _stride3);                \
+    _dst += _stride4;                                 \
+    __lsx_vst(_dst4, _dst, 0);                        \
+    __lsx_vstx(_dst5, _dst, _stride);                 \
+    __lsx_vstx(_dst6, _dst, _stride2);                \
+    __lsx_vstx(_dst7, _dst, _stride3);                \
+}
+
+#define LSX_ST_8X16(_dst0, _dst1, _dst2, _dst3, _dst4,   \
+                    _dst5, _dst6, _dst7, _dst, _stride)  \
+{                                                        \
+    __lsx_vst(_dst0, _dst, 0);                           \
+    __lsx_vst(_dst0, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst1, _dst, 0);                           \
+    __lsx_vst(_dst1, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst2, _dst, 0);                           \
+    __lsx_vst(_dst2, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst3, _dst, 0);                           \
+    __lsx_vst(_dst3, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst4, _dst, 0);                           \
+    __lsx_vst(_dst4, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst5, _dst, 0);                           \
+    __lsx_vst(_dst5, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst6, _dst, 0);                           \
+    __lsx_vst(_dst6, _dst, 16);                          \
+    _dst += _stride;                                     \
+    __lsx_vst(_dst7, _dst, 0);                           \
+    __lsx_vst(_dst7, _dst, 16);                          \
+    _dst += _stride;                                     \
+}
+
+void ff_vert_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{
+    __m128i src0;
+    ptrdiff_t stride2 = dst_stride << 1;
+    ptrdiff_t stride3 = stride2 + dst_stride;
+    ptrdiff_t stride4 = stride2 << 1;
+    src0 = __lsx_vld(src, 0);
+    LSX_ST_8(src0, src0, src0, src0, src0, src0, src0, src0, dst,
+             dst_stride, stride2, stride3, stride4);
+    dst += stride4;
+    LSX_ST_8(src0, src0, src0, src0, src0, src0, src0, src0, dst,
+             dst_stride, stride2, stride3, stride4);
+}
+
+void ff_vert_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{
+    uint32_t row;
+    __m128i src0, src1;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    for (row = 32; row--;) {
+        __lsx_vst(src0, dst, 0);
+        __lsx_vst(src1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_hor_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+    ptrdiff_t stride2 = dst_stride << 1;
+    ptrdiff_t stride3 = stride2 + dst_stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    src15 = __lsx_vldrepl_b(src, 0);
+    src14 = __lsx_vldrepl_b(src, 1);
+    src13 = __lsx_vldrepl_b(src, 2);
+    src12 = __lsx_vldrepl_b(src, 3);
+    src11 = __lsx_vldrepl_b(src, 4);
+    src10 = __lsx_vldrepl_b(src, 5);
+    src9  = __lsx_vldrepl_b(src, 6);
+    src8  = __lsx_vldrepl_b(src, 7);
+    src7  = __lsx_vldrepl_b(src, 8);
+    src6  = __lsx_vldrepl_b(src, 9);
+    src5  = __lsx_vldrepl_b(src, 10);
+    src4  = __lsx_vldrepl_b(src, 11);
+    src3  = __lsx_vldrepl_b(src, 12);
+    src2  = __lsx_vldrepl_b(src, 13);
+    src1  = __lsx_vldrepl_b(src, 14);
+    src0  = __lsx_vldrepl_b(src, 15);
+    LSX_ST_8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
+             dst_stride, stride2, stride3, stride4);
+    dst += stride4;
+    LSX_ST_8(src8, src9, src10, src11, src12, src13, src14, src15, dst,
+             dst_stride, stride2, stride3, stride4);
+}
+
+void ff_hor_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+    __m128i src16, src17, src18, src19, src20, src21, src22, src23;
+    __m128i src24, src25, src26, src27, src28, src29, src30, src31;
+
+    src31 = __lsx_vldrepl_b(src, 0);
+    src30 = __lsx_vldrepl_b(src, 1);
+    src29 = __lsx_vldrepl_b(src, 2);
+    src28 = __lsx_vldrepl_b(src, 3);
+    src27 = __lsx_vldrepl_b(src, 4);
+    src26 = __lsx_vldrepl_b(src, 5);
+    src25 = __lsx_vldrepl_b(src, 6);
+    src24 = __lsx_vldrepl_b(src, 7);
+    src23 = __lsx_vldrepl_b(src, 8);
+    src22 = __lsx_vldrepl_b(src, 9);
+    src21 = __lsx_vldrepl_b(src, 10);
+    src20 = __lsx_vldrepl_b(src, 11);
+    src19 = __lsx_vldrepl_b(src, 12);
+    src18 = __lsx_vldrepl_b(src, 13);
+    src17 = __lsx_vldrepl_b(src, 14);
+    src16 = __lsx_vldrepl_b(src, 15);
+    src15 = __lsx_vldrepl_b(src, 16);
+    src14 = __lsx_vldrepl_b(src, 17);
+    src13 = __lsx_vldrepl_b(src, 18);
+    src12 = __lsx_vldrepl_b(src, 19);
+    src11 = __lsx_vldrepl_b(src, 20);
+    src10 = __lsx_vldrepl_b(src, 21);
+    src9  = __lsx_vldrepl_b(src, 22);
+    src8  = __lsx_vldrepl_b(src, 23);
+    src7  = __lsx_vldrepl_b(src, 24);
+    src6  = __lsx_vldrepl_b(src, 25);
+    src5  = __lsx_vldrepl_b(src, 26);
+    src4  = __lsx_vldrepl_b(src, 27);
+    src3  = __lsx_vldrepl_b(src, 28);
+    src2  = __lsx_vldrepl_b(src, 29);
+    src1  = __lsx_vldrepl_b(src, 30);
+    src0  = __lsx_vldrepl_b(src, 31);
+    LSX_ST_8X16(src0, src1, src2, src3, src4, src5, src6, src7,
+                dst, dst_stride);
+    LSX_ST_8X16(src8, src9, src10, src11, src12, src13, src14, src15,
+                dst, dst_stride);
+    LSX_ST_8X16(src16, src17, src18, src19, src20, src21, src22, src23,
+                dst, dst_stride);
+    LSX_ST_8X16(src24, src25, src26, src27, src28, src29, src30, src31,
+                dst, dst_stride);
+}
+
+void ff_dc_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{
+    __m128i tmp0, tmp1, dst0;
+
+    tmp0 = __lsx_vldrepl_w(src_top, 0);
+    tmp1 = __lsx_vldrepl_w(src_left, 0);
+    dst0 = __lsx_vilvl_w(tmp1, tmp0);
+    dst0 = __lsx_vhaddw_hu_bu(dst0, dst0);
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
+    dst0 = __lsx_vsrari_w(dst0, 3);
+    dst0 = __lsx_vshuf4i_b(dst0, 0);
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+}
+
+#define INTRA_DC_TL_4X4(dir)                                            \
+void ff_dc_##dir##_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride,          \
+                          const uint8_t *left,                          \
+                          const uint8_t *top)                           \
+{                                                                       \
+    __m128i tmp0, dst0;                                                 \
+                                                                        \
+    tmp0 = __lsx_vldrepl_w(dir, 0);                                     \
+    dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0);                              \
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                              \
+    dst0 = __lsx_vsrari_w(dst0, 2);                                     \
+    dst0 = __lsx_vshuf4i_b(dst0, 0);                                    \
+    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
+    dst += dst_stride;                                                  \
+    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
+    dst += dst_stride;                                                  \
+    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
+    dst += dst_stride;                                                  \
+    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
+}
+INTRA_DC_TL_4X4(top);
+INTRA_DC_TL_4X4(left);
+
+void ff_dc_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{
+    __m128i tmp0, tmp1, dst0;
+
+    tmp0 = __lsx_vldrepl_d(src_top, 0);
+    tmp1 = __lsx_vldrepl_d(src_left, 0);
+    dst0 = __lsx_vilvl_d(tmp1, tmp0);
+    dst0 = __lsx_vhaddw_hu_bu(dst0, dst0);
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
+    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);
+    dst0 = __lsx_vsrari_w(dst0, 4);
+    dst0 = __lsx_vreplvei_b(dst0, 0);
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+}
+
+#define INTRA_DC_TL_8X8(dir)                                                  \
+void ff_dc_##dir##_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride,                \
+                           const uint8_t *left,                               \
+                           const uint8_t *top)                                \
+{                                                                             \
+    __m128i tmp0, dst0;                                                       \
+                                                                              \
+    tmp0 = __lsx_vldrepl_d(dir, 0);                                           \
+    dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0);                                    \
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                                    \
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);                                    \
+    dst0 = __lsx_vsrari_w(dst0, 3);                                           \
+    dst0 = __lsx_vreplvei_b(dst0, 0);                                         \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+    dst += dst_stride;                                                        \
+    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
+}
+
+INTRA_DC_TL_8X8(top);
+INTRA_DC_TL_8X8(left);
+
+void ff_dc_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{
+    __m128i tmp0, tmp1, dst0;
+    ptrdiff_t stride2 = dst_stride << 1;
+    ptrdiff_t stride3 = stride2 + dst_stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    tmp0 = __lsx_vld(src_top, 0);
+    tmp1 = __lsx_vld(src_left, 0);
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp0, tmp1);
+    dst0 = __lsx_vadd_h(tmp0, tmp1);
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
+    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);
+    dst0 = __lsx_vsrari_w(dst0, 5);
+    dst0 = __lsx_vreplvei_b(dst0, 0);
+    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,
+             dst_stride, stride2, stride3, stride4);
+    dst += stride4;
+    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,
+             dst_stride, stride2, stride3, stride4);
+}
+
+#define INTRA_DC_TL_16X16(dir)                                                \
+void ff_dc_##dir##_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,              \
+                             const uint8_t *left,                             \
+                             const uint8_t *top)                              \
+{                                                                             \
+    __m128i tmp0, dst0;                                                       \
+    ptrdiff_t stride2 = dst_stride << 1;                                      \
+    ptrdiff_t stride3 = stride2 + dst_stride;                                 \
+    ptrdiff_t stride4 = stride2 << 1;                                         \
+                                                                              \
+    tmp0 = __lsx_vld(dir, 0);                                                 \
+    dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0);                                    \
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                                    \
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);                                    \
+    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);                                    \
+    dst0 = __lsx_vsrari_w(dst0, 4);                                           \
+    dst0 = __lsx_vreplvei_b(dst0, 0);                                         \
+    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,             \
+             dst_stride, stride2, stride3, stride4);                          \
+    dst += stride4;                                                           \
+    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,             \
+             dst_stride, stride2, stride3, stride4);                          \
+}
+
+INTRA_DC_TL_16X16(top);
+INTRA_DC_TL_16X16(left);
+
+void ff_dc_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{
+    __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+
+    DUP2_ARG2(__lsx_vld, src_top, 0, src_top, 16, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vld, src_left, 0, src_left, 16, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2,
+              tmp3, tmp3, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp0, tmp1);
+    dst0 = __lsx_vadd_h(tmp0, tmp1);
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
+    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);
+    dst0 = __lsx_vsrari_w(dst0, 6);
+    dst0 = __lsx_vreplvei_b(dst0, 0);
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
+                dst, dst_stride);
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
+                dst, dst_stride);
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
+                dst, dst_stride);
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
+                dst, dst_stride);
+}
+
+#define INTRA_DC_TL_32X32(dir)                                               \
+void ff_dc_##dir##_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,             \
+                             const uint8_t *left,                            \
+                             const uint8_t *top)                             \
+{                                                                            \
+    __m128i tmp0, tmp1, dst0;                                                \
+                                                                             \
+    DUP2_ARG2(__lsx_vld, dir, 0, dir, 16, tmp0, tmp1);                       \
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp0, tmp1);       \
+    dst0 = __lsx_vadd_h(tmp0, tmp1);                                         \
+    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                                   \
+    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);                                   \
+    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);                                   \
+    dst0 = __lsx_vsrari_w(dst0, 5);                                          \
+    dst0 = __lsx_vreplvei_b(dst0, 0);                                        \
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
+                dst, dst_stride);                                            \
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
+                dst, dst_stride);                                            \
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
+                dst, dst_stride);                                            \
+    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
+                dst, dst_stride);                                            \
+}
+
+INTRA_DC_TL_32X32(top);
+INTRA_DC_TL_32X32(left);
+
+#define INTRA_PREDICT_VALDC_16X16_LSX(val)                             \
+void ff_dc_##val##_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{                                                                      \
+    __m128i out = __lsx_vldi(val);                                     \
+    ptrdiff_t stride2 = dst_stride << 1;                               \
+    ptrdiff_t stride3 = stride2 + dst_stride;                          \
+    ptrdiff_t stride4 = stride2 << 1;                                  \
+                                                                       \
+    LSX_ST_8(out, out, out, out, out, out, out, out, dst,              \
+             dst_stride, stride2, stride3, stride4);                   \
+    dst += stride4;                                                    \
+    LSX_ST_8(out, out, out, out, out, out, out, out, dst,              \
+             dst_stride, stride2, stride3, stride4);                   \
+}
+
+INTRA_PREDICT_VALDC_16X16_LSX(127);
+INTRA_PREDICT_VALDC_16X16_LSX(128);
+INTRA_PREDICT_VALDC_16X16_LSX(129);
+
+#define INTRA_PREDICT_VALDC_32X32_LSX(val)                               \
+void ff_dc_##val##_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,         \
+                             const uint8_t *left, const uint8_t *top)    \
+{                                                                        \
+    __m128i out = __lsx_vldi(val);                                       \
+                                                                         \
+    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
+    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
+    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
+    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
+}
+
+INTRA_PREDICT_VALDC_32X32_LSX(127);
+INTRA_PREDICT_VALDC_32X32_LSX(128);
+INTRA_PREDICT_VALDC_32X32_LSX(129);
+
+void ff_tm_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    __m128i tmp0, tmp1, tmp2, tmp3, reg0, reg1;
+    __m128i src0, src1, src2, src3;
+    __m128i dst0, dst1, dst2, dst3;
+
+    reg0 = __lsx_vreplgr2vr_h(top_left);
+    reg1 = __lsx_vld(src_top_ptr, 0);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left,
+              3, tmp3, tmp2, tmp1, tmp0);
+    DUP4_ARG2(__lsx_vilvl_b, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, reg1,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhaddw_hu_bu, src0, src0, src1, src1, src2, src2, src3,
+              src3, dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vssub_hu, dst0, reg0, dst1, reg0, dst2, reg0, dst3, reg0,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vsat_hu, dst0, 7, dst1, 7, dst2, 7, dst3, 7,
+              dst0, dst1, dst2, dst3);
+    DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(dst0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(dst1, dst, 0, 2);
+}
+
+void ff_tm_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i reg0, reg1;
+
+    reg0 = __lsx_vreplgr2vr_h(top_left);
+    reg1 = __lsx_vld(src_top_ptr, 0);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left,
+              3, tmp7, tmp6, tmp5, tmp4);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 4, src_left, 5, src_left, 6, src_left,
+              7, tmp3, tmp2, tmp1, tmp0);
+    DUP4_ARG2(__lsx_vilvl_b, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, reg1,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vilvl_b, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7, reg1,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhaddw_hu_bu, src0, src0, src1, src1, src2, src2, src3,
+              src3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhaddw_hu_bu, src4, src4, src5, src5, src6, src6, src7,
+              src7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, src5, src4, src7, src6,
+              src0, src1, src2, src3);
+    __lsx_vstelm_d(src0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(src0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(src1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(src1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(src2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(src2, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(src3, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(src3, dst, 0, 1);
+}
+
+void ff_tm_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i reg0, reg1;
+    ptrdiff_t stride2 = dst_stride << 1;
+    ptrdiff_t stride3 = stride2 + dst_stride;
+    ptrdiff_t stride4 = stride2 << 1;
+
+    reg0 = __lsx_vreplgr2vr_h(top_left);
+    reg1 = __lsx_vld(src_top_ptr, 0);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left,
+              3, tmp15, tmp14, tmp13, tmp12);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 4, src_left, 5, src_left, 6, src_left,
+              7, tmp11, tmp10, tmp9, tmp8);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 8, src_left, 9, src_left, 10,
+              src_left, 11, tmp7, tmp6, tmp5, tmp4);
+    DUP4_ARG2(__lsx_vldrepl_b, src_left, 12, src_left, 13, src_left, 14,
+              src_left, 15, tmp3, tmp2, tmp1, tmp0);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3,
+              reg1, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3,
+              reg1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7,
+              reg1, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7,
+              reg1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
+              tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp8, reg1, tmp9, reg1, tmp10, reg1, tmp11,
+              reg1, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp8, reg1, tmp9, reg1, tmp10, reg1, tmp11,
+              reg1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
+              tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp12, reg1, tmp13, reg1, tmp14, reg1,
+              tmp15, reg1, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp12, reg1, tmp13, reg1, tmp14, reg1,
+              tmp15, reg1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
+              tmp12, tmp13, tmp14, tmp15);
+    LSX_ST_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, dst,
+             dst_stride, stride2, stride3, stride4);
+    dst += stride4;
+    LSX_ST_8(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, dst,
+             dst_stride, stride2, stride3, stride4);
+}
+
+void ff_tm_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt;
+    __m128i tmp0, tmp1, tmp2, tmp3, reg0, reg1, reg2;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    reg0 = __lsx_vreplgr2vr_h(top_left);
+    DUP2_ARG2(__lsx_vld, src_top_ptr, 0, src_top_ptr, 16, reg1, reg2);
+
+    src_left += 28;
+    for (loop_cnt = 8; loop_cnt--;) {
+        DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2,
+                  src_left, 3, tmp3, tmp2, tmp1, tmp0);
+        src_left -= 4;
+        DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1,
+                  tmp3, reg1, src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1,
+                  tmp3, reg1, src4, src5, src6, src7);
+        DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3,
+                  reg0, src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7,
+                  reg0, src4, src5, src6, src7);
+        DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg2, tmp1, reg2, tmp2, reg2,
+                  tmp3, reg2, dst0, dst1, dst2, dst3);
+        DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg2, tmp1, reg2, tmp2, reg2,
+                  tmp3, reg2, dst4, dst5, dst6, dst7);
+        DUP4_ARG2(__lsx_vssub_hu, dst0, reg0, dst1, reg0, dst2, reg0, dst3,
+                  reg0, dst0, dst1, dst2, dst3);
+        DUP4_ARG2(__lsx_vssub_hu, dst4, reg0, dst5, reg0, dst6, reg0, dst7,
+                  reg0, dst4, dst5, dst6, dst7);
+        DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
+                  src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
+                  src4, src5, src6, src7);
+        DUP4_ARG2(__lsx_vsat_hu, dst0, 7, dst1, 7, dst2, 7, dst3, 7,
+                  dst0, dst1, dst2, dst3);
+        DUP4_ARG2(__lsx_vsat_hu, dst4, 7, dst5, 7, dst6, 7, dst7, 7,
+                  dst4, dst5, dst6, dst7);
+        DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7,
+                  src3, src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vpackev_b, dst4, dst0, dst5, dst1, dst6, dst2, dst7,
+                  dst3, dst0, dst1, dst2, dst3);
+        __lsx_vst(src0, dst, 0);
+        __lsx_vst(dst0, dst, 16);
+        dst += dst_stride;
+        __lsx_vst(src1, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        dst += dst_stride;
+        __lsx_vst(src2, dst, 0);
+        __lsx_vst(dst2, dst, 16);
+        dst += dst_stride;
+        __lsx_vst(src3, dst, 0);
+        __lsx_vst(dst3, dst, 16);
+        dst += dst_stride;
+    }
+}
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_loongarch.h"
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_lsx;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_lsx;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_lsx;
+
+#define init_subpel2(idx, idxh, idxv, dir, type)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type);
+
+#define init_subpel3(idx, type)         \
+    init_subpel2(idx, 1, 0, h, type);   \
+    init_subpel2(idx, 0, 1, v, type);   \
+    init_subpel2(idx, 1, 1, hv, type);
+
+#define init_fpel(idx1, idx2, sz, type)                                    \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = ff_##type##sz##_lsx;  \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = ff_##type##sz##_lsx;  \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = ff_##type##sz##_lsx;  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_lsx;
+
+#define init_copy(idx, sz)                    \
+    init_fpel(idx, 0, sz, copy);              \
+    init_fpel(idx, 1, sz, avg);
+
+#define init_intra_pred1_lsx(tx, sz)                            \
+    dsp->intra_pred[tx][VERT_PRED]    = ff_vert_##sz##_lsx;     \
+    dsp->intra_pred[tx][HOR_PRED]     = ff_hor_##sz##_lsx;      \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_lsx;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_lsx;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_lsx;   \
+    dsp->intra_pred[tx][DC_128_PRED]  = ff_dc_128_##sz##_lsx;   \
+    dsp->intra_pred[tx][DC_127_PRED]  = ff_dc_127_##sz##_lsx;   \
+    dsp->intra_pred[tx][DC_129_PRED]  = ff_dc_129_##sz##_lsx;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_lsx;       \
+
+#define init_intra_pred2_lsx(tx, sz)                            \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_lsx;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_lsx;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_lsx;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_lsx;       \
+
+#define init_idct(tx, nm)                        \
+    dsp->itxfm_add[tx][DCT_DCT]   =              \
+    dsp->itxfm_add[tx][ADST_DCT]  =              \
+    dsp->itxfm_add[tx][DCT_ADST]  =              \
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_lsx;
+
+#define init_itxfm(tx, sz)                                     \
+    dsp->itxfm_add[tx][DCT_DCT] = ff_idct_idct_##sz##_add_lsx;
+
+av_cold void ff_vp9dsp_init_loongarch(VP9DSPContext *dsp, int bpp)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_lsx(cpu_flags))
+        if (bpp == 8) {
+            init_subpel3(0, put);
+            init_subpel3(1, avg);
+            init_copy(0, 64);
+            init_copy(1, 32);
+            init_copy(2, 16);
+            init_copy(3, 8);
+            init_intra_pred1_lsx(TX_16X16, 16x16);
+            init_intra_pred1_lsx(TX_32X32, 32x32);
+            init_intra_pred2_lsx(TX_4X4, 4x4);
+            init_intra_pred2_lsx(TX_8X8, 8x8);
+            init_itxfm(TX_8X8, 8x8);
+            init_itxfm(TX_16X16, 16x16);
+            init_idct(TX_32X32, ff_idct_idct_32x32);
+            dsp->loop_filter_8[0][0] = ff_loop_filter_h_4_8_lsx;
+            dsp->loop_filter_8[0][1] = ff_loop_filter_v_4_8_lsx;
+            dsp->loop_filter_8[1][0] = ff_loop_filter_h_8_8_lsx;
+            dsp->loop_filter_8[1][1] = ff_loop_filter_v_8_8_lsx;
+            dsp->loop_filter_8[2][0] = ff_loop_filter_h_16_8_lsx;
+            dsp->loop_filter_8[2][1] = ff_loop_filter_v_16_8_lsx;
+
+            dsp->loop_filter_16[0] = ff_loop_filter_h_16_16_lsx;
+            dsp->loop_filter_16[1] = ff_loop_filter_v_16_16_lsx;
+
+            dsp->loop_filter_mix2[0][0][0] = ff_loop_filter_h_44_16_lsx;
+            dsp->loop_filter_mix2[0][0][1] = ff_loop_filter_v_44_16_lsx;
+            dsp->loop_filter_mix2[0][1][0] = ff_loop_filter_h_48_16_lsx;
+            dsp->loop_filter_mix2[0][1][1] = ff_loop_filter_v_48_16_lsx;
+            dsp->loop_filter_mix2[1][0][0] = ff_loop_filter_h_84_16_lsx;
+            dsp->loop_filter_mix2[1][0][1] = ff_loop_filter_v_84_16_lsx;
+            dsp->loop_filter_mix2[1][1][0] = ff_loop_filter_h_88_16_lsx;
+            dsp->loop_filter_mix2[1][1][1] = ff_loop_filter_v_88_16_lsx;
+        }
+}
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+#undef init_copy
+#undef init_fpel
+#undef init_intra_pred1_lsx
+#undef init_intra_pred2_lsx
+#undef init_idct
+#undef init_itxfm
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_VP9DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_VP9DSP_LOONGARCH_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx)                    \
+void ff_put_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);             \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);
+
+#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE)                          \
+void ff_copy##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my);                   \
+                                                                   \
+void ff_avg##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my);
+
+VP9_8TAP_LOONGARCH_LSX_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_LOONGARCH_LSX_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_LOONGARCH_LSX_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_LOONGARCH_LSX_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_LOONGARCH_LSX_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_LOONGARCH_LSX_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_LOONGARCH_LSX_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_LOONGARCH_LSX_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_LOONGARCH_LSX_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_LOONGARCH_LSX_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_LOONGARCH_LSX_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_LOONGARCH_LSX_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_LOONGARCH_LSX_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_LOONGARCH_LSX_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_LOONGARCH_LSX_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+VP9_COPY_LOONGARCH_LSX_FUNC(64);
+VP9_COPY_LOONGARCH_LSX_FUNC(32);
+VP9_COPY_LOONGARCH_LSX_FUNC(16);
+VP9_COPY_LOONGARCH_LSX_FUNC(8);
+
+#undef VP9_8TAP_LOONGARCH_LSX_FUNC
+#undef VP9_COPY_LOONGARCH_LSX_FUNC
+
+void ff_vert_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_vert_32x32_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_hor_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_hor_32x32_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_dc_4x4_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_8x8_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_32x32_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_left_4x4_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_8x8_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_16x16_lsx(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_left_32x32_lsx(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_top_4x4_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_8x8_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_16x16_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_top_32x32_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_16x16_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_32x32_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_16x16_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_32x32_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_16x16_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_32x32_lsx(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_tm_4x4_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_8x8_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_tm_32x32_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_idct_idct_8x8_add_lsx(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_16x16_add_lsx(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_idct_idct_32x32_add_lsx(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+
+#endif /* AVCODEC_LOONGARCH_VP9DSP_LOONGARCH_H */