Lakka-LibreELEC/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
From 504df93cfe5416b394755e79b7b81ee0119cf09c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 26 Apr 2021 12:34:50 +0100
Subject: [PATCH 001/161] Add pi configs and scripts
---
pi-util/BUILD.txt | 59 ++++++++
pi-util/NOTES.txt | 69 +++++++++
pi-util/TESTMESA.txt | 82 +++++++++++
pi-util/clean_usr_libs.sh | 26 ++++
pi-util/conf_arm64_native.sh | 45 ++++++
pi-util/conf_h265.2016.csv | 195 ++++++++++++++++++++++++++
pi-util/conf_h265.2016_HEVC_v1.csv | 147 ++++++++++++++++++++
pi-util/conf_h265.csv | 144 +++++++++++++++++++
pi-util/conf_native.sh | 108 +++++++++++++++
pi-util/ffconf.py | 215 +++++++++++++++++++++++++++++
pi-util/ffperf.py | 128 +++++++++++++++++
pi-util/genpatch.sh | 35 +++++
pi-util/make_array.py | 23 +++
pi-util/mkinst.sh | 5 +
pi-util/patkodi.sh | 9 ++
pi-util/perfcmp.py | 101 ++++++++++++++
pi-util/qem.sh | 9 ++
pi-util/v3dusage.py | 128 +++++++++++++++++
18 files changed, 1528 insertions(+)
create mode 100644 pi-util/BUILD.txt
create mode 100644 pi-util/NOTES.txt
create mode 100644 pi-util/TESTMESA.txt
create mode 100755 pi-util/clean_usr_libs.sh
create mode 100644 pi-util/conf_arm64_native.sh
create mode 100644 pi-util/conf_h265.2016.csv
create mode 100644 pi-util/conf_h265.2016_HEVC_v1.csv
create mode 100644 pi-util/conf_h265.csv
create mode 100755 pi-util/conf_native.sh
create mode 100755 pi-util/ffconf.py
create mode 100755 pi-util/ffperf.py
create mode 100755 pi-util/genpatch.sh
create mode 100755 pi-util/make_array.py
create mode 100755 pi-util/mkinst.sh
create mode 100644 pi-util/patkodi.sh
create mode 100755 pi-util/perfcmp.py
create mode 100755 pi-util/qem.sh
create mode 100755 pi-util/v3dusage.py
diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
new file mode 100644
index 0000000000..b050971f63
--- /dev/null
+++ b/pi-util/BUILD.txt
@@ -0,0 +1,59 @@
+Building Pi FFmpeg
+==================
+
+Currently only building on a Pi is supported.
+This builds ffmpeg the way I've tested it.
+
+Get all dependencies - the current package dependencies are good enough
+
+$ sudo apt-get build-dep ffmpeg
+
+Configure using the pi-util/conf_native.sh script
+-------------------------------------------------
+
+This sets the normal release options and creates an output dir to build into.
+The directory name will depend on system and options but will be under out/.
+
+There are a few choices here:
+ --mmal      Build including the legacy MMAL-based decoders and zero-copy code.
+             This requires appropriate libraries which currently exist for
+             armv7 but not arm64.
+ --noshared
+             Build a static image rather than a shared library one. Static is
+             easier for testing as there is no need to worry about library
+             paths being confused and therefore running the wrong code. Shared
+             is what is needed, in most cases, when building for use by other
+             programs.
+
+So for a static build
+---------------------
+
+$ pi-util/conf_native.sh --noshared
+
+$ make -j8 -C out/<wherever the script said it was building to>
+
+You can now run ffmpeg directly from where it was built
+
+For a shared build
+------------------
+
+$ pi-util/conf_native.sh
+
+You will normally want an install target if shared. Note that the script has
+set this up to be generated in out/<builddir>/install, so you don't have to
+worry about overwriting your system libs.
+
+$ make -j8 -C out/<builddir> install
+
+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
+built, or install the image on the system. You have to be careful to get rid
+of all other ffmpeg libs or confusion may result. There is a little script
+that wipes all other versions - obviously use with care!
+
+$ sudo pi-util/clean_usr_libs.sh
+
+Then simply copying from the install directory to /usr works:
+
+$ sudo cp -r out/<builddir>/install/* /usr
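+
+As an illustration only (the exact <builddir> name depends on your system, and
+the lib/<arch> subdirectory follows the --libdir layout that conf_native.sh
+chooses - arm-linux-gnueabihf shown here for 32-bit Pi OS), running the shared
+build in place without installing it system-wide might look like:
+
+$ export LD_LIBRARY_PATH=$(pwd)/out/<builddir>/install/lib/arm-linux-gnueabihf
+$ out/<builddir>/ffmpeg -version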
+
+
diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt
new file mode 100644
index 0000000000..fcce72226a
--- /dev/null
+++ b/pi-util/NOTES.txt
@@ -0,0 +1,69 @@
+Notes on the hevc_rpi decoder & associated support code
+-------------------------------------------------------
+
+There are 3 main parts to the existing code:
+
+1) The decoder - this is all in libavcodec as rpi_hevc*.
+
+2) A few filters to deal with Sand frames and a small patch to
+automatically select the sand->i420 converter when required.
+
+3) A kludge in ffmpeg.c to display the decoded video. This could & should
+be converted into a proper ffmpeg display module.
+
+
+Decoder
+-------
+
+The decoder is a modified version of the existing ffmpeg hevc decoder.
+Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
+More complex bitstreams can be up to ~200% faster but particularly easy
+streams can cut its advantage down to ~50%. This means that a Pi3+ can
+display nearly all 8-bit 1080p30 streams and with some overclocking it can
+display most lower bitrate 10-bit 1080p30 streams - this latter case is
+not helped by the requirement to downsample to 8-bit before display on a
+Pi.
+
+It has had co-processor offload added for inter-pred and large block
+residual transform. Various parts have had optimized ARM NEON assembler
+added and the existing ARM asm sections have been profiled and
+re-optimized for A53. The main C code has been substantially reworked at
+its lower levels in an attempt to optimize it and minimize memory
+bandwidth. To some extent code paths that deal with frame types that it
+doesn't support have been pruned.
+
+It outputs frames in Broadcom Sand format. This is a somewhat annoying
+layout that doesn't fit into ffmpeg's standard frame descriptions. It has
+vertical stripes of 128 horizontal pixels (64 in the 10-bit form) with Y for
+the stripe followed by interleaved U & V, which is then followed by the Y
+for the next stripe, etc. The final stripe is always padded to
+stripe-width. This layout is used in an attempt to help with cache locality
+and to cut down on the number of DRAM bank switches. It is annoying to use for
+inter-pred with conventional processing, but the way the Pi QPU (which is
+used for inter-pred) works means that it has negligible downsides here, and
+the improved memory performance exceeds the overhead of the increased
+complexity in the rest of the code.
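+
+As a rough illustration of the layout just described (a sketch for orientation
+only, not code from the decoder - it assumes the 8-bit 4:2:0 case, with each
+128-wide stripe holding all of its Y rows followed by its interleaved UV rows):
+
+    STRIPE_W = 128                       # 64 in the 10-bit form
+
+    def sand8_y_offset(x, y, height):
+        # bytes per stripe: height rows of Y plus height/2 rows of interleaved UV
+        stripe_bytes = STRIPE_W * height + STRIPE_W * (height // 2)
+        return (x // STRIPE_W) * stripe_bytes + y * STRIPE_W + x % STRIPE_W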
+
+Frames must be allocated out of GPU memory (as otherwise they can't be
+accessed by the co-processors). Utility functions (in rpi_zc.c) have been
+written to make this easier. As the frames are already in GPU memory they
+can be displayed by the Pi h/w without any further copying.
+
+
+Known non-features
+------------------
+
+Frame allocation should probably be done in some other way in order to fit
+into the standard framework better.
+
+Sand frames are currently declared as software frames; there is an
+argument that they should be hardware frames, but they aren't really.
+
+There must be a better way of auto-selecting the hevc_rpi decoder over the
+normal s/w hevc decoder, but I became confused by the existing h/w
+acceleration framework and what I wanted to do didn't seem to fit in
+neatly.
+
+Display should be a proper device rather than a kludge in ffmpeg.c
+
+
diff --git a/pi-util/TESTMESA.txt b/pi-util/TESTMESA.txt
new file mode 100644
index 0000000000..92bc13a3df
--- /dev/null
+++ b/pi-util/TESTMESA.txt
@@ -0,0 +1,82 @@
+# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
+
+# These assume that the drm_mmal test for Sand8 has been built on this Pi
+# as the build relies on many of the same files
+
+# 1st get everything required to build ffmpeg
+# If sources aren't already enabled on your Pi then enable them
+sudo su
+sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
+sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
+mv /tmp/sources.list /etc/apt/
+mv /tmp/raspi.list /etc/apt/sources.list.d/
+apt update
+
+# Get dependencies
+sudo apt build-dep ffmpeg
+
+sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
+
+# Enable H265 V4L2 request decoder
+sudo su
+echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
+# You may also want to add more CMA if you are going to try 4k videos
+# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
+# dtoverlay=vc4-fkms-v3d,cma-512
+reboot
+# Check it has turned up
+ls -la /dev/video*
+# This should include video19
+# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19
+
+# Currently on the Pi the Linux headers from the Debian distro don't match
+# the kernel that we ship, so we need to update them - hopefully this step
+# will be unneeded in the future
+sudo apt install git bc bison flex libssl-dev make
+git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
+cd linux
+KERNEL=kernel7l
+make bcm2711_defconfig
+make headers_install
+sudo cp -r usr/include/linux /usr/include
+cd ..
+
+# Config - this builds a statically linked ffmpeg which is easier for testing
+pi-util/conf_native.sh --noshared
+
+# Build (this is a bit dull)
+# If you want to poke the source, libavdevice/egl_vout.c contains the
+# output code
+cd out/armv7-static-rel
+
+# Check that you have actually configured V4L2 request
+grep HEVC_V4L2REQUEST config.h
+# You are hoping for
+# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
+# if you get 0 then the config has failed
+
+make -j6
+
+# Grab test streams
+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
+
+# Test i420 output (works currently)
+./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
+
+# Test Sand8 output - doesn't currently work but should once you have
+# Sand8 working in drm_mmal. I can't guarantee that this will work as
+# I can't test this path with a known working format, but the debug looks
+# good. If this doesn't work & drm_mmal does with sand8 then come back to me
+# The "show_all 1" forces vout to display every frame otherwise it drops any
+# frame that would cause it to block
+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
+
+# Test Sand30 - doesn't currently work
+# (Beware that when FFmpeg errors out it often leaves your terminal window
+# in a state where you need to reset it)
+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
+
+
+
diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh
new file mode 100755
index 0000000000..b3b2d5509d
--- /dev/null
+++ b/pi-util/clean_usr_libs.sh
@@ -0,0 +1,26 @@
+set -e
+U=/usr/lib/arm-linux-gnueabihf
+rm -f $U/libavcodec.*
+rm -f $U/libavdevice.*
+rm -f $U/libavfilter.*
+rm -f $U/libavformat.*
+rm -f $U/libavutil.*
+rm -f $U/libswresample.*
+rm -f $U/libswscale.*
+U=/usr/lib/arm-linux-gnueabihf/neon/vfp
+rm -f $U/libavcodec.*
+rm -f $U/libavdevice.*
+rm -f $U/libavfilter.*
+rm -f $U/libavformat.*
+rm -f $U/libavutil.*
+rm -f $U/libswresample.*
+rm -f $U/libswscale.*
+U=/usr/lib/aarch64-linux-gnu
+rm -f $U/libavcodec.*
+rm -f $U/libavdevice.*
+rm -f $U/libavfilter.*
+rm -f $U/libavformat.*
+rm -f $U/libavutil.*
+rm -f $U/libswresample.*
+rm -f $U/libswscale.*
+
diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh
new file mode 100644
index 0000000000..9e3bbfa190
--- /dev/null
+++ b/pi-util/conf_arm64_native.sh
@@ -0,0 +1,45 @@
+echo "Configure for ARM64 native build"
+
+#RPI_KEEPS="-save-temps=obj"
+
+SHARED_LIBS="--enable-shared"
+if [ "$1" == "--noshared" ]; then
+ SHARED_LIBS="--disable-shared"
+ echo Static libs
+ OUT=out/arm64-static-rel
+else
+ echo Shared libs
+ OUT=out/arm64-shared-rel
+fi
+
+mkdir -p $OUT
+cd $OUT
+
+A=aarch64-linux-gnu
+USR_PREFIX=`pwd`/install
+LIB_PREFIX=$USR_PREFIX/lib/$A
+INC_PREFIX=$USR_PREFIX/include/$A
+
+../../configure \
+ --prefix=$USR_PREFIX\
+ --libdir=$LIB_PREFIX\
+ --incdir=$INC_PREFIX\
+ --disable-stripping\
+ --disable-thumb\
+ --disable-mmal\
+ --enable-sand\
+ --enable-v4l2-request\
+ --enable-libdrm\
+ --enable-epoxy\
+ --enable-libudev\
+ --enable-vout-drm\
+ --enable-vout-egl\
+ $SHARED_LIBS\
+ --extra-cflags="-ggdb"
+
+# --enable-decoder=hevc_rpi\
+# --enable-extra-warnings\
+# --arch=armv71\
+
+# gcc option for getting asm listing
+# -Wa,-ahls
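+
+# Note (added): like conf_native.sh, this script expects to be run from the top
+# of the ffmpeg source tree - it cd's into out/<dir> and then invokes
+# ../../configure relative to that.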
diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
new file mode 100644
index 0000000000..4efd5d1c67
--- /dev/null
+++ b/pi-util/conf_h265.2016.csv
@@ -0,0 +1,195 @@
+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
new file mode 100644
index 0000000000..6082641271
--- /dev/null
+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
@@ -0,0 +1,147 @@
+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
new file mode 100644
index 0000000000..fc14f2a3c2
--- /dev/null
+++ b/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
new file mode 100755
index 0000000000..65576846e8
--- /dev/null
+++ b/pi-util/conf_native.sh
@@ -0,0 +1,108 @@
+echo "Configure for native build"
+
+FFSRC=`pwd`
+MC=`dpkg --print-architecture`
+BUILDBASE=$FFSRC/out
+
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
+NOSHARED=
+MMAL=
+
+while [ "$1" != "" ] ; do
+ case $1 in
+ --noshared)
+ NOSHARED=1
+ ;;
+ --mmal)
+ MMAL=1
+ ;;
+ *)
+ echo "Usage $0: [--noshared] [--mmal]"
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+
+MCOPTS=
+RPI_INCLUDES=
+RPI_LIBDIRS=
+RPI_DEFINES=
+RPI_EXTRALIBS=
+
+if [ "$MC" == "arm64" ]; then
+ echo "M/C aarch64"
+ A=aarch64-linux-gnu
+ B=arm64
+elif [ "$MC" == "armhf" ]; then
+ echo "M/C armv7"
+ A=arm-linux-gnueabihf
+ B=armv7
+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
+ RPI_DEFINES=-mfpu=neon-vfpv4
+else
+ echo Unexpected architecture $MC
+ exit 1
+fi
+
+if [ $MMAL ]; then
+ RPI_OPT_VC=/opt/vc
+ RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+ RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
+ RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
+ RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
+ RPIOPTS="--enable-mmal --enable-rpi"
+else
+ RPIOPTS="--disable-mmal --enable-sand"
+fi
+
+C=`lsb_release -sc`
+V=`cat RELEASE`
+
+SHARED_LIBS="--enable-shared"
+if [ $NOSHARED ]; then
+ SHARED_LIBS="--disable-shared"
+ OUT=$BUILDBASE/$B-$C-$V-static-rel
+ echo Static libs
+else
+ echo Shared libs
+ OUT=$BUILDBASE/$B-$C-$V-shared-rel
+fi
+
+USR_PREFIX=$OUT/install
+LIB_PREFIX=$USR_PREFIX/lib/$A
+INC_PREFIX=$USR_PREFIX/include/$A
+
+echo Destination directory: $OUT
+mkdir -p $OUT
+# Nothing under here need worry git - including this .gitignore!
+echo "**" > $BUILDBASE/.gitignore
+cd $OUT
+
+$FFSRC/configure \
+ --prefix=$USR_PREFIX\
+ --libdir=$LIB_PREFIX\
+ --incdir=$INC_PREFIX\
+ $MCOPTS\
+ --disable-stripping\
+ --disable-thumb\
+ --enable-v4l2-request\
+ --enable-libdrm\
+ --enable-epoxy\
+ --enable-libudev\
+ --enable-vout-egl\
+ --enable-vout-drm\
+ $SHARED_LIBS\
+ $RPIOPTS\
+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS"\
+ --extra-libs="$RPI_EXTRALIBS"\
+ --extra-version="rpi"
+
+
+# gcc option for getting asm listing
+# -Wa,-ahls
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100755
index 0000000000..657568014e
--- /dev/null
+++ b/pi-util/ffconf.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+import string
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+CODEC_HEVC_RPI = 1
+HWACCEL_RPI = 2
+HWACCEL_DRM = 3
+HWACCEL_VAAPI = 4
+
+def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
+ hwaccel = ""
+ if dectype == HWACCEL_RPI:
+ hwaccel = "rpi"
+ elif dectype == HWACCEL_DRM:
+ hwaccel = "drm"
+ elif dectype == HWACCEL_VAAPI:
+ hwaccel = "vaapi"
+
+ pix_fmt = []
+ if pix == "8":
+ pix_fmt = ["-pix_fmt", "yuv420p"]
+ elif pix == "10":
+ pix_fmt = ["-pix_fmt", "yuv420p10le"]
+ elif pix == "12":
+ pix_fmt = ["-pix_fmt", "yuv420p12le"]
+
+ tmp_root = "/tmp"
+
+ names = srcname.split('/')
+ while len(names) > 1:
+ tmp_root = os.path.join(tmp_root, names[0])
+ del names[0]
+ name = names[0]
+
+ if not os.path.exists(tmp_root):
+ os.makedirs(tmp_root)
+
+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
+ try:
+ os.remove(dec_file)
+ except:
+ pass
+
+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+
+ ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
+
+ # Unaligned needed for cropping conformance
+ if hwaccel:
+ rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
+ else:
+ rstr = subprocess.call(
+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
+ m1 = None
+ m2 = None
+ with open(os.path.join(fileroot, md5_file)) as f:
+ for line in f:
+ m1 = re.search("[0-9a-f]{32}", line.lower())
+ if m1:
+ break
+
+ with open(dec_file) as f:
+ m2 = re.search("[0-9a-f]{32}", f.readline())
+ except:
+ pass
+
+ if m1 and m2 and m1.group() == m2.group():
+ print("Match: " + m1.group(), file=flog)
+ rv = 0
+ elif not m1:
+ print("****** Cannot find m1", file=flog)
+ rv = 3
+ elif not m2:
+ print("****** Cannot find m2", file=flog)
+ rv = 2
+ else:
+ print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
+ rv = 1
+ flog.close()
+ return rv
+
+def scandir(root):
+ aconf = []
+ ents = os.listdir(root)
+ ents.sort(key=str.lower)
+ for name in ents:
+ test_path = os.path.join(root, name)
+ if S_ISDIR(os.stat(test_path).st_mode):
+ files = os.listdir(test_path)
+ es_file = "?"
+ md5_file = "?"
+ for f in files:
+ (base, ext) = os.path.splitext(f)
+ if base[0] == '.':
+ pass
+ elif ext == ".bit" or ext == ".bin":
+ es_file = f
+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
+ if md5_file == "?":
+ md5_file = f
+ elif base[-3:] == "yuv":
+ md5_file = f
+ aconf.append((1, name, es_file, md5_file))
+ return aconf
+
+def runtest(name, tests):
+ if not tests:
+ return True
+ for t in tests:
+ if name[0:len(t)] == t or name.find("/" + t) != -1:
+ return True
+ return False
+
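+# Note (added commentary, derived from the checks in doconf() below rather than
+# any separate spec): the first CSV column is an expectation flag - 0 skips the
+# test, 1 expects an md5 match, 2 expects a mismatch, and 3 expects decode to
+# fail before an md5 is produced (reported as "abort").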
+def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
+ unx_failures = []
+ unx_success = []
+ failures = 0
+ successes = 0
+ for a in csva:
+ exp_test = int(a[0])
+ if (exp_test and runtest(a[1], tests)):
+ name = a[1]
+ print ("==== ", name, end="")
+ sys.stdout.flush()
+
+ rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
+ if (rv == 0):
+ successes += 1
+ else:
+ failures += 1
+
+ if (rv == 0):
+ if exp_test == 2:
+ print(": * OK *")
+ unx_success.append(name)
+ else:
+ print(": ok")
+ elif exp_test == 2 and rv == 1:
+ print(": fail")
+ elif exp_test == 3 and rv == 2:
+ # Call an expected "crash" an abort
+ print(": abort")
+ else:
+ unx_failures.append(name)
+ if rv == 1:
+ print(": * FAIL *")
+ elif (rv == 2) :
+ print(": * CRASH *")
+ elif (rv == 3) :
+ print(": * MD5 MISSING *")
+ else :
+ print(": * BANG *")
+
+ if unx_failures or unx_success:
+ print("Unexpected Failures:", unx_failures)
+ print("Unexpected Success: ", unx_success)
+ else:
+ print("All tests normal:", successes, "ok,", failures, "failed")
+
+
+class ConfCSVDialect(csv.Dialect):
+ delimiter = ','
+ doublequote = True
+ lineterminator = '\n'
+ quotechar='"'
+ quoting = csv.QUOTE_MINIMAL
+ skipinitialspace = True
+ strict = True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+ argp.add_argument("tests", nargs='*')
+ argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
+ argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
+ argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
+ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
+ args = argp.parse_args()
+
+ if args.csvgen:
+ csv.writer(sys.stdout).writerows(scandir(args.test_root))
+ exit(0)
+
+ with open(args.csv, 'rt') as csvfile:
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+ dectype = CODEC_HEVC_RPI
+ if os.path.exists("/dev/rpivid-hevcmem"):
+ dectype = HWACCEL_RPI
+ if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
+ dectype = HWACCEL_DRM
+
+ if args.pi4:
+ dectype = HWACCEL_RPI
+ elif args.drm:
+ dectype = HWACCEL_DRM
+ elif args.vaapi:
+ dectype = HWACCEL_VAAPI
+
+ doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
+
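+# Example invocation (illustrative; run from the ffmpeg build directory so that
+# the default --ffmpeg of ./ffmpeg resolves, or pass --ffmpeg explicitly):
+#   cd out/<builddir>
+#   ../../pi-util/ffconf.py --csv ../../pi-util/conf_h265.2016.csv WPP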
diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
new file mode 100755
index 0000000000..65c5224cd8
--- /dev/null
+++ b/pi-util/ffperf.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import time
+import string
+import os
+import tempfile
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class tstats:
+ close_threshold = 0.01
+
+ def __init__(self, stats_dict=None):
+ if stats_dict != None:
+ self.name = stats_dict["name"]
+ self.elapsed = float(stats_dict["elapsed"])
+ self.user = float(stats_dict["user"])
+ self.sys = float(stats_dict["sys"])
+
+ def times_str(self):
+ ctime = self.sys + self.user
+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+
+ def dict(self):
+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+
+ def is_close(self, other):
+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+
+ def __lt__(self, other):
+ return self.elapsed < other.elapsed
+ def __gt__(self, other):
+ return self.elapsed > other.elapsed
+
+ def time_file(name, prefix, ffmpeg="./ffmpeg"):
+ stats = tstats()
+ stats.name = name
+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
+ cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
+ "-vcodec", "hevc_rpi",
+ "-t", "30", "-i", prefix + name,
+ "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
+ pinfo = os.wait4(cproc.pid, 0)
+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
+ stats.elapsed = end_time - start_time
+ stats.user = pinfo[2].ru_utime
+ stats.sys = pinfo[2].ru_stime
+ return stats
+
+
+def common_prefix(s1, s2):
+ for i in range(min(len(s1),len(s2))):
+ if s1[i] != s2[i]:
+ return s1[:i]
+ return s1[:i+1]
+
+def main():
+ global flog
+
+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
+To blank the screen before starting use "xdg-screensaver activate"
+(For some reason this doesn't seem to work from within python).
+""")
+
+ argp.add_argument("streams", nargs='*')
+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
+ argp.add_argument("--csv_in", help="CSV input filename")
+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
+ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
+
+ args = argp.parse_args()
+
+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
+ csv_out.writeheader()
+
+ stats_in = {}
+ if args.csv_in != None:
+ with open(args.csv_in, 'r', newline='') as f_in:
+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
+
+ streams = args.streams
+ if not streams:
+ if not stats_in:
+ print ("No source streams specified")
+ return 1
+ prefix = "" if args.prefix == None else args.prefix
+ streams = [k for k in stats_in]
+ elif args.prefix != None:
+ prefix = args.prefix
+ else:
+ prefix = streams[0]
+ for f in streams[1:]:
+ prefix = common_prefix(prefix, f)
+ pp = prefix.rpartition(os.sep)
+ prefix = pp[0] + pp[1]
+ streams = [s[len(prefix):] for s in streams]
+
+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
+ print ("====", f)
+
+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
+ for i in range(args.repeat):
+ t = tstats.time_file(f, prefix, args.ffmpeg)
+ print ("...", t.times_str())
+ if t0 > t:
+ t0 = t
+
+ if t0.name in stats_in:
+ pstat = stats_in[t0.name]
+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
+
+ csv_out.writerow(t0.dict())
+
+ print ()
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
+
diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh
new file mode 100755
index 0000000000..0948a68a7a
--- /dev/null
+++ b/pi-util/genpatch.sh
@@ -0,0 +1,35 @@
+set -e
+
+NOPATCH=
+if [ "$1" == "--notag" ]; then
+ shift
+ NOPATCH=1
+fi
+
+if [ "$1" == "" ]; then
+ echo Usage: $0 [--notag] \<patch_tag\>
+ echo e.g.: $0 mmal_4
+ exit 1
+fi
+
+VERSION=`cat RELEASE`
+if [ "$VERSION" == "" ]; then
+ echo Can\'t find version in RELEASE
+ exit 1
+fi
+
+PATCHFILE=../ffmpeg-$VERSION-$1.patch
+
+if [ $NOPATCH ]; then
+ echo Not tagged
+else
+ # Only continue if we are all committed
+ git diff --name-status --exit-code
+
+ PATCHTAG=pi/$VERSION/$1
+ echo Tagging: $PATCHTAG
+
+ git tag $PATCHTAG
+fi
+echo Generating patch: $PATCHFILE
+git diff n$VERSION -- > $PATCHFILE
diff --git a/pi-util/make_array.py b/pi-util/make_array.py
new file mode 100755
index 0000000000..67b22d2d51
--- /dev/null
+++ b/pi-util/make_array.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+# Usage
+# make_array file.bin
+# Produces file.h with array of bytes.
+#
+import sys
+for file in sys.argv[1:]:
+ prefix,suffix = file.split('.')
+ assert suffix=='bin'
+ name=prefix.split('/')[-1]
+ print 'Converting',file
+ with open(prefix+'.h','wb') as out:
+ print >>out, 'static const unsigned char',name,'[] = {'
+ with open(file,'rb') as fd:
+ i = 0
+ for byte in fd.read():
+ print >>out, '0x%02x, ' % ord(byte),
+ i = i + 1
+ if i % 8 == 0:
+ print >>out, ' // %04x' % (i - 8)
+ print >>out,'};'
+
diff --git a/pi-util/mkinst.sh b/pi-util/mkinst.sh
new file mode 100755
index 0000000000..271a39e846
--- /dev/null
+++ b/pi-util/mkinst.sh
@@ -0,0 +1,5 @@
+set -e
+
+make install
+
+cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
diff --git a/pi-util/patkodi.sh b/pi-util/patkodi.sh
new file mode 100644
index 0000000000..dcd05a606e
--- /dev/null
+++ b/pi-util/patkodi.sh
@@ -0,0 +1,9 @@
+set -e
+KODIBASE=/home/jc/rpi/kodi/xbmc
+JOBS=-j20
+make $JOBS
+git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
+make -C $KODIBASE/build install
+
+
diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py
new file mode 100755
index 0000000000..e44cfa0c3c
--- /dev/null
+++ b/pi-util/perfcmp.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+
+import time
+import string
+import os
+import tempfile
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class tstats:
+ close_threshold = 0.01
+
+ def __init__(self, stats_dict=None):
+ if stats_dict != None:
+ self.name = stats_dict["name"]
+ self.elapsed = float(stats_dict["elapsed"])
+ self.user = float(stats_dict["user"])
+ self.sys = float(stats_dict["sys"])
+
+ def times_str(self):
+ ctime = self.sys + self.user
+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+
+ def dict(self):
+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+
+ def is_close(self, other):
+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+
+ def __lt__(self, other):
+ return self.elapsed < other.elapsed
+ def __gt__(self, other):
+ return self.elapsed > other.elapsed
+
+ def time_file(name, prefix):
+ stats = tstats()
+ stats.name = name
+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
+ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
+ pinfo = os.wait4(cproc.pid, 0)
+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
+ stats.elapsed = end_time - start_time
+ stats.user = pinfo[2].ru_utime
+ stats.sys = pinfo[2].ru_stime
+ return stats
+
+
+def common_prefix(s1, s2):
+ for i in range(min(len(s1),len(s2))):
+ if s1[i] != s2[i]:
+ return s1[:i]
+ return s1[:i+1]
+
+def main():
+ argp = argparse.ArgumentParser(description="FFmpeg performance compare")
+
+ argp.add_argument("stream0", help="CSV to compare")
+ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
+
+ args = argp.parse_args()
+
+ with open(args.stream0, 'r', newline='') as f_in:
+ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+ with open(args.stream1, 'r', newline='') as f_in:
+ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
+ print (args.stream0, "<<-->>", args.stream1)
+ print ()
+
+ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
+ if not (f in stats0) :
+ print (" XX :", f)
+ continue
+ if not (f in stats1) :
+ print (" XX :", f)
+ continue
+
+ s0 = stats0[f]
+ s1 = stats1[f]
+
+ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
+ thresh = 0.3
+ tc = 6
+
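+        # The arrow bar is a quick visual cue: each '<' or '>' stands for ~0.3%
+        # (thresh) of elapsed-time difference, capped at tc-1 characters, and
+        # the arrows point towards the faster of the two runs.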
+ nchar = min(tc - 1, int(abs(pcent) / thresh))
+ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
+
+ print ("%6.2f %s%6.2f (%+5.2f) : %s" %
+ (s0.elapsed, cc, s1.elapsed, pcent, f))
+
+ return 0
+
+
+if __name__ == '__main__':
+ exit(main())
+
diff --git a/pi-util/qem.sh b/pi-util/qem.sh
new file mode 100755
index 0000000000..a4dbb6eacd
--- /dev/null
+++ b/pi-util/qem.sh
@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+QASM=python\ ../local/bin/qasm.py
+SRC_FILE=libavcodec/rpi_hevc_shader.qasm
+DST_BASE=shader
+
+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
new file mode 100755
index 0000000000..5935a11ca5
--- /dev/null
+++ b/pi-util/v3dusage.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import re
+
+def do_logparse(logname):
+
+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
+
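+    # Accumulators: ttotal[unit] is busy seconds per unit ('idle' counts time
+    # when no unit had an op in flight), qctotal/qtstotal hold QPU clock and
+    # TMU-stall cycle counts, and l2hits/l2total track L2 cache statistics.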
+ ttotal = {'idle':0.0}
+ tstart = {}
+ qctotal = {}
+ qtstotal = {}
+ l2hits = {}
+ l2total = {}
+ time0 = None
+ idle_start = None
+ qpu_op_no = 0
+ op_count = 0
+
+ with open(logname, "rt") as infile:
+ for line in infile:
+ match = rmatch.match(line)
+ if match:
+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
+ time = float(match.group(1))
+ unit = match.group(3)
+ opstart = not match.group(2)
+ optype = match.group(7)
+ hascb = match.group(8) != "0"
+
+ if unit == 'qpu1':
+ unit = unit + "." + str(qpu_op_no)
+ if not opstart:
+ if hascb or optype == 'EXECUTE_SYNC':
+ qpu_op_no = 0
+ else:
+ qpu_op_no += 1
+
+ # Ignore sync type
+ if optype == 'EXECUTE_SYNC':
+ continue
+
+ if not time0:
+ time0 = time
+
+ if opstart:
+ tstart[unit] = time;
+ elif unit in tstart:
+ op_count += 1
+ if not unit in ttotal:
+ ttotal[unit] = 0.0
+ ttotal[unit] += time - tstart[unit]
+ del tstart[unit]
+
+ if not idle_start and not tstart:
+ idle_start = time
+ elif idle_start and tstart:
+ ttotal['idle'] += time - idle_start
+ idle_start = None
+
+ match = rqcycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qctotal:
+ qctotal[unit] = 0
+ qctotal[unit] += int(match.group(2))
+
+ match = rqtscycle.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in qtstotal:
+ qtstotal[unit] = 0
+ qtstotal[unit] += int(match.group(2))
+
+ match = rl2hits.match(line)
+ if match:
+ unit = "qpu1." + str(qpu_op_no)
+ if not unit in l2total:
+ l2total[unit] = 0
+ l2hits[unit] = 0
+ l2total[unit] += int(match.group(3))
+ if match.group(2) == "hits":
+ l2hits[unit] += int(match.group(3))
+
+
+ if not time0:
+ print "No v3d profile records found"
+ else:
+ tlogged = time - time0
+
+ print "Logged time:", tlogged, " Op count:", op_count
+ for unit in sorted(ttotal):
+ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
+ print
+ for unit in sorted(qctotal):
+ if not unit in qtstotal:
+ qtstotal[unit] = 0;
+ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
+ if unit in l2total:
+ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
+
+
+
+if __name__ == '__main__':
+ argp = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="QPU/VPU perf summary from VC logging",
+ epilog = """
+Will also summarise TMU stalls if logging requests set in qpu noflush param
+in the profiled code.
+
+Example use:
+ vcgencmd set_logging level=0xc0
+ <command to profile>
+ sudo vcdbg log msg >& t.log
+ v3dusage.py t.log
+""")
+
+ argp.add_argument("logfile")
+ args = argp.parse_args()
+
+ do_logparse(args.logfile)
+
From f3eaadb27a5bc6db07d33ce0814d796e8cee623e Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 11:27:39 +0100
Subject: [PATCH 002/161] Add sand pix fmts & conversion fns
---
configure | 3 +
libavutil/Makefile | 3 +
libavutil/arm/Makefile | 1 +
libavutil/arm/rpi_sand_neon.S | 768 ++++++++++++++++++++++++++++++++++
libavutil/arm/rpi_sand_neon.h | 99 +++++
libavutil/pixdesc.c | 44 ++
libavutil/pixfmt.h | 6 +
libavutil/rpi_sand_fn_pw.h | 227 ++++++++++
libavutil/rpi_sand_fns.c | 353 ++++++++++++++++
libavutil/rpi_sand_fns.h | 183 ++++++++
10 files changed, 1687 insertions(+)
create mode 100644 libavutil/arm/rpi_sand_neon.S
create mode 100644 libavutil/arm/rpi_sand_neon.h
create mode 100644 libavutil/rpi_sand_fn_pw.h
create mode 100644 libavutil/rpi_sand_fns.c
create mode 100644 libavutil/rpi_sand_fns.h
diff --git a/configure b/configure
index b6616f00b6..27112ced58 100755
--- a/configure
+++ b/configure
@@ -344,6 +344,7 @@ External library support:
--enable-libvpl enable Intel oneVPL code via libvpl if libmfx is not used [no]
--enable-libnpp enable Nvidia Performance Primitives-based code [no]
--enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
+ --enable-sand enable sand video formats [rpi]
--disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
--disable-nvenc disable Nvidia video encoding code [autodetect]
--enable-omx enable OpenMAX IL code [no]
@@ -1930,6 +1931,7 @@ FEATURE_LIST="
omx_rpi
runtime_cpudetect
safe_bitstream_reader
+ sand
shared
small
static
@@ -2495,6 +2497,7 @@ CONFIG_EXTRA="
rtpdec
rtpenc_chain
rv34dsp
+ sand
scene_sad
sinewin
snappy
diff --git a/libavutil/Makefile b/libavutil/Makefile
index dc9012f9a8..e33f5db099 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -73,6 +73,7 @@ HEADERS = adler32.h \
rational.h \
replaygain.h \
ripemd.h \
+ rpi_sand_fns.h \
samplefmt.h \
sha.h \
sha512.h \
@@ -192,6 +193,7 @@ OBJS-$(CONFIG_MACOS_KPERF) += macos_kperf.o
OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o
OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o
OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o
OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
@@ -212,6 +214,7 @@ SKIPHEADERS-$(CONFIG_D3D11VA) += hwcontext_d3d11va.h
SKIPHEADERS-$(CONFIG_DXVA2) += hwcontext_dxva2.h
SKIPHEADERS-$(CONFIG_QSV) += hwcontext_qsv.h
SKIPHEADERS-$(CONFIG_OPENCL) += hwcontext_opencl.h
+SKIPHEADERS-$(CONFIG_RPI) += rpi_sand_fn_pw.h
SKIPHEADERS-$(CONFIG_VAAPI) += hwcontext_vaapi.h
SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.h
SKIPHEADERS-$(CONFIG_VDPAU) += hwcontext_vdpau.h
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..b74b7c4e2f 100644
--- a/libavutil/arm/Makefile
+++ b/libavutil/arm/Makefile
@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \
NEON-OBJS += arm/float_dsp_init_neon.o \
arm/float_dsp_neon.o \
+ arm/rpi_sand_neon.o \
diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
new file mode 100644
index 0000000000..80890fe985
--- /dev/null
+++ b/libavutil/arm/rpi_sand_neon.S
@@ -0,0 +1,768 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#include "libavutil/arm/asm.S"
+
+
+@ General notes:
+@ Having done some timing on this in sand8->y8 (Pi4)
+@ vst1 (680fps) is a bit faster than vstm (660fps)
+@ vldm (680fps) is noticeably faster than vld1 (480fps)
+@ (or it might be that a mix is what is required)
+@
+@ At least on a Pi4 it is no more expensive to have a single auto-inc register
+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
+@ the latter was better)
+@
+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
+@ the memory is uncached.
+@ As these are Sand -> planar we can assume that src is going to be aligned but
+@ it is possible that dest isn't (converting to .yuv or other packed format).
+@ Luckily vst1 is faster than vstm :-) so all is well
+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
+@ .8 stores would let us do non-word aligned stores into uncached but it
+@ probably isn't worth it.
+
+
+
+
+@ void ff_rpi_sand128b_stripe_to_8_10(
+@ uint8_t * dest, // [r0]
+@ const uint8_t * src1, // [r1]
+@ const uint8_t * src2, // [r2]
+@ unsigned int lines); // [r3]
+
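+@ Rough C equivalent of the loop below (a sketch for reference only - clip_u8
+@ here is illustrative; the asm gets rounding and saturation from vqrshrn.u16):
+@   for (unsigned int j = 0; j != lines; j++) {
+@     const uint16_t *s1 = (const uint16_t *)(src1 + j * 128);
+@     const uint16_t *s2 = (const uint16_t *)(src2 + j * 128);
+@     uint8_t *d = dest + j * 128;
+@     for (unsigned int i = 0; i != 64; i++) {
+@       d[i]      = clip_u8((s1[i] + 2) >> 2);  // 10-bit -> 8-bit, rounded
+@       d[i + 64] = clip_u8((s2[i] + 2) >> 2);
+@     }
+@   }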
+.macro stripe2_to_8, bit_depth
+ vpush {q4-q7}
+1:
+ vldm r1!, {q0-q7}
+ subs r3, #1
+ vldm r2!, {q8-q15}
+ vqrshrn.u16 d0, q0, #\bit_depth - 8
+ vqrshrn.u16 d1, q1, #\bit_depth - 8
+ vqrshrn.u16 d2, q2, #\bit_depth - 8
+ vqrshrn.u16 d3, q3, #\bit_depth - 8
+ vqrshrn.u16 d4, q4, #\bit_depth - 8
+ vqrshrn.u16 d5, q5, #\bit_depth - 8
+ vqrshrn.u16 d6, q6, #\bit_depth - 8
+ vqrshrn.u16 d7, q7, #\bit_depth - 8
+ vqrshrn.u16 d8, q8, #\bit_depth - 8
+ vqrshrn.u16 d9, q9, #\bit_depth - 8
+ vqrshrn.u16 d10, q10, #\bit_depth - 8
+ vqrshrn.u16 d11, q11, #\bit_depth - 8
+ vqrshrn.u16 d12, q12, #\bit_depth - 8
+ vqrshrn.u16 d13, q13, #\bit_depth - 8
+ vqrshrn.u16 d14, q14, #\bit_depth - 8
+ vqrshrn.u16 d15, q15, #\bit_depth - 8
+ vstm r0!, {q0-q7}
+ bne 1b
+ vpop {q4-q7}
+ bx lr
+.endm
+
+function ff_rpi_sand128b_stripe_to_8_10, export=1
+ stripe2_to_8 10
+endfunc
+
+@ void ff_rpi_sand8_lines_to_planar_y8(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand8_lines_to_planar_y8, export=1
+ push {r4-r8, lr} @ +24 L
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ lsl r3, #7
+ sub r1, r6
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+ mov lr, #0
+1:
+ vldm r2, {q8-q15}
+ add r2, r3
+ subs r5, #128
+ blt 2f
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ vst1.8 {d20, d21, d22, d23}, [r0]!
+ vst1.8 {d24, d25, d26, d27}, [r0]!
+ vst1.8 {d28, d29, d30, d31}, [r0]!
+ bne 1b
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
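+@ On entry r5 holds (bytes still wanted) - 128, i.e. a negative remainder.
+@ Store progressively smaller chunks (64/32/16/8/4/2/1 bytes), shuffling the
+@ remaining data down into the low registers after each store, until done.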
+2:
+ cmp r5, #64-128
+ blt 1f
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ vst1.8 {d20, d21, d22, d23}, [r0]!
+ beq 11b
+ vmov q8, q12
+ vmov q9, q13
+ sub r5, #64
+ vmov q10, q14
+ vmov q11, q15
+1:
+ cmp r5, #32-128
+ blt 1f
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ beq 11b
+ vmov q8, q10
+ sub r5, #32
+ vmov q9, q11
+1:
+ cmp r5, #16-128
+ blt 1f
+ vst1.8 {d16, d17}, [r0]!
+ beq 11b
+ sub r5, #16
+ vmov q8, q9
+1:
+ cmp r5, #8-128
+ blt 1f
+ vst1.8 {d16}, [r0]!
+ beq 11b
+ sub r5, #8
+ vmov d16, d17
+1:
+ cmp r5, #4-128
+ blt 1f
+ vst1.32 {d16[0]}, [r0]!
+ beq 11b
+ sub r5, #4
+ vshr.u64 d16, #32
+1:
+ cmp r5, #2-128
+ blt 1f
+ vst1.16 {d16[0]}, [r0]!
+ beq 11b
+ vst1.8 {d16[2]}, [r0]!
+ b 11b
+1:
+ vst1.8 {d16[0]}, [r0]!
+ b 11b
+endfunc
+
+@ void ff_rpi_sand8_lines_to_planar_c8(
+@ uint8_t * dst_u, // [r0]
+@ unsigned int dst_stride_u, // [r1]
+@ uint8_t * dst_v, // [r2]
+@ unsigned int dst_stride_v, // [r3]
+@ const uint8_t * src, // [sp, #0] -> r4, r5
+@ unsigned int stride1, // [sp, #4] 128
+@ unsigned int stride2, // [sp, #8] -> r8
+@ unsigned int _x, // [sp, #12] 0
+@ unsigned int y, // [sp, #16] (r7 in prefix)
+@ unsigned int _w, // [sp, #20] -> r12, r6
+@ unsigned int h); // [sp, #24] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand8_lines_to_planar_c8, export=1
+ push {r4-r8, lr} @ +24
+
+ ldr r5, [sp, #24]
+ ldr r8, [sp, #32]
+ ldr r7, [sp, #40]
+ ldr r6, [sp, #44]
+ lsl r8, #7
+ add r5, r5, r7, lsl #7
+ sub r1, r1, r6
+ sub r3, r3, r6
+ ldr r7, [sp, #48]
+ vpush {q4-q7}
+
+10:
+ mov r4, r5
+ mov r12, r6
+1:
+ subs r12, #64
+ vldm r4, {q0-q7}
+ add r4, r8
+ it gt
+ vldmgt r4, {q8-q15}
+ add r4, r8
+
+ vuzp.8 q0, q1
+ vuzp.8 q2, q3
+ vuzp.8 q4, q5
+ vuzp.8 q6, q7
+
+ vuzp.8 q8, q9
+ vuzp.8 q10, q11
+ vuzp.8 q12, q13
+ vuzp.8 q14, q15
+ subs r12, #64
+
+ @ Rearrange regs so we can use vst1 with 4 regs
+ vswp q1, q2
+ vswp q5, q6
+ vswp q9, q10
+ vswp q13, q14
+ blt 2f
+
+ vst1.8 {d0, d1, d2, d3 }, [r0]!
+ vst1.8 {d8, d9, d10, d11}, [r0]!
+ vst1.8 {d16, d17, d18, d19}, [r0]!
+ vst1.8 {d24, d25, d26, d27}, [r0]!
+
+ vst1.8 {d4, d5, d6, d7 }, [r2]!
+ vst1.8 {d12, d13, d14, d15}, [r2]!
+ vst1.8 {d20, d21, d22, d23}, [r2]!
+ vst1.8 {d28, d29, d30, d31}, [r2]!
+ bne 1b
+11:
+ subs r7, #1
+ add r5, #128
+ add r0, r1
+ add r2, r3
+ bne 10b
+ vpop {q4-q7}
+ pop {r4-r8,pc}
+
+2:
+ cmp r12, #64-128
+ blt 1f
+ vst1.8 {d0, d1, d2, d3 }, [r0]!
+ vst1.8 {d8, d9, d10, d11}, [r0]!
+ vst1.8 {d4, d5, d6, d7 }, [r2]!
+ vst1.8 {d12, d13, d14, d15}, [r2]!
+ beq 11b
+ sub r12, #64
+ vmov q0, q8
+ vmov q1, q9
+ vmov q2, q10
+ vmov q3, q11
+ vmov q4, q12
+ vmov q5, q13
+ vmov q6, q14
+ vmov q7, q15
+1:
+ cmp r12, #32-128
+ blt 1f
+ vst1.8 {d0, d1, d2, d3 }, [r0]!
+ vst1.8 {d4, d5, d6, d7 }, [r2]!
+ beq 11b
+ sub r12, #32
+ vmov q0, q4
+ vmov q1, q5
+ vmov q2, q6
+ vmov q3, q7
+1:
+ cmp r12, #16-128
+ blt 1f
+ vst1.8 {d0, d1 }, [r0]!
+ vst1.8 {d4, d5 }, [r2]!
+ beq 11b
+ sub r12, #16
+ vmov q0, q1
+ vmov q2, q3
+1:
+ cmp r12, #8-128
+ blt 1f
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d4}, [r2]!
+ beq 11b
+ sub r12, #8
+ vmov d0, d1
+ vmov d4, d5
+1:
+ cmp r12, #4-128
+ blt 1f
+ vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d4[0]}, [r2]!
+ beq 11b
+ sub r12, #4
+ vmov s0, s1
+ vmov s8, s9
+1:
+ cmp r12, #2-128
+ blt 1f
+ vst1.16 {d0[0]}, [r0]!
+ vst1.16 {d4[0]}, [r2]!
+ beq 11b
+ vst1.8 {d0[2]}, [r0]!
+ vst1.8 {d4[2]}, [r2]!
+ b 11b
+1:
+ vst1.8 {d0[0]}, [r0]!
+ vst1.8 {d4[0]}, [r2]!
+ b 11b
+endfunc
+
+
+
+@ void ff_rpi_sand30_lines_to_planar_y16(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y16, export=1
+ push {r4-r8, lr} @ +24
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ mov r12, #48
+ vmov.u16 q15, #0x3ff
+ sub r3, #1
+ lsl r3, #7
+ sub r1, r1, r6, lsl #1
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+ mov lr, #0
+1:
+ vldm r2!, {q10-q13}
+ add lr, #64
+
+ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20!
+ ands lr, #127
+ vshrn.u32 d2, q10, #10
+ vmovn.u32 d0, q10
+ vmovn.u32 d4, q14
+
+ vshr.u32 q14, q11, #20
+ it eq
+ addeq r2, r3
+ vshrn.u32 d3, q11, #10
+ vmovn.u32 d1, q11
+ vmovn.u32 d5, q14
+
+ subs r5, #48
+ vand q0, q15
+ vand q1, q15
+ vand q2, q15
+
+ vshr.u32 q14, q12, #20
+ vshrn.u32 d18, q12, #10
+ vmovn.u32 d16, q12
+ vmovn.u32 d20, q14
+
+ vshr.u32 q14, q13, #20
+ vshrn.u32 d19, q13, #10
+ vmovn.u32 d17, q13
+ vmovn.u32 d21, q14
+
+ vand q8, q15
+ vand q9, q15
+ vand q10, q15
+ blt 2f
+
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4], r12
+ vst3.16 {d16, d18, d20}, [r0], r12
+ vst3.16 {d17, d19, d21}, [r4], r12
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #24-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4]
+ beq 11b
+ vmov q0, q8
+ sub r5, #24
+ vmov q1, q9
+ vmov q2, q10
+1:
+ cmp r5, #12-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0]!
+ beq 11b
+ vmov d0, d1
+ sub r5, #12
+ vmov d2, d3
+ vmov d4, d5
+1:
+ cmp r5, #6-48
+ add r4, r0, #6 @ avoid [r0]! on sequential instructions
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
+ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
+ add r0, #12
+ beq 11b
+ vmov s0, s1
+ sub r5, #6
+ vmov s4, s5
+ vmov s8, s9
+1:
+ cmp r5, #3-48
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
+ beq 11b
+ sub r5, #3
+ vshr.u32 d0, #16
+ vshr.u32 d2, #16
+1:
+ cmp r5, #2-48
+ blt 1f
+ vst2.16 {d0[0], d2[0]}, [r0]!
+ b 11b
+1:
+ vst1.16 {d0[0]}, [r0]!
+ b 11b
+
+endfunc
+
+
+@ void ff_rpi_sand30_lines_to_planar_c16(
+@ uint8_t * dst_u, // [r0]
+@ unsigned int dst_stride_u, // [r1]
+@ uint8_t * dst_v, // [r2]
+@ unsigned int dst_stride_v, // [r3]
+@ const uint8_t * src, // [sp, #0] -> r4, r5
+@ unsigned int stride1, // [sp, #4] 128
+@ unsigned int stride2, // [sp, #8] -> r8
+@ unsigned int _x, // [sp, #12] 0
+@ unsigned int y, // [sp, #16] (r7 in prefix)
+@ unsigned int _w, // [sp, #20] -> r6, r9
+@ unsigned int h); // [sp, #24] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_c16, export=1
+ push {r4-r10, lr} @ +32
+ ldr r5, [sp, #32]
+ ldr r8, [sp, #40]
+ ldr r7, [sp, #48]
+ ldr r9, [sp, #52]
+ mov r12, #48
+ vmov.u16 q15, #0x3ff
+ sub r8, #1
+ lsl r8, #7
+ add r5, r5, r7, lsl #7
+ sub r1, r1, r9, lsl #1
+ sub r3, r3, r9, lsl #1
+ ldr r7, [sp, #56]
+10:
+ mov lr, #0
+ mov r4, r5
+ mov r6, r9
+1:
+ vldm r4!, {q0-q3}
+ add lr, #64
+
+ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
+ vshr.u32 q14, q0, #20
+ vshrn.u32 d16, q0, #10
+ vmovn.u32 d18, q0
+ ands lr, #127
+ vmovn.u32 d20, q14
+
+ vshr.u32 q14, q1, #20
+ vshrn.u32 d17, q1, #10
+ vmovn.u32 d19, q1
+ vmovn.u32 d21, q14
+
+ vshr.u32 q14, q2, #20
+ vshrn.u32 d22, q2, #10
+ vmovn.u32 d24, q2
+ vmovn.u32 d26, q14
+
+ vshr.u32 q14, q3, #20
+ vshrn.u32 d23, q3, #10
+ vmovn.u32 d25, q3
+ add r10, r0, #24
+ vmovn.u32 d27, q14
+
+ it eq
+ addeq r4, r8
+ vuzp.16 q8, q11
+ vuzp.16 q9, q12
+ vuzp.16 q10, q13
+
+ @ q8 V0, V3,.. -> q0
+ @ q9 U0, U3...
+ @ q10 U1, U4...
+ @ q11 U2, U5,..
+ @ q12 V1, V4,.. -> q1
+ @ q13 V2, V5,.. -> q2
+
+ subs r6, #24
+ vand q11, q15
+ vand q9, q15
+ vand q10, q15
+ vand q0, q8, q15
+ vand q1, q12, q15
+ vand q2, q13, q15
+
+ blt 2f
+
+ vst3.16 {d18, d20, d22}, [r0], r12
+ vst3.16 {d19, d21, d23}, [r10]
+ add r10, r2, #24
+ vst3.16 {d0, d2, d4}, [r2], r12
+ vst3.16 {d1, d3, d5}, [r10]
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r5, #128
+ add r0, r1
+ add r2, r3
+ bne 10b
+
+ pop {r4-r10, pc}
+
+@ Partial final write
+2:
+ cmp r6, #-12
+ blt 1f
+ vst3.16 {d18, d20, d22}, [r0]!
+ vst3.16 {d0, d2, d4}, [r2]!
+ beq 11b
+ vmov d18, d19
+ vmov d20, d21
+ vmov d22, d23
+ sub r6, #12
+ vmov d0, d1
+ vmov d2, d3
+ vmov d4, d5
+1:
+ cmp r6, #-18
+ @ Rezip here as it makes the remaining tail handling easier
+ vzip.16 d0, d18
+ vzip.16 d2, d20
+ vzip.16 d4, d22
+ blt 1f
+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
+ vst3.16 {d0[3], d2[3], d4[3]}, [r0]!
+ vst3.16 {d0[2], d2[2], d4[2]}, [r2]!
+ beq 11b
+ vmov d0, d18
+ vmov d2, d20
+ sub r6, #6
+ vmov d4, d22
+1:
+ cmp r6, #-21
+ blt 1f
+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
+ beq 11b
+ vmov s4, s5
+ sub r6, #3
+ vmov s0, s1
+1:
+ cmp r6, #-22
+ blt 1f
+ vst2.16 {d0[1], d2[1]}, [r0]!
+ vst2.16 {d0[0], d2[0]}, [r2]!
+ b 11b
+1:
+ vst1.16 {d0[1]}, [r0]!
+ vst1.16 {d0[0]}, [r2]!
+ b 11b
+
+endfunc
+
+@ void ff_rpi_sand30_lines_to_planar_p010(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_p010, export=1
+ push {r4-r8, lr} @ +24
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ mov r12, #48
+ vmov.u16 q15, #0xffc0
+ sub r3, #1
+ lsl r3, #7
+ sub r1, r1, r6, lsl #1
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+ mov lr, #0
+1:
+ vldm r2!, {q10-q13}
+ add lr, #64
+
+ vshl.u32 q14, q10, #6
+ ands lr, #127
+ vshrn.u32 d4, q10, #14
+ vshrn.u32 d2, q10, #4
+ vmovn.u32 d0, q14
+
+ vshl.u32 q14, q11, #6
+ it eq
+ addeq r2, r3
+ vshrn.u32 d5, q11, #14
+ vshrn.u32 d3, q11, #4
+ vmovn.u32 d1, q14
+
+ subs r5, #48
+ vand q2, q15
+ vand q1, q15
+ vand q0, q15
+
+ vshl.u32 q14, q12, #6
+ vshrn.u32 d20, q12, #14
+ vshrn.u32 d18, q12, #4
+ vmovn.u32 d16, q14
+
+ vshl.u32 q14, q13, #6
+ vshrn.u32 d21, q13, #14
+ vshrn.u32 d19, q13, #4
+ vmovn.u32 d17, q14
+
+ vand q10, q15
+ vand q9, q15
+ vand q8, q15
+ blt 2f
+
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4], r12
+ vst3.16 {d16, d18, d20}, [r0], r12
+ vst3.16 {d17, d19, d21}, [r4], r12
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #24-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0], r12
+ vst3.16 {d1, d3, d5}, [r4]
+ beq 11b
+ vmov q0, q8
+ sub r5, #24
+ vmov q1, q9
+ vmov q2, q10
+1:
+ cmp r5, #12-48
+ blt 1f
+ vst3.16 {d0, d2, d4}, [r0]!
+ beq 11b
+ vmov d0, d1
+ sub r5, #12
+ vmov d2, d3
+ vmov d4, d5
+1:
+ cmp r5, #6-48
+ add r4, r0, #6 @ avoid [r0]! on sequential instructions
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
+ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
+ add r0, #12
+ beq 11b
+ vmov s0, s1
+ sub r5, #6
+ vmov s4, s5
+ vmov s8, s9
+1:
+ cmp r5, #3-48
+ blt 1f
+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
+ beq 11b
+ sub r5, #3
+ vshr.u32 d0, #16
+ vshr.u32 d2, #16
+1:
+ cmp r5, #2-48
+ blt 1f
+ vst2.16 {d0[0], d2[0]}, [r0]!
+ b 11b
+1:
+ vst1.16 {d0[0]}, [r0]!
+ b 11b
+
+endfunc
+
+
+
diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
new file mode 100644
index 0000000000..447f367bea
--- /dev/null
+++ b/libavutil/arm/rpi_sand_neon.h
@@ -0,0 +1,99 @@
+/*
+Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#ifndef AVUTIL_ARM_SAND_NEON_H
+#define AVUTIL_ARM_SAND_NEON_H
+
+void ff_rpi_sand128b_stripe_to_8_10(
+ uint8_t * dest, // [r0]
+ const uint8_t * src1, // [r1]
+ const uint8_t * src2, // [r2]
+ unsigned int lines); // [r3]
+
+void ff_rpi_sand8_lines_to_planar_y8(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+void ff_rpi_sand8_lines_to_planar_c8(
+ uint8_t * dst_u, // [r0]
+ unsigned int dst_stride_u, // [r1]
+ uint8_t * dst_v, // [r2]
+ unsigned int dst_stride_v, // [r3]
+ const uint8_t * src, // [sp, #0] -> r4, r5
+ unsigned int stride1, // [sp, #4] 128
+ unsigned int stride2, // [sp, #8] -> r8
+ unsigned int _x, // [sp, #12] 0
+ unsigned int y, // [sp, #16] (r7 in prefix)
+ unsigned int _w, // [sp, #20] -> r12, r6
+ unsigned int h); // [sp, #24] -> r7
+
+void ff_rpi_sand30_lines_to_planar_y16(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+void ff_rpi_sand30_lines_to_planar_c16(
+ uint8_t * dst_u, // [r0]
+ unsigned int dst_stride_u, // [r1]
+ uint8_t * dst_v, // [r2]
+ unsigned int dst_stride_v, // [r3]
+ const uint8_t * src, // [sp, #0] -> r4, r5
+ unsigned int stride1, // [sp, #4] 128
+ unsigned int stride2, // [sp, #8] -> r8
+ unsigned int _x, // [sp, #12] 0
+ unsigned int y, // [sp, #16] (r7 in prefix)
+ unsigned int _w, // [sp, #20] -> r6, r9
+ unsigned int h); // [sp, #24] -> r7
+
+void ff_rpi_sand30_lines_to_planar_p010(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
+#endif // AVUTIL_ARM_SAND_NEON_H
+
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 62a2ae08d9..cb73521ea7 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -2717,6 +2717,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_FLOAT |
AV_PIX_FMT_FLAG_ALPHA,
},
+ [AV_PIX_FMT_SAND128] = {
+ .name = "sand128",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 1, 0, 0, 8 }, /* Y */
+ { 1, 2, 0, 0, 8 }, /* U */
+ { 1, 2, 1, 0, 8 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_SAND64_10] = {
+ .name = "sand64_10",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 2, 0, 0, 10 }, /* Y */
+ { 1, 4, 0, 0, 10 }, /* U */
+ { 1, 4, 2, 0, 10 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_SAND64_16] = {
+ .name = "sand64_16",
+ .nb_components = 3,
+ .log2_chroma_w = 1,
+ .log2_chroma_h = 1,
+ .comp = {
+ { 0, 2, 0, 0, 16 }, /* Y */
+ { 1, 4, 0, 0, 16 }, /* U */
+ { 1, 4, 2, 0, 16 }, /* V */
+ },
+ .flags = 0,
+ },
+ [AV_PIX_FMT_RPI4_8] = {
+ .name = "rpi4_8",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
+ [AV_PIX_FMT_RPI4_10] = {
+ .name = "rpi4_10",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
};
static const char * const color_range_names[] = {
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 37c2c79e01..22f70007c3 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -377,6 +377,12 @@ enum AVPixelFormat {
AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
+// RPI - deliberately not under an ifdef so that calling programs can still refer to them
+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
+ AV_PIX_FMT_RPI4_8,
+ AV_PIX_FMT_RPI4_10,
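+// Informal layout note: a sand image is stored as a sequence of vertical
+// stripes, each stride1 bytes wide (128 for sand128) and stride2 lines tall,
+// so for 8-bit luma the byte holding pixel (x,y) is at
+//   (x & ~(stride1-1)) * stride2 + y * stride1 + (x & (stride1-1))
+// (this is the addressing used by the helpers in libavutil/rpi_sand_fns.c).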
AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined
AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined
diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
new file mode 100644
index 0000000000..0324f6826d
--- /dev/null
+++ b/libavutil/rpi_sand_fn_pw.h
@@ -0,0 +1,227 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+// * Included twice from rpi_sand_fn with different PW
+
+#define STRCAT(x,y) x##y
+
+#if PW == 1
+#define pixel uint8_t
+#define FUNC(f) STRCAT(f, 8)
+#elif PW == 2
+#define pixel uint16_t
+#define FUNC(f) STRCAT(f, 16)
+#else
+#error Unexpected PW
+#endif
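+// e.g. with PW == 1, FUNC(av_rpi_sand_to_planar_y) becomes av_rpi_sand_to_planar_y8;
+// rpi_sand_fns.c includes this file once per PW to generate both the 8- and
+// 16-bit variants.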
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// unclipped
+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x;
+ const unsigned int w = _w;
+ const unsigned int mask = stride1 - 1;
+
+#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
+ if (_x == 0) {
+ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
+ src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
+ memcpy(dst, p, w);
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
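+        // w1 = bytes from x to the right-hand edge of its stripe, w3 = bytes
+        // into the final stripe, w2 = the whole stripes in between (a multiple
+        // of stride1), so each output row is assembled from a leading, middle
+        // and trailing copy.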
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const uint8_t * p = p2;
+ uint8_t * d = dst;
+ memcpy(d, p1, w1);
+ d += w1;
+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
+ memcpy(d, p, stride1);
+ }
+ memcpy(d, p, w3);
+ }
+ }
+}
+
+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
+
+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+
+#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
+ if (_x == 0) {
+ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
+ src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ const pixel * p = (const pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * p = (const pixel *)p1;
+ pixel * du = (pixel *)dst_u;
+ pixel * dv = (pixel *)dst_v;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *du++ = *p++;
+ *dv++ = *p++;
+ }
+ }
+ }
+}
+
+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x = _x * 2;
+ const unsigned int w = _w * 2;
+ const unsigned int mask = stride1 - 1;
+ if ((x & ~mask) == ((x + w) & ~mask)) {
+ // All in one sand stripe
+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+ else
+ {
+ // Two+ stripe
+ const unsigned int sstride = stride1 * stride2;
+ const unsigned int sstride_p = (sstride - stride1) / PW;
+
+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
+ const uint8_t * p2 = p1 + sstride - (x & mask);
+ const unsigned int w1 = stride1 - (x & mask);
+ const unsigned int w3 = (x + w) & mask;
+ const unsigned int w2 = w - (w1 + w3);
+
+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
+ unsigned int j;
+ const pixel * su = (const pixel *)src_u;
+ const pixel * sv = (const pixel *)src_v;
+ pixel * p = (pixel *)p1;
+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
+ *p++ = *su++;
+ *p++ = *sv++;
+ }
+ }
+ }
+}
+
+
+#undef pixel
+#undef STRCAT
+#undef FUNC
+
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
new file mode 100644
index 0000000000..ed0261b02f
--- /dev/null
+++ b/libavutil/rpi_sand_fns.c
@@ -0,0 +1,353 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#include "config.h"
+#include <stdint.h>
+#include <string.h>
+#include "rpi_sand_fns.h"
+#include "avassert.h"
+#include "frame.h"
+
+#if ARCH_ARM && HAVE_NEON
+#include "arm/rpi_sand_neon.h"
+#define HAVE_SAND_ASM 1
+#else
+#define HAVE_SAND_ASM 0
+#endif
+
+#define PW 1
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#define PW 2
+#include "rpi_sand_fn_pw.h"
+#undef PW
+
+#if 1
+// Simple round
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ const unsigned int rnd = (1 << shr) >> 1;
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ *dst++ = (*src++ + rnd) >> shr;
+ }
+}
+#else
+// Dithered variation
+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
+{
+ unsigned int rnd = (1 << shr) >> 1;
+ const unsigned int mask = ((1 << shr) - 1);
+ const uint16_t * src = (const uint16_t *)_src;
+
+ for (; n != 0; --n) {
+ rnd = *src++ + (rnd & mask);
+ *dst++ = rnd >> shr;
+ }
+}
+#endif
+
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// unclipped
+// _x & _w in pixels, strides in bytes
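+// Sand30 packs three 10-bit samples into each little-endian 32-bit word
+// (bits 0-9, 10-19 and 20-29, top two bits unused), hence the (_x / 3) * 4
+// byte-offset arithmetic below.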
+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
+ const unsigned int xskip0 = _x - (x0 >> 2) * 3;
+ const unsigned int x1 = ((_x + _w) / 3) * 4;
+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
+ const unsigned int mask = stride1 - 1;
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
+
+#if HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if (x0 == x1) {
+ // *******************
+ // Partial single word xfer
+ return;
+ }
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
+ {
+ unsigned int x = x0;
+ const uint32_t * p = (const uint32_t *)p0;
+ uint16_t * d = (uint16_t *)dst;
+
+ if (xskip0 != 0) {
+ const uint32_t p3 = *p++;
+
+ if (xskip0 == 1)
+ *d++ = (p3 >> 10) & 0x3ff;
+ *d++ = (p3 >> 20) & 0x3ff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ while (x != x1) {
+ const uint32_t p3 = *p++;
+ *d++ = p3 & 0x3ff;
+ *d++ = (p3 >> 10) & 0x3ff;
+ *d++ = (p3 >> 20) & 0x3ff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ if (xrem1 != 0) {
+ const uint32_t p3 = *p;
+
+ *d++ = p3 & 0x3ff;
+ if (xrem1 == 2)
+ *d++ = (p3 >> 10) & 0x3ff;
+ }
+ }
+}
+
+
+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word
+ const unsigned int xskip0 = _x - (x0 >> 3) * 3;
+ const unsigned int x1 = ((_x + _w) / 3) * 8;
+ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3;
+ const unsigned int mask = stride1 - 1;
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
+
+#if HAVE_SAND_ASM
+ if (_x == 0) {
+ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
+ src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if (x0 == x1) {
+ // *******************
+ // Partial single word xfer
+ return;
+ }
+
+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
+ {
+ unsigned int x = x0;
+ const uint32_t * p = (const uint32_t *)p0;
+ uint16_t * du = (uint16_t *)dst_u;
+ uint16_t * dv = (uint16_t *)dst_v;
+
+ if (xskip0 != 0) {
+ const uint32_t p3a = *p++;
+ const uint32_t p3b = *p++;
+
+ if (xskip0 == 1)
+ {
+ *du++ = (p3a >> 20) & 0x3ff;
+ *dv++ = (p3b >> 0) & 0x3ff;
+ }
+ *du++ = (p3b >> 10) & 0x3ff;
+ *dv++ = (p3b >> 20) & 0x3ff;
+
+ if (((x += 8) & mask) == 0)
+ p += slice_inc;
+ }
+
+ while (x != x1) {
+ const uint32_t p3a = *p++;
+ const uint32_t p3b = *p++;
+
+ *du++ = p3a & 0x3ff;
+ *dv++ = (p3a >> 10) & 0x3ff;
+ *du++ = (p3a >> 20) & 0x3ff;
+ *dv++ = p3b & 0x3ff;
+ *du++ = (p3b >> 10) & 0x3ff;
+ *dv++ = (p3b >> 20) & 0x3ff;
+
+ if (((x += 8) & mask) == 0)
+ p += slice_inc;
+ }
+
+ if (xrem1 != 0) {
+ const uint32_t p3a = *p++;
+ const uint32_t p3b = *p++;
+
+ *du++ = p3a & 0x3ff;
+ *dv++ = (p3a >> 10) & 0x3ff;
+ if (xrem1 == 2)
+ {
+ *du++ = (p3a >> 20) & 0x3ff;
+ *dv++ = p3b & 0x3ff;
+ }
+ }
+ }
+}
+
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr)
+{
+ const unsigned int n = dst_stride1 / 2;
+ unsigned int j;
+
+ // This is true for our current layouts
+ av_assert0(dst_stride1 == src_stride1);
+
+ // As we have the same stride1 for src & dest and src is wider than dest
+ // then if we loop on src we can always write contiguously to dest
+ // We make no effort to copy an exact width - round up to nearest src stripe
+ // as we will always have storage in dest for that
+
+#if ARCH_ARM && HAVE_NEON
+ if (shr == 3 && src_stride1 == 128) {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
+ }
+ }
+ else
+#endif
+ {
+ for (j = 0; j + n < w; j += dst_stride1) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ cpy16_to_8(d + n, s2, n, shr);
+ }
+ }
+ }
+
+ // Fix up a trailing dest half stripe
+ if (j < w) {
+ uint8_t * d = dst + j * dst_stride2;
+ const uint8_t * s1 = src + j * 2 * src_stride2;
+
+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
+ cpy16_to_8(d, s1, n, shr);
+ }
+ }
+}
+
+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
+{
+ const int w = av_frame_cropped_width(src);
+ const int h = av_frame_cropped_height(src);
+ const int x = src->crop_left;
+ const int y = src->crop_top;
+
+ // We will crop as part of the conversion
+ dst->crop_top = 0;
+ dst->crop_left = 0;
+ dst->crop_bottom = 0;
+ dst->crop_right = 0;
+
+ switch (src->format){
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_RPI4_8:
+ switch (dst->format){
+ case AV_PIX_FMT_YUV420P:
+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w/2, h/2);
+ break;
+ default:
+ return -1;
+ }
+ break;
+ case AV_PIX_FMT_SAND64_10:
+ switch (dst->format){
+ case AV_PIX_FMT_YUV420P10:
+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x*2, y, w*2, h);
+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y/2, w, h/2);
+ break;
+ default:
+ return -1;
+ }
+ break;
+ case AV_PIX_FMT_RPI4_10:
+ switch (dst->format){
+ case AV_PIX_FMT_YUV420P10:
+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w/2, h/2);
+ break;
+ default:
+ return -1;
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return av_frame_copy_props(dst, src);
+}
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
new file mode 100644
index 0000000000..634b55e800
--- /dev/null
+++ b/libavutil/rpi_sand_fns.h
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: John Cox
+*/
+
+#ifndef AVUTIL_RPI_SAND_FNS
+#define AVUTIL_RPI_SAND_FNS
+
+#include "libavutil/frame.h"
+
+// For all these fns _x & _w are measured as coord * PW
+// For the C fns coords are in chroma pels (so luma / 2)
+// Strides are in bytes
+
+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
+ unsigned int stride1, unsigned int stride2,
+ const uint8_t * src_u, const unsigned int src_stride_u,
+ const uint8_t * src_v, const unsigned int src_stride_v,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+ uint8_t * dst_v, const unsigned int dst_stride_v,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+
+// w/h in pixels
+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+ unsigned int w, unsigned int h, const unsigned int shr);
+
+
+// dst must contain required pixel format & allocated data buffers
+// Cropping on the src buffer will be honoured and dst crop will be set to zero
+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
+
+
+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+{
+#ifdef RPI_ZC_SAND128_ONLY
+ // If we are sure we only support 128 byte sand formats replace the
+ // var with a constant which should allow for better optimisation
+ return 128;
+#else
+ return frame->linesize[0];
+#endif
+}
+
+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
+{
+ return frame->linesize[3];
+}
+
+
+static inline int av_rpi_is_sand_format(const int format)
+{
+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);
+}
+
+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+{
+ return av_rpi_is_sand_format(frame->format);
+}
+
+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
+{
+ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
+}
+
+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
+{
+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
+}
+
+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
+{
+ return (frame->format == AV_PIX_FMT_RPI4_10);
+}
+
+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
+{
+ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
+}
+
+// If x is measured in bytes (not pixels) then this works for sand64_16 as
+// well as sand128 - but in the general case we work that out
+
+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y + stride2 * x2;
+}
+
+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+{
+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
+ const unsigned int x1 = x & (stride1 - 1);
+ const unsigned int x2 = x ^ x1;
+
+ return x1 + stride1 * y_c + stride2 * x2;
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
+}
+
+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+{
+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
+}
+
+#endif
+
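For reference, the offset arithmetic above splits the byte x coordinate into the
offset within a 128 byte column (x1) and the byte address of the column start (x2);
rows within a column are stride1 bytes apart and whole columns are stride1*stride2
bytes apart. Below is a minimal standalone sketch of the same calculation - the
stride values used (stride1 = 128, stride2 = 160 lines) are illustrative assumptions,
not taken from a real frame.

/* Standalone sketch of the av_rpi_sand_frame_off_y() arithmetic (illustrative only). */
#include <stdio.h>

static unsigned int sand_off_y(unsigned int stride1, unsigned int stride2,
                               unsigned int x, unsigned int y)
{
    const unsigned int x1 = x & (stride1 - 1); /* byte offset within the column */
    const unsigned int x2 = x ^ x1;            /* byte address of the column start */
    return x1 + stride1 * y + stride2 * x2;
}

int main(void)
{
    const unsigned int stride1 = 128;  /* sand128: 128 byte wide columns */
    const unsigned int stride2 = 160;  /* lines per column - assumed value */

    /* Luma byte (200, 10): column 1, 72 bytes in, 10 lines down
     * = 72 + 128*10 + 160*128 = 21832 */
    printf("%u\n", sand_off_y(stride1, stride2, 200, 10));
    return 0;
}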
From 89b8d6ac2a886749d4594656083753e682de05a7 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 11:36:47 +0100
Subject: [PATCH 003/161] Add aarch64 asm sand conv functions
Many thanks to eiler.mike@gmail.com (Michael Eiler) for these
optimizations
---
libavutil/aarch64/Makefile | 2 +
libavutil/aarch64/rpi_sand_neon.S | 676 ++++++++++++++++++++++++++++++
libavutil/aarch64/rpi_sand_neon.h | 55 +++
libavutil/rpi_sand_fn_pw.h | 4 +-
libavutil/rpi_sand_fns.c | 3 +
5 files changed, 738 insertions(+), 2 deletions(-)
create mode 100644 libavutil/aarch64/rpi_sand_neon.S
create mode 100644 libavutil/aarch64/rpi_sand_neon.h
diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
index eba0151337..1b44beab39 100644
--- a/libavutil/aarch64/Makefile
+++ b/libavutil/aarch64/Makefile
@@ -4,3 +4,5 @@ OBJS += aarch64/cpu.o \
NEON-OBJS += aarch64/float_dsp_neon.o \
aarch64/tx_float_neon.o \
+ aarch64/rpi_sand_neon.o \
+
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
new file mode 100644
index 0000000000..cdcf71ee67
--- /dev/null
+++ b/libavutil/aarch64/rpi_sand_neon.S
@@ -0,0 +1,676 @@
+/*
+Copyright (c) 2021 Michael Eiler
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: Michael Eiler <eiler.mike@gmail.com>
+*/
+
+#include "asm.S"
+
+// void ff_rpi_sand8_lines_to_planar_y8(
+// uint8_t * dest, : x0
+// unsigned int dst_stride, : w1
+// const uint8_t * src, : x2
+// unsigned int src_stride1, : w3, always 128
+// unsigned int src_stride2, : w4
+// unsigned int _x, : w5
+// unsigned int y, : w6
+// unsigned int _w, : w7
+// unsigned int h); : [sp, #0]
+
+function ff_rpi_sand8_lines_to_planar_y8, export=1
+ // w15 contains the number of rows we need to process
+ ldr w15, [sp, #0]
+
+ // w8 will contain the number of blocks per row
+ // w8 = floor(_w/stride1)
+ // stride1 is assumed to always be 128
+ mov w8, w1
+ lsr w8, w8, #7
+
+ // in case the width of the image is not a multiple of 128, there will
+ // be an incomplete block at the end of every row
+ // w9 contains the number of pixels stored within this block
+ // w9 = _w - w8 * 128
+ lsl w9, w8, #7
+ sub w9, w7, w9
+
+ // this is the value we have to add to the src pointer after reading a complete block
+ // it will move the address to the start of the next block
+ // w10 = stride2 * stride1 - stride1
+ mov w10, w4
+ lsl w10, w10, #7
+ sub w10, w10, #128
+
+ // w11 is the row offset, meaning the start offset of the first block of every column
+ // this will be increased with stride1 within every iteration of the row_loop
+ eor w11, w11, w11
+
+ // w12 = 0, processed row count
+ eor w12, w12, w12
+row_loop:
+ // start of the first block within the current row
+ // x13 = row offset + src
+ mov x13, x2
+ add x13, x13, x11
+
+ // w14 = 0, processed block count
+ eor w14, w14, w14
+
+ cmp w8, #0
+ beq no_main_y8
+
+block_loop:
+ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
+ // fortunately these aren't callee saved ones, meaning we don't need to backup them
+ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64
+ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64
+
+ // write these registers back to the destination vector and increase the dst address by 128
+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64
+
+ // move the source register to the beginning of the next block (x13 = src + block offset)
+ add x13, x13, x10
+ // increase the block counter
+ add w14, w14, #1
+
+ // continue with the block_loop if we haven't copied all full blocks yet
+ cmp w8, w14
+ bgt block_loop
+
+ // handle the last block at the end of each row
+ // at most 127 byte values copied from src to dst
+no_main_y8:
+ eor w5, w5, w5 // i = 0
+incomplete_block_loop_y8:
+ cmp w5, w9
+ bge incomplete_block_loop_end_y8
+
+ ldrb w6, [x13]
+ strb w6, [x0]
+ add x13, x13, #1
+ add x0, x0, #1
+
+ add w5, w5, #1
+ b incomplete_block_loop_y8
+incomplete_block_loop_end_y8:
+
+
+ // increase the row offset by 128 (stride1)
+ add w11, w11, #128
+ // increment the row counter
+ add w12, w12, #1
+
+ // process the next row if we haven't finished yet
+ cmp w15, w12
+ bgt row_loop
+
+ ret
+endfunc
+
+
+
+// void ff_rpi_sand8_lines_to_planar_c8(
+// uint8_t * dst_u, : x0
+// unsigned int dst_stride_u, : w1 == width
+// uint8_t * dst_v, : x2
+// unsigned int dst_stride_v, : w3 == width
+// const uint8_t * src, : x4
+// unsigned int stride1, : w5 == 128
+// unsigned int stride2, : w6
+// unsigned int _x, : w7
+// unsigned int y, : [sp, #0]
+// unsigned int _w, : [sp, #8]
+// unsigned int h); : [sp, #16]
+
+function ff_rpi_sand8_lines_to_planar_c8, export=1
+ // w7 = width
+ ldr w7, [sp, #8]
+
+ // w15 contains the number of rows we need to process
+ // counts down
+ ldr w15, [sp, #16]
+
+ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
+ mov w8, w7
+ lsr w8, w8, #6
+
+ // number of pixels in block at the end of every row
+ // w9 = _w - (w8 * 64)
+ lsl w9, w8, #6
+ sub w9, w7, w9
+
+ // Number of bytes to skip at the end of each line to account for the dst stride
+ sub w12, w1, w7
+
+ // address delta to the beginning of the next block
+ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
+ lsl w10, w6, #7
+ sub w10, w10, #128
+
+ // w11 = row address start offset = 0
+ eor w11, w11, w11
+
+row_loop_c8:
+ // start of the first block within the current row
+ // x13 = row offset + src
+ mov x13, x4
+ add x13, x13, x11
+
+ // w14 = 0, processed block count
+ eor w14, w14, w14
+
+ cmp w8, #0
+ beq no_main_c8
+
+block_loop_c8:
+ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values
+ ld2 { v0.16b, v1.16b }, [x13], #32
+ ld2 { v2.16b, v3.16b }, [x13], #32
+ ld2 { v4.16b, v5.16b }, [x13], #32
+ ld2 { v6.16b, v7.16b }, [x13], #32
+
+ // swap registers so that we can write them out with a single instruction
+ mov v16.16b, v1.16b
+ mov v17.16b, v3.16b
+ mov v18.16b, v5.16b
+ mov v1.16b, v2.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v6.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+
+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64
+
+ // increment row counter and move src to the beginning of the next block
+ add w14, w14, #1
+ add x13, x13, x10
+
+ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
+ cmp w8, w14
+ bgt block_loop_c8
+
+no_main_c8:
+ // handle incomplete block at the end of every row
+ eor w5, w5, w5 // w5 = 0, point counter
+incomplete_block_loop_c8:
+ cmp w5, w9
+ bge incomplete_block_loop_end_c8
+
+ ldrb w1, [x13]
+ strb w1, [x0]
+ add x13, x13, #1
+
+ ldrb w1, [x13]
+ strb w1, [x2]
+ add x13, x13, #1
+
+ add x0, x0, #1
+ add x2, x2, #1
+
+ add w5, w5, #1
+ b incomplete_block_loop_c8
+incomplete_block_loop_end_c8:
+
+ // increase row_offset by stride1
+ add w11, w11, #128
+ add x0, x0, w12, sxtw
+ add x2, x2, w12, sxtw
+
+ // jump to row_loop_c8 iff the row count is smaller than the height
+ subs w15, w15, #1
+ bgt row_loop_c8
+
+ ret
+endfunc
+
+//void ff_rpi_sand30_lines_to_planar_y16(
+// uint8_t * dest, // [x0]
+// unsigned int dst_stride, // [w1] -> assumed to be equal to _w
+// const uint8_t * src, // [x2]
+// unsigned int src_stride1, // [w3] -> 128
+// unsigned int src_stride2, // [w4]
+// unsigned int _x, // [w5]
+// unsigned int y, // [w6]
+// unsigned int _w, // [w7]
+// unsigned int h); // [sp, #0]
+
+function ff_rpi_sand30_lines_to_planar_y16, export=1
+ stp x19, x20, [sp, #-48]!
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+
+ // w6 = argument h
+ ldr w6, [sp, #48]
+
+ // slice_inc = ((stride2 - 1) * stride1)
+ mov w5, w4
+ sub w5, w5, #1
+ lsl w5, w5, #7
+
+ // total number of bytes per row = (width / 3) * 4
+ mov w8, w7
+ mov w9, #3
+ udiv w8, w8, w9
+ lsl w8, w8, #2
+
+ // number of full 128 byte blocks to be processed
+ mov w9, #96
+ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
+
+ // w10 = number of full integers to process (4 bytes)
+ // w11 = remaining zero to two 10bit values still to copy over
+ mov w12, #96
+ mul w12, w9, w12
+ sub w12, w7, w12 // width - blocks*96 = remaining points per row
+ mov w11, #3
+ udiv w10, w12, w11 // full integers to process = w12 / 3
+ mul w11, w10, w11 // #integers *3
+ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3
+
+ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
+ // this is to efficiently copy incomplete blocks at the end of the rows
+ // the last row is handled explicitly to avoid writing out of bounds
+ add w22, w10, w11
+ cmp w22, #0
+ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
+ add w9, w9, w22
+ sub w6, w6, #1
+
+ // store in w20 the number of bytes we copy too much on every row
+ // when the width of the frame is not a multiple of 96 (128 bytes storing 96 10bit values)
+ mov w20, #96*2
+ mul w20, w20, w9
+ sub w20, w1, w20
+
+ mov w23, #0 // flag to check whether the last line had already been processed
+
+ // bitmask to clear the upper 6 bits of the result values
+ mov x19, #0x03ff03ff03ff03ff
+ dup v22.2d, x19
+
+ // row counter = 0
+ eor w12, w12, w12
+row_loop_y16:
+ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows
+ bge row_loop_y16_fin
+
+ mov x13, x2 // row src
+ eor w14, w14, w14 // full block counter
+block_loop_y16:
+ cmp w14, w9
+ bge block_loop_y16_fin
+
+ // load 64 bytes
+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
+
+ // process v0 and v1
+ xtn v16.4h, v0.4s
+ ushr v0.4s, v0.4s, #10
+ xtn v17.4h, v0.4s
+ ushr v0.4s, v0.4s, #10
+ xtn v18.4h, v0.4s
+
+ xtn2 v16.8h, v1.4s
+ and v16.16b, v16.16b, v22.16b
+ ushr v1.4s, v1.4s, #10
+ xtn2 v17.8h, v1.4s
+ and v17.16b, v17.16b, v22.16b
+ ushr v1.4s, v1.4s, #10
+ xtn2 v18.8h, v1.4s
+ and v18.16b, v18.16b, v22.16b
+
+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
+
+ // process v2 and v3
+ xtn v23.4h, v2.4s
+ ushr v2.4s, v2.4s, #10
+ xtn v24.4h, v2.4s
+ ushr v2.4s, v2.4s, #10
+ xtn v25.4h, v2.4s
+
+ xtn2 v23.8h, v3.4s
+ and v23.16b, v23.16b, v22.16b
+ ushr v3.4s, v3.4s, #10
+ xtn2 v24.8h, v3.4s
+ and v24.16b, v24.16b, v22.16b
+ ushr v3.4s, v3.4s, #10
+ xtn2 v25.8h, v3.4s
+ and v25.16b, v25.16b, v22.16b
+
+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
+
+ // load the second half of the block -> 64 bytes into registers v4-v7
+ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64
+
+ // process v4 and v5
+ xtn v16.4h, v4.4s
+ ushr v4.4s, v4.4s, #10
+ xtn v17.4h, v4.4s
+ ushr v4.4s, v4.4s, #10
+ xtn v18.4h, v4.4s
+
+ xtn2 v16.8h, v5.4s
+ and v16.16b, v16.16b, v22.16b
+ ushr v5.4s, v5.4s, #10
+ xtn2 v17.8h, v5.4s
+ and v17.16b, v17.16b, v22.16b
+ ushr v5.4s, v5.4s, #10
+ xtn2 v18.8h, v5.4s
+ and v18.16b, v18.16b, v22.16b
+
+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
+
+ // v6 and v7
+ xtn v23.4h, v6.4s
+ ushr v6.4s, v6.4s, #10
+ xtn v24.4h, v6.4s
+ ushr v6.4s, v6.4s, #10
+ xtn v25.4h, v6.4s
+
+ xtn2 v23.8h, v7.4s
+ and v23.16b, v23.16b, v22.16b
+ ushr v7.4s, v7.4s, #10
+ xtn2 v24.8h, v7.4s
+ and v24.16b, v24.16b, v22.16b
+ ushr v7.4s, v7.4s, #10
+ xtn2 v25.8h, v7.4s
+ and v25.16b, v25.16b, v22.16b
+
+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
+
+ add x13, x13, x5 // row src += slice_inc
+ add w14, w14, #1
+ b block_loop_y16
+block_loop_y16_fin:
+
+
+
+
+ add x2, x2, #128 // src += stride1 (start of the next row)
+ add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst
+ add w12, w12, #1
+ b row_loop_y16
+row_loop_y16_fin:
+
+ // check whether we have incomplete blocks at the end of every row
+ // in that case decrease row block count by one
+ // change height back to its original value (meaning increase it by 1)
+ // and jump back to another iteration of row_loop_y16
+
+ cmp w23, #1
+ beq row_loop_y16_fin2 // don't continue here if we already processed the last row
+ add w6, w6, #1 // increase height to the original value
+ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count
+ mov w23, #1
+ b row_loop_y16
+row_loop_y16_fin2:
+
+ sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr too far ahead, therefore re-add the difference
+
+ // now we've got to handle the last block in the last row
+ eor w12, w12, w12 // w12 = 0 = counter
+integer_loop_y16:
+ cmp w12, w10
+ bge integer_loop_y16_fin
+ ldr w14, [x13], #4
+ and w15, w14, #0x3ff
+ strh w15, [x0], #2
+ lsr w14, w14, #10
+ and w15, w14, #0x3ff
+ strh w15, [x0], #2
+ lsr w14, w14, #10
+ and w15, w14, #0x3ff
+ strh w15, [x0], #2
+ add w12, w12, #1
+ b integer_loop_y16
+integer_loop_y16_fin:
+
+final_values_y16:
+ // remaining point count = w11
+ ldr w14, [x13], #4
+ cmp w11, #0
+ beq final_values_y16_fin
+ and w15, w14, #0x3ff
+ strh w15, [x0], #2
+ cmp w11, #1
+ beq final_values_y16_fin
+ lsr w14, w14, #10
+ and w15, w14, #0x3ff
+ strh w15, [x0], #2
+final_values_y16_fin:
+
+ ldp x23, x24, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp], #48
+ ret
+endfunc
+
+//void ff_rpi_sand30_lines_to_planar_c16(
+// uint8_t * dst_u, // [x0]
+// unsigned int dst_stride_u, // [w1] == _w*2
+// uint8_t * dst_v, // [x2]
+// unsigned int dst_stride_v, // [w3] == _w*2
+// const uint8_t * src, // [x4]
+// unsigned int stride1, // [w5] == 128
+// unsigned int stride2, // [w6]
+// unsigned int _x, // [w7] == 0
+// unsigned int y, // [sp, #0] == 0
+// unsigned int _w, // [sp, #8] -> w3
+// unsigned int h); // [sp, #16] -> w7
+
+.macro rpi_sand30_lines_to_planar_c16_block_half
+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
+
+ xtn v4.4h, v0.4s
+ ushr v0.4s, v0.4s, #10
+ xtn v5.4h, v0.4s
+ ushr v0.4s, v0.4s, #10
+ xtn v6.4h, v0.4s
+ xtn2 v4.8h, v1.4s
+ ushr v1.4s, v1.4s, #10
+ xtn2 v5.8h, v1.4s
+ ushr v1.4s, v1.4s, #10
+ xtn2 v6.8h, v1.4s
+ and v4.16b, v4.16b, v16.16b
+ and v5.16b, v5.16b, v16.16b
+ and v6.16b, v6.16b, v16.16b
+ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
+
+ xtn v4.4h, v2.4s
+ ushr v2.4s, v2.4s, #10
+ xtn v5.4h, v2.4s
+ ushr v2.4s, v2.4s, #10
+ xtn v6.4h, v2.4s
+ xtn2 v4.8h, v3.4s
+ ushr v3.4s, v3.4s, #10
+ xtn2 v5.8h, v3.4s
+ ushr v3.4s, v3.4s, #10
+ xtn2 v6.8h, v3.4s
+ and v4.16b, v4.16b, v16.16b
+ and v5.16b, v5.16b, v16.16b
+ and v6.16b, v6.16b, v16.16b
+ st3 { v4.8h, v5.8h, v6.8h }, [sp]
+ sub sp, sp, #48
+.endm
+
+function ff_rpi_sand30_lines_to_planar_c16, export=1
+ stp x19, x20, [sp, #-48]!
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+
+ ldr w3, [sp, #48+8] // w3 = width
+ ldr w7, [sp, #48+16] // w7 = height
+
+ // reserve space on the stack for intermediate results
+ sub sp, sp, #256
+
+ // number of 128byte blocks per row, w8 = width / 48
+ mov w9, #48
+ udiv w8, w3, w9
+
+ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
+ mul w9, w8, w9
+ sub w9, w3, w9
+
+ // row offset, the beginning of the next row to process
+ eor w10, w10, w10
+
+ // offset to the beginning of the next block, w11 = stride2 * 128 - 128
+ lsl w11, w6, #7
+ sub w11, w11, #128
+
+ // decrease the height by one and in case of remaining pixels increase the block count by one
+ sub w7, w7, #1
+ cmp w9, #0
+ cset w19, ne // w19 == 1 iff remaining pixels != 0
+ add w8, w8, w19
+
+ // bytes we have to move dst back by at the end of every row
+ mov w21, #48*2
+ mul w21, w21, w8
+ sub w21, w1, w21
+
+ mov w20, #0 // w20 = flag, last row processed
+
+ mov x12, #0x03ff03ff03ff03ff
+ dup v16.2d, x12
+
+ // iterate through rows, row counter = w12 = 0
+ eor w12, w12, w12
+row_loop_c16:
+ cmp w12, w7
+ bge row_loop_c16_fin
+
+ // address of row data = src + row_offset
+ mov x13, x4
+ add x13, x13, x10
+
+ eor w14, w14, w14
+block_loop_c16:
+ cmp w14, w8
+ bge block_loop_c16_fin
+
+ rpi_sand30_lines_to_planar_c16_block_half
+
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp]
+ sub sp, sp, #64
+
+ st1 { v0.8h }, [x0], #16
+ st1 { v2.8h }, [x0], #16
+ st1 { v4.8h }, [x0], #16
+ st1 { v1.8h }, [x2], #16
+ st1 { v3.8h }, [x2], #16
+ st1 { v5.8h }, [x2], #16
+
+ rpi_sand30_lines_to_planar_c16_block_half
+
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp]
+ sub sp, sp, #64
+
+ st1 { v0.8h }, [x0], #16
+ st1 { v2.8h }, [x0], #16
+ st1 { v4.8h }, [x0], #16
+ st1 { v1.8h }, [x2], #16
+ st1 { v3.8h }, [x2], #16
+ st1 { v5.8h }, [x2], #16
+
+ add x13, x13, x11 // offset to next block
+ add w14, w14, #1
+ b block_loop_c16
+block_loop_c16_fin:
+
+ add w10, w10, #128
+ add w12, w12, #1
+ add x0, x0, w21, sxtw // move dst pointers back by x21
+ add x2, x2, w21, sxtw
+ b row_loop_c16
+row_loop_c16_fin:
+
+ cmp w20, #1
+ beq row_loop_c16_fin2
+ mov w20, #1
+ sub w8, w8, w19 // decrease block count by w19
+ add w7, w7, #1 // increase height
+ b row_loop_c16
+
+row_loop_c16_fin2:
+ sub x0, x0, w21, sxtw // re-add w21 in case of the last row
+ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
+
+ // last incomplete block to be finished
+ // read operations are fine, stride2 is more than large enough even if rem_pix is 0
+ rpi_sand30_lines_to_planar_c16_block_half
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp], #32
+ rpi_sand30_lines_to_planar_c16_block_half
+ ld2 { v0.8h, v1.8h }, [sp], #32
+ ld2 { v2.8h, v3.8h }, [sp], #32
+ ld2 { v4.8h, v5.8h }, [sp]
+ sub sp, sp, #160
+
+ mov x4, sp
+ eor w20, w20, w20
+rem_pix_c16_loop:
+ cmp w20, w9
+ bge rem_pix_c16_fin
+
+ ldr w22, [x4], #4
+ str w22, [x0], #2
+ lsr w22, w22, #16
+ str w22, [x2], #2
+
+ add w20, w20, #1
+ b rem_pix_c16_loop
+rem_pix_c16_fin:
+
+ add sp, sp, #256
+
+ ldp x23, x24, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp], #48
+ ret
+endfunc
+
+
+
+//void ff_rpi_sand30_lines_to_planar_p010(
+// uint8_t * dest,
+// unsigned int dst_stride,
+// const uint8_t * src,
+// unsigned int src_stride1,
+// unsigned int src_stride2,
+// unsigned int _x,
+// unsigned int y,
+// unsigned int _w,
+// unsigned int h);
+
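For readers following the assembly, the y8 path above amounts to copying each output
row one 128 byte column at a time, stepping stride1*stride2 bytes between columns.
A rough, generalised C sketch of that algorithm follows - it is illustrative only,
not part of the patch, and assumes _x == 0 as the asm fast path does.

/* Rough C equivalent of the y8 copy loop above (sketch only, assumes _x == 0). */
#include <stdint.h>
#include <string.h>

static void sand8_to_planar_y8_ref(uint8_t *dst, unsigned int dst_stride,
                                   const uint8_t *src,
                                   unsigned int src_stride1, /* 128 */
                                   unsigned int src_stride2, /* lines per column */
                                   unsigned int w, unsigned int h)
{
    for (unsigned int y = 0; y != h; ++y) {
        const uint8_t *s = src + y * src_stride1;  /* this row in column 0 */
        uint8_t *d = dst + y * dst_stride;

        for (unsigned int x = 0; x < w; x += src_stride1) {
            const unsigned int n = w - x < src_stride1 ? w - x : src_stride1;
            memcpy(d + x, s, n);                   /* full or trailing partial block */
            s += src_stride1 * src_stride2;        /* same row in the next column */
        }
    }
}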
diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
new file mode 100644
index 0000000000..b3aa481ea4
--- /dev/null
+++ b/libavutil/aarch64/rpi_sand_neon.h
@@ -0,0 +1,55 @@
+/*
+Copyright (c) 2021 Michael Eiler
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Authors: Michael Eiler <eiler.mike@gmail.com>
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
+ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
+ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
index 0324f6826d..0d5d203dc3 100644
--- a/libavutil/rpi_sand_fn_pw.h
+++ b/libavutil/rpi_sand_fn_pw.h
@@ -54,7 +54,7 @@ void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
const unsigned int w = _w;
const unsigned int mask = stride1 - 1;
-#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
+#if PW == 1 && HAVE_SAND_ASM
if (_x == 0) {
ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
src, stride1, stride2, _x, y, _w, h);
@@ -106,7 +106,7 @@ void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_strid
const unsigned int w = _w * 2;
const unsigned int mask = stride1 - 1;
-#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
+#if PW == 1 && HAVE_SAND_ASM
if (_x == 0) {
ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
src, stride1, stride2, _x, y, _w, h);
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
index ed0261b02f..1f543e9357 100644
--- a/libavutil/rpi_sand_fns.c
+++ b/libavutil/rpi_sand_fns.c
@@ -37,6 +37,9 @@ Authors: John Cox
#if ARCH_ARM && HAVE_NEON
#include "arm/rpi_sand_neon.h"
#define HAVE_SAND_ASM 1
+#elif ARCH_AARCH64 && HAVE_NEON
+#include "aarch64/rpi_sand_neon.h"
+#define HAVE_SAND_ASM 1
#else
#define HAVE_SAND_ASM 0
#endif
From 247025a42ae09d6c9c5d4128a5e4b288b7b3047c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 11:56:02 +0100
Subject: [PATCH 004/161] Add raw encoding for sand
---
libavcodec/raw.c | 6 +++
libavcodec/rawenc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 96 insertions(+), 2 deletions(-)
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index 1e5b48d1e0..1e689f9ee0 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -295,6 +295,12 @@ static const PixelFormatTag raw_pix_fmt_tags[] = {
{ AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
{ AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+ /* RPI (Might as well define for everything) */
+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
+ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') },
+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
+ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') },
+
{ AV_PIX_FMT_NONE, 0 },
};
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index 8c577006d9..594a77c42a 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -24,6 +24,7 @@
* Raw Video Encoder
*/
+#include "config.h"
#include "avcodec.h"
#include "codec_internal.h"
#include "encode.h"
@@ -33,6 +34,10 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
+#include "libavutil/avassert.h"
+#if CONFIG_SAND
+#include "libavutil/rpi_sand_fns.h"
+#endif
static av_cold int raw_encode_init(AVCodecContext *avctx)
{
@@ -46,12 +51,95 @@ static av_cold int raw_encode_init(AVCodecContext *avctx)
return 0;
}
+#if CONFIG_SAND
+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const int width = av_frame_cropped_width(frame);
+ const int height = av_frame_cropped_height(frame);
+ const int x0 = frame->crop_left;
+ const int y0 = frame->crop_top;
+ const int size = width * height * 3 / 2;
+ uint8_t * dst;
+ int ret;
+
+ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
+ dst += width * height;
+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
+ return 0;
+}
+
+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const int width = av_frame_cropped_width(frame);
+ const int height = av_frame_cropped_height(frame);
+ const int x0 = frame->crop_left;
+ const int y0 = frame->crop_top;
+ const int size = width * height * 3;
+ uint8_t * dst;
+ int ret;
+
+ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
+ dst += width * height * 2;
+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
+ return 0;
+}
+
+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+ const AVFrame *frame)
+{
+ const int width = av_frame_cropped_width(frame);
+ const int height = av_frame_cropped_height(frame);
+ const int x0 = frame->crop_left;
+ const int y0 = frame->crop_top;
+ const int size = width * height * 3;
+ uint8_t * dst;
+ int ret;
+
+ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
+ return ret;
+
+ dst = pkt->data;
+
+ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
+ dst += width * height * 2;
+ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
+ return 0;
+}
+#endif
+
+
static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
const AVFrame *frame, int *got_packet)
{
- int ret = av_image_get_buffer_size(frame->format,
- frame->width, frame->height, 1);
+ int ret;
+#if CONFIG_SAND
+ if (av_rpi_is_sand_frame(frame)) {
+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) :
+ av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) :
+ av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1;
+ *got_packet = (ret == 0);
+ return ret;
+ }
+#endif
+
+ ret = av_image_get_buffer_size(frame->format,
+ frame->width, frame->height, 1);
if (ret < 0)
return ret;
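The packet sizes requested above follow directly from the planar output layouts: the
8-bit path emits YUV420P (12 bits per pixel) while the 16- and 30-bit paths emit
YUV420P10 held in 16-bit samples (24 bits per pixel). As a worked example for an
assumed 1920x1080 frame:

  8-bit sand8:            1920 * 1080 * 3 / 2 = 3110400 bytes
  10-bit sand16 / sand30: 1920 * 1080 * 3     = 6220800 bytes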
From ac6961f424b56563dc793b6bc002a8c04cb1bc36 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 12:02:09 +0100
Subject: [PATCH 005/161] Deal with the lack of trivial sand cropping
---
fftools/ffmpeg.c | 4 ++--
fftools/ffmpeg_filter.c | 4 ++--
libavutil/frame.c | 11 +++++++++++
libavutil/frame.h | 10 ++++++++++
4 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index d721a5e721..15e084f0b2 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -1993,8 +1993,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref
av_channel_layout_compare(&ifilter->ch_layout, &frame->ch_layout);
break;
case AVMEDIA_TYPE_VIDEO:
- need_reinit |= ifilter->width != frame->width ||
- ifilter->height != frame->height;
+ need_reinit |= ifilter->width != av_frame_cropped_width(frame) ||
+ ifilter->height != av_frame_cropped_height(frame);
break;
}
diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
index 1f5bbf6c4d..f888307762 100644
--- a/fftools/ffmpeg_filter.c
+++ b/fftools/ffmpeg_filter.c
@@ -1281,8 +1281,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
ifilter->format = frame->format;
- ifilter->width = frame->width;
- ifilter->height = frame->height;
+ ifilter->width = av_frame_cropped_width(frame);
+ ifilter->height = av_frame_cropped_height(frame);
ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
ifilter->sample_rate = frame->sample_rate;
diff --git a/libavutil/frame.c b/libavutil/frame.c
index 9545477acc..48621e4098 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -16,6 +16,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
+
#include "channel_layout.h"
#include "avassert.h"
#include "buffer.h"
@@ -27,6 +29,9 @@
#include "mem.h"
#include "samplefmt.h"
#include "hwcontext.h"
+#if CONFIG_SAND
+#include "rpi_sand_fns.h"
+#endif
#if FF_API_OLD_CHANNEL_LAYOUT
#define CHECK_CHANNELS_CONSISTENCY(frame) \
@@ -874,6 +879,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
(frame->crop_top + frame->crop_bottom) >= frame->height)
return AVERROR(ERANGE);
+#if CONFIG_SAND
+ // Sand cannot be cropped - do not try
+ if (av_rpi_is_sand_format(frame->format))
+ return 0;
+#endif
+
desc = av_pix_fmt_desc_get(frame->format);
if (!desc)
return AVERROR_BUG;
diff --git a/libavutil/frame.h b/libavutil/frame.h
index 2580269549..3a9d323325 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -957,6 +957,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags);
*/
const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+
+static inline int av_frame_cropped_width(const AVFrame * const frame)
+{
+ return frame->width - (frame->crop_left + frame->crop_right);
+}
+static inline int av_frame_cropped_height(const AVFrame * const frame)
+{
+ return frame->height - (frame->crop_top + frame->crop_bottom);
+}
+
/**
* @}
*/
From 9a08431f7790507b0374d9585dfc736000c1bd42 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 12:31:16 +0100
Subject: [PATCH 006/161] Add an unsand filter
---
configure | 1 +
libavfilter/Makefile | 1 +
libavfilter/allfilters.c | 1 +
libavfilter/buffersrc.c | 2 +-
libavfilter/vf_unsand.c | 228 +++++++++++++++++++++++++++++++++++++++
5 files changed, 232 insertions(+), 1 deletion(-)
create mode 100644 libavfilter/vf_unsand.c
diff --git a/configure b/configure
index 27112ced58..7712482bd5 100755
--- a/configure
+++ b/configure
@@ -3754,6 +3754,7 @@ tonemap_opencl_filter_deps="opencl const_nan"
transpose_opencl_filter_deps="opencl"
transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
transpose_vulkan_filter_deps="vulkan spirv_compiler"
+unsand_filter_select="sand"
unsharp_opencl_filter_deps="opencl"
uspp_filter_deps="gpl avcodec"
vaguedenoiser_filter_deps="gpl"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index b3d3d981dd..c14fc995a0 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -518,6 +518,7 @@ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o
OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += vf_transpose_vulkan.o vulkan.o vulkan_filter.o
OBJS-$(CONFIG_TRIM_FILTER) += trim.o
OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o
+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o
OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o
OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \
opencl/unsharp.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index d7db46c2af..b990a00152 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -490,6 +490,7 @@ extern const AVFilter ff_vf_trim;
extern const AVFilter ff_vf_unpremultiply;
extern const AVFilter ff_vf_unsharp;
extern const AVFilter ff_vf_unsharp_opencl;
+extern const AVFilter ff_vf_unsand;
extern const AVFilter ff_vf_untile;
extern const AVFilter ff_vf_uspp;
extern const AVFilter ff_vf_v360;
diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
index ba17450b93..0dbe5d2335 100644
--- a/libavfilter/buffersrc.c
+++ b/libavfilter/buffersrc.c
@@ -201,7 +201,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
switch (ctx->outputs[0]->type) {
case AVMEDIA_TYPE_VIDEO:
- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
frame->format, frame->pts);
break;
case AVMEDIA_TYPE_AUDIO:
diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
new file mode 100644
index 0000000000..7100f2fc9b
--- /dev/null
+++ b/libavfilter/vf_unsand.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2007 Bobby Bingham
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * unsand video filter - convert sand pixel formats to planar YUV
+ */
+
+#include <string.h>
+
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+typedef struct UnsandContext {
+ const AVClass *class;
+} UnsandContext;
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+// UnsandContext *s = ctx->priv;
+}
+
+static av_cold int init(AVFilterContext *ctx)
+{
+// UnsandContext *s = ctx->priv;
+
+ return 0;
+}
+
+
+static int filter_frame(AVFilterLink *link, AVFrame *in)
+{
+ AVFilterLink * const outlink = link->dst->outputs[0];
+ AVFrame *out = NULL;
+ int rv = 0;
+
+ if (outlink->format == in->format) {
+ // If nothing to do then do nothing
+ out = in;
+ }
+ else
+ {
+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
+ {
+ rv = AVERROR(ENOMEM);
+ goto fail;
+ }
+ if (av_rpi_sand_to_planar_frame(out, in) != 0)
+ {
+ rv = -1;
+ goto fail;
+ }
+
+ av_frame_free(&in);
+ }
+
+ return ff_filter_frame(outlink, out);
+
+fail:
+ av_frame_free(&out);
+ av_frame_free(&in);
+ return rv;
+}
+
+#if 0
+static void dump_fmts(const AVFilterFormats * fmts)
+{
+ int i;
+ if (fmts== NULL) {
+ printf("NULL\n");
+ return;
+ }
+ for (i = 0; i < fmts->nb_formats; ++i) {
+ printf(" %d", fmts->formats[i]);
+ }
+ printf("\n");
+}
+#endif
+
+static int query_formats(AVFilterContext *ctx)
+{
+// UnsandContext *s = ctx->priv;
+ int ret;
+
+ // If we aren't connected at both ends then just do nothing
+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
+ return 0;
+
+ // Our output formats depend on our input formats and we can't/don't
+ // want to convert between bit depths so we need to wait for the source
+ // to have an opinion before we do
+ if (ctx->inputs[0]->incfg.formats == NULL)
+ return AVERROR(EAGAIN);
+
+ // Accept anything
+ if (ctx->inputs[0]->outcfg.formats == NULL &&
+ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0)
+ return ret;
+
+ // Filter out sand formats
+
+ // Generate a container if we don't already have one
+ if (ctx->outputs[0]->incfg.formats == NULL)
+ {
+ // Somewhat rubbish way of ensuring we have a good structure
+ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
+ AVFilterFormats *formats = ff_make_format_list(out_fmts);
+
+ if (formats == NULL)
+ return AVERROR(ENOMEM);
+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
+ return ret;
+ }
+
+ // Replace old format list with new filtered list derived from what our
+ // input says it can do
+ {
+ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats;
+ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats;
+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
+ int i;
+ int n = 0;
+ int seen_420p = 0;
+ int seen_420p10 = 0;
+
+ for (i = 0; i < src_ff->nb_formats; ++i) {
+ const enum AVPixelFormat f = src_ff->formats[i];
+
+ switch (f){
+ case AV_PIX_FMT_YUV420P:
+ case AV_PIX_FMT_SAND128:
+ case AV_PIX_FMT_RPI4_8:
+ if (!seen_420p) {
+ seen_420p = 1;
+ dst_fmts[n++] = AV_PIX_FMT_YUV420P;
+ }
+ break;
+ case AV_PIX_FMT_SAND64_10:
+ case AV_PIX_FMT_YUV420P10:
+ case AV_PIX_FMT_RPI4_10:
+ if (!seen_420p10) {
+ seen_420p10 = 1;
+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
+ }
+ break;
+ default:
+ dst_fmts[n++] = f;
+ break;
+ }
+ }
+
+ av_freep(&dst_ff->formats);
+ dst_ff->formats = dst_fmts;
+ dst_ff->nb_formats = n;
+ }
+
+// printf("Unsand: %s calc: ", __func__);
+// dump_fmts(ctx->outputs[0]->incfg.formats);
+
+ return 0;
+}
+
+
+#define OFFSET(x) offsetof(UnsandContext, x)
+static const AVOption unsand_options[] = {
+ { NULL }
+};
+
+
+AVFILTER_DEFINE_CLASS(unsand);
+
+static const AVFilterPad avfilter_vf_unsand_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .filter_frame = filter_frame,
+ },
+};
+
+static const AVFilterPad avfilter_vf_unsand_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO
+ },
+};
+
+const AVFilter ff_vf_unsand = {
+ .name = "unsand",
+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
+
+ .init = init,
+ .uninit = uninit,
+
+ FILTER_QUERY_FUNC(query_formats),
+
+ .priv_size = sizeof(UnsandContext),
+ .priv_class = &unsand_class,
+
+ FILTER_INPUTS(avfilter_vf_unsand_inputs),
+ FILTER_OUTPUTS(avfilter_vf_unsand_outputs),
+};
+
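Since the filter is only built when CONFIG_UNSAND_FILTER is selected, application code
would normally look it up by name at run time. The fragment below is a minimal sketch
of dropping an instance into an existing filter graph - the instance name and the
surrounding graph setup are assumptions, not code from this patch.

/* Illustrative only: insert an "unsand" instance into an existing graph. */
#include <libavfilter/avfilter.h>
#include <libavutil/error.h>

static int add_unsand(AVFilterGraph *graph, AVFilterContext **pctx)
{
    const AVFilter *f = avfilter_get_by_name("unsand");
    if (!f)
        return AVERROR_FILTER_NOT_FOUND;   /* filter not compiled in */
    return avfilter_graph_create_filter(pctx, f, "unsand0", NULL, NULL, graph);
}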
From 6e61007b19544c573f1c2a4c6060d3d24b8d500e Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 12:37:07 +0100
Subject: [PATCH 007/161] Reduce mmal compile warnings
---
libavcodec/mmaldec.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index 3092f58510..6f41b41ac4 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -24,6 +24,9 @@
* MMAL Video Decoder
*/
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
#include <bcm_host.h>
#include <interface/mmal/mmal.h>
#include <interface/mmal/mmal_parameters_video.h>
@@ -31,6 +34,7 @@
#include <interface/mmal/util/mmal_util_params.h>
#include <interface/mmal/util/mmal_default_components.h>
#include <interface/mmal/vc/mmal_vc_api.h>
+#pragma GCC diagnostic pop
#include <stdatomic.h>
#include "avcodec.h"
From 01aff455665e8f889330519096912ad0005add3c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 17:56:16 +0100
Subject: [PATCH 008/161] Add chroma location to hevc parse
---
libavcodec/hevc_parser.c | 13 +++++++++++++
libavcodec/hevcdec.c | 13 +++++++++++++
2 files changed, 26 insertions(+)
diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
index 59f9a0ff3e..4ae7222e8b 100644
--- a/libavcodec/hevc_parser.c
+++ b/libavcodec/hevc_parser.c
@@ -97,6 +97,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
avctx->profile = ps->sps->ptl.general_ptl.profile_idc;
avctx->level = ps->sps->ptl.general_ptl.level_idc;
+ if (ps->sps->chroma_format_idc == 1) {
+ avctx->chroma_sample_location = ps->sps->vui.common.chroma_loc_info_present_flag ?
+ ps->sps->vui.common.chroma_sample_loc_type_top_field + 1 :
+ AVCHROMA_LOC_LEFT;
+ }
+ else if (ps->sps->chroma_format_idc == 2 ||
+ ps->sps->chroma_format_idc == 3) {
+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
+ }
+ else {
+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
+ }
+
if (ps->vps->vps_timing_info_present_flag) {
num = ps->vps->vps_num_units_in_tick;
den = ps->vps->vps_time_scale;
diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
index 567e8d81d4..b6cfea64d3 100644
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@@ -347,6 +347,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps)
else
avctx->color_range = AVCOL_RANGE_MPEG;
+ if (sps->chroma_format_idc == 1) {
+ avctx->chroma_sample_location = sps->vui.common.chroma_loc_info_present_flag ?
+ sps->vui.common.chroma_sample_loc_type_top_field + 1 :
+ AVCHROMA_LOC_LEFT;
+ }
+ else if (sps->chroma_format_idc == 2 ||
+ sps->chroma_format_idc == 3) {
+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
+ }
+ else {
+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
+ }
+
if (sps->vui.common.colour_description_present_flag) {
avctx->color_primaries = sps->vui.common.colour_primaries;
avctx->color_trc = sps->vui.common.transfer_characteristics;
From c80aad5d2fb373f7564e4257b1272f2decb06dd0 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 26 Sep 2022 18:20:50 +0100
Subject: [PATCH 009/161] hwaccel: Add .abort_frame & use in hevcdec
---
libavcodec/avcodec.h | 11 +++++++++++
libavcodec/hevcdec.c | 7 ++++++-
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 39881a1d2b..32bc78e2be 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -2221,6 +2221,17 @@ typedef struct AVHWAccel {
* that avctx->hwaccel_priv_data is invalid.
*/
int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
+
+ /**
+ * Called if parsing fails
+ *
+ * An error has occurred, end_frame will not be called
+ * start_frame & decode_slice may or may not have been called
+ * Optional
+ *
+ * @param avctx the codec context
+ */
+ void (*abort_frame)(AVCodecContext *avctx);
} AVHWAccel;
/**
diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
index b6cfea64d3..8a0246fa21 100644
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@@ -3375,8 +3375,13 @@ static int hevc_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
s->ref = NULL;
ret = decode_nal_units(s, avpkt->data, avpkt->size);
- if (ret < 0)
+ if (ret < 0) {
+ // Ensure that hwaccel knows this frame is over
+ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame)
+ s->avctx->hwaccel->abort_frame(s->avctx);
+
return ret;
+ }
if (avctx->hwaccel) {
if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
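A hwaccel opts in simply by pointing the new member at a handler in its AVHWAccel
definition. The fragment below is hypothetical - all names are placeholders rather
than code from this patch - and only shows where the hook sits.

/* Hypothetical hwaccel wiring for the new callback (placeholder names). */
#include "avcodec.h"

static void xyz_hevc_abort_frame(AVCodecContext *avctx)
{
    /* Drop any per-frame state built up by start_frame/decode_slice. */
    (void)avctx;
}

const AVHWAccel ff_hevc_xyz_hwaccel = {
    .name        = "hevc_xyz",
    .type        = AVMEDIA_TYPE_VIDEO,
    .id          = AV_CODEC_ID_HEVC,
    .pix_fmt     = AV_PIX_FMT_DRM_PRIME,
    .abort_frame = xyz_hevc_abort_frame,
    /* .start_frame, .decode_slice, .end_frame etc. as usual */
};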
From 317722fd652d9a1c1700319c80fc71acf68ddde6 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 26 Sep 2022 18:26:17 +0100
Subject: [PATCH 010/161] hwaccel: Add CAP_MT_SAFE for accels that can use
multi-thread
---
libavcodec/hwconfig.h | 1 +
libavcodec/pthread_frame.c | 7 +++++--
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
index 721424912c..c43ad55245 100644
--- a/libavcodec/hwconfig.h
+++ b/libavcodec/hwconfig.h
@@ -24,6 +24,7 @@
#define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
+#define HWACCEL_CAP_MT_SAFE (1 << 1)
typedef struct AVCodecHWConfigInternal {
diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
index d9d5afaa82..2cc89a41f5 100644
--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@@ -204,7 +204,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
/* if the previous thread uses hwaccel then we take the lock to ensure
* the threads don't run concurrently */
- if (avctx->hwaccel) {
+ if (avctx->hwaccel &&
+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
pthread_mutex_lock(&p->parent->hwaccel_mutex);
p->hwaccel_serializing = 1;
}
@@ -590,7 +591,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
- if (avctx->hwaccel && !p->hwaccel_serializing) {
+ if (avctx->hwaccel &&
+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
+ !p->hwaccel_serializing) {
pthread_mutex_lock(&p->parent->hwaccel_mutex);
p->hwaccel_serializing = 1;
}
From 9005b263450e154a5ec5258fda17d5998fe7896b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 17:59:08 +0100
Subject: [PATCH 011/161] Weak link utils
---
libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++
libavcodec/weak_link.h | 23 ++++++++++
2 files changed, 125 insertions(+)
create mode 100644 libavcodec/weak_link.c
create mode 100644 libavcodec/weak_link.h
diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
new file mode 100644
index 0000000000..f234a985b9
--- /dev/null
+++ b/libavcodec/weak_link.c
@@ -0,0 +1,102 @@
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include "weak_link.h"
+
+struct ff_weak_link_master {
+ atomic_int ref_count; /* 0 is single ref for easier atomics */
+ pthread_rwlock_t lock;
+ void * ptr;
+};
+
+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
+{
+ return (struct ff_weak_link_master *)c;
+}
+
+struct ff_weak_link_master * ff_weak_link_new(void * p)
+{
+ struct ff_weak_link_master * w = malloc(sizeof(*w));
+ if (!w)
+ return NULL;
+ w->ptr = p;
+ if (pthread_rwlock_init(&w->lock, NULL)) {
+ free(w);
+ return NULL;
+ }
+ return w;
+}
+
+static void weak_link_do_unref(struct ff_weak_link_master * const w)
+{
+ int n = atomic_fetch_sub(&w->ref_count, 1);
+ if (n)
+ return;
+
+ pthread_rwlock_destroy(&w->lock);
+ free(w);
+}
+
+// Unref & break link
+void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
+{
+ struct ff_weak_link_master * const w = *ppLink;
+ if (!w)
+ return;
+
+ *ppLink = NULL;
+ pthread_rwlock_wrlock(&w->lock);
+ w->ptr = NULL;
+ pthread_rwlock_unlock(&w->lock);
+
+ weak_link_do_unref(w);
+}
+
+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
+{
+ if (!w)
+ return NULL;
+ atomic_fetch_add(&w->ref_count, 1);
+ return (struct ff_weak_link_client*)w;
+}
+
+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
+{
+ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
+ if (!w)
+ return;
+
+ *ppLink = NULL;
+ weak_link_do_unref(w);
+}
+
+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
+{
+ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
+
+ if (!w)
+ return NULL;
+
+ if (pthread_rwlock_rdlock(&w->lock))
+ goto broken;
+
+ if (w->ptr)
+ return w->ptr;
+
+ pthread_rwlock_unlock(&w->lock);
+
+broken:
+ *ppLink = NULL;
+ weak_link_do_unref(w);
+ return NULL;
+}
+
+// Ignores a NULL c (so can be on the return path of both broken & live links)
+void ff_weak_link_unlock(struct ff_weak_link_client * c)
+{
+ struct ff_weak_link_master * const w = weak_link_x(c);
+ if (w)
+ pthread_rwlock_unlock(&w->lock);
+}
+
+
diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h
new file mode 100644
index 0000000000..415b6a27a0
--- /dev/null
+++ b/libavcodec/weak_link.h
@@ -0,0 +1,23 @@
+struct ff_weak_link_master;
+struct ff_weak_link_client;
+
+struct ff_weak_link_master * ff_weak_link_new(void * p);
+void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
+
+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
+
+// Returns NULL if link broken - in this case it will also zap
+// *ppLink and unref the weak_link.
+// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
+//
+// The above does mean that there is a race if this is called simultaneously
+// by two threads using the same weak_link_client (so don't do that)
+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
+void ff_weak_link_unlock(struct ff_weak_link_client * c);
+
+
+
+
+
+
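A brief usage sketch of the API above: the owner publishes itself with ff_weak_link_new(),
clients take refs with ff_weak_link_ref() and dereference only between lock/unlock so the
owner cannot vanish under them. The owner/client structures below are assumptions for
illustration, not code from this patch.

/* Illustrative only - the owner/client shapes are assumptions, not patch code. */
#include "weak_link.h"

struct owner {
    struct ff_weak_link_master *link;
    int value;
};

/* Owner side: publish on init, revoke on close. */
static int owner_init(struct owner *o)
{
    o->link = ff_weak_link_new(o);
    return o->link ? 0 : -1;
}

static void owner_close(struct owner *o)
{
    ff_weak_link_break(&o->link);      /* clients now get NULL from lock() */
}

/* Client side: ref obtained earlier via ff_weak_link_ref(o->link),
 * released when finished with ff_weak_link_unref(&ref). */
static int client_read(struct ff_weak_link_client **pref)
{
    struct owner *o = ff_weak_link_lock(pref);
    int v = -1;

    if (o) {
        v = o->value;                  /* owner is pinned while we hold the lock */
        ff_weak_link_unlock(*pref);
    }
    return v;
}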
From 824be1710ca96d97c86836fdac0e7dcd28a4b92e Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 19:23:26 +0100
Subject: [PATCH 012/161] Add v4l2_req V4L2 request H265 drm_prime decode
Has the ability to switch between kernel API versions at runtime. This
could be removed later once there is no chance of usage on an old
kernel.
---
configure | 14 +
libavcodec/Makefile | 4 +
libavcodec/hevc-ctrls-v1.h | 229 +++++
libavcodec/hevc-ctrls-v2.h | 257 +++++
libavcodec/hevcdec.c | 10 +
libavcodec/hwaccels.h | 1 +
libavcodec/hwconfig.h | 2 +
libavcodec/v4l2_req_decode_q.c | 84 ++
libavcodec/v4l2_req_decode_q.h | 25 +
libavcodec/v4l2_req_devscan.c | 449 +++++++++
libavcodec/v4l2_req_devscan.h | 23 +
libavcodec/v4l2_req_dmabufs.c | 266 ++++++
libavcodec/v4l2_req_dmabufs.h | 40 +
libavcodec/v4l2_req_hevc_v1.c | 3 +
libavcodec/v4l2_req_hevc_v2.c | 3 +
libavcodec/v4l2_req_hevc_vx.c | 1213 +++++++++++++++++++++++
libavcodec/v4l2_req_media.c | 1596 +++++++++++++++++++++++++++++++
libavcodec/v4l2_req_media.h | 151 +++
libavcodec/v4l2_req_pollqueue.c | 361 +++++++
libavcodec/v4l2_req_pollqueue.h | 18 +
libavcodec/v4l2_req_utils.h | 27 +
libavcodec/v4l2_request_hevc.c | 297 ++++++
libavcodec/v4l2_request_hevc.h | 102 ++
23 files changed, 5175 insertions(+)
create mode 100644 libavcodec/hevc-ctrls-v1.h
create mode 100644 libavcodec/hevc-ctrls-v2.h
create mode 100644 libavcodec/v4l2_req_decode_q.c
create mode 100644 libavcodec/v4l2_req_decode_q.h
create mode 100644 libavcodec/v4l2_req_devscan.c
create mode 100644 libavcodec/v4l2_req_devscan.h
create mode 100644 libavcodec/v4l2_req_dmabufs.c
create mode 100644 libavcodec/v4l2_req_dmabufs.h
create mode 100644 libavcodec/v4l2_req_hevc_v1.c
create mode 100644 libavcodec/v4l2_req_hevc_v2.c
create mode 100644 libavcodec/v4l2_req_hevc_vx.c
create mode 100644 libavcodec/v4l2_req_media.c
create mode 100644 libavcodec/v4l2_req_media.h
create mode 100644 libavcodec/v4l2_req_pollqueue.c
create mode 100644 libavcodec/v4l2_req_pollqueue.h
create mode 100644 libavcodec/v4l2_req_utils.h
create mode 100644 libavcodec/v4l2_request_hevc.c
create mode 100644 libavcodec/v4l2_request_hevc.h
diff --git a/configure b/configure
index 7712482bd5..199aa2b3d5 100755
--- a/configure
+++ b/configure
@@ -281,6 +281,7 @@ External library support:
if openssl, gnutls or mbedtls is not used [no]
--enable-libtwolame enable MP2 encoding via libtwolame [no]
--enable-libuavs3d enable AVS3 decoding via libuavs3d [no]
+ --enable-libudev enable libudev [no]
--enable-libv4l2 enable libv4l2/v4l-utils [no]
--enable-libvidstab enable video stabilization using vid.stab [no]
--enable-libvmaf enable vmaf filter via libvmaf [no]
@@ -351,6 +352,7 @@ External library support:
--enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no]
--enable-rkmpp enable Rockchip Media Process Platform code [no]
--disable-v4l2-m2m disable V4L2 mem2mem code [autodetect]
+ --enable-v4l2-request enable V4L2 request API code [no]
--disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
--disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
--disable-videotoolbox disable VideoToolbox code [autodetect]
@@ -1858,6 +1860,7 @@ EXTERNAL_LIBRARY_LIST="
libtheora
libtwolame
libuavs3d
+ libudev
libv4l2
libvmaf
libvorbis
@@ -1914,6 +1917,7 @@ HWACCEL_LIBRARY_LIST="
mmal
omx
opencl
+ v4l2_request
"
DOCUMENT_LIST="
@@ -3002,6 +3006,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext"
dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
ffnvcodec_deps_any="libdl LoadLibrary"
nvdec_deps="ffnvcodec"
+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
vaapi_x11_deps="xlib_x11"
videotoolbox_hwaccel_deps="videotoolbox pthreads"
videotoolbox_hwaccel_extralibs="-framework QuartzCore"
@@ -3045,6 +3050,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
hevc_dxva2_hwaccel_select="hevc_decoder"
hevc_nvdec_hwaccel_deps="nvdec"
hevc_nvdec_hwaccel_select="hevc_decoder"
+hevc_v4l2request_hwaccel_deps="v4l2_request"
+hevc_v4l2request_hwaccel_select="hevc_decoder"
hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
hevc_vaapi_hwaccel_select="hevc_decoder"
hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
@@ -6696,6 +6703,7 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame
{ check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode
+enabled libudev && require_pkg_config libudev libudev libudev.h udev_new
enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl
enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit
enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init
@@ -6798,6 +6806,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r
{ enabled libdrm ||
die "ERROR: rkmpp requires --enable-libdrm"; }
}
+enabled v4l2_request && { enabled libdrm ||
+ die "ERROR: v4l2-request requires --enable-libdrm"; } &&
+ { enabled libudev ||
+ die "ERROR: v4l2-request requires --enable-libudev"; }
enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
@@ -6880,6 +6892,8 @@ if enabled v4l2_m2m; then
check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
fi
+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
check_headers sys/videoio.h
test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 389253f5d0..2d440b5648 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -170,6 +170,8 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
+OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
+ v4l2_req_devscan.o weak_link.o
OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o
@@ -996,6 +998,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o
+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\
+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o
OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o
OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h
new file mode 100644
index 0000000000..72cbba0953
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v1.h
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are the HEVC state controls for use with stateless HEVC
+ * codec drivers.
+ *
+ * It turns out that these structs are not stable yet and will undergo
+ * more changes. So keep them private until they are stable and ready to
+ * become part of the official public API.
+ */
+
+#ifndef _HEVC_CTRLS_H_
+#define _HEVC_CTRLS_H_
+
+#include <linux/videodev2.h>
+
+/* The pixel format isn't stable at the moment and will likely be renamed. */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
+
+/* enum v4l2_ctrl_type type values */
+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
+
+enum v4l2_mpeg_video_hevc_decode_mode {
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_mpeg_video_hevc_start_code {
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/* The controls are not stable at the moment and will likely be reworked. */
+struct v4l2_ctrl_hevc_sps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+
+struct v4l2_ctrl_hevc_pps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ __u8 num_extra_slice_header_bits;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+
+ __u8 padding[4];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 rps;
+ __u8 field_pic;
+ __u16 pic_order_cnt[2];
+ __u8 padding[2];
+};
+
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 padding[6];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_bit_offset;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __u16 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 num_active_dpb_entries;
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ __u8 num_rps_poc_st_curr_before;
+ __u8 num_rps_poc_st_curr_after;
+ __u8 num_rps_poc_lt_curr;
+
+ __u8 padding;
+
+ __u32 entry_point_offset_minus1[256];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u64 flags;
+};
+
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+#endif
diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h
new file mode 100644
index 0000000000..7cbbbf055f
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v2.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are the HEVC state controls for use with stateless HEVC
+ * codec drivers.
+ *
+ * It turns out that these structs are not stable yet and will undergo
+ * more changes. So keep them private until they are stable and ready to
+ * become part of the official public API.
+ */
+
+#ifndef _HEVC_CTRLS_H_
+#define _HEVC_CTRLS_H_
+
+#include <linux/videodev2.h>
+
+/* The pixel format isn't stable at the moment and will likely be renamed. */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
+
+/* enum v4l2_ctrl_type type values */
+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
+
+enum v4l2_mpeg_video_hevc_decode_mode {
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_mpeg_video_hevc_start_code {
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/* The controls are not stable at the moment and will likely be reworked. */
+struct v4l2_ctrl_hevc_sps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
+
+struct v4l2_ctrl_hevc_pps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ __u8 num_extra_slice_header_bits;
+ __u8 num_ref_idx_l0_default_active_minus1;
+ __u8 num_ref_idx_l1_default_active_minus1;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+
+ __u8 padding[4];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 rps;
+ __u8 field_pic;
+ __u16 pic_order_cnt[2];
+ __u8 padding[2];
+};
+
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 padding[6];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_bit_offset;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __u16 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ __u8 padding[5];
+
+ __u32 entry_point_offset_minus1[256];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
+
+struct v4l2_ctrl_hevc_decode_params {
+ __s32 pic_order_cnt_val;
+ __u8 num_active_dpb_entries;
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 num_poc_st_curr_before;
+ __u8 num_poc_st_curr_after;
+ __u8 num_poc_lt_curr;
+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u64 flags;
+};
+
+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
+/*
+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
+ * the number of data (in bits) to skip in the
+ * slice segment header.
+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
+ * to before syntax element "slice_temporal_mvp_enabled_flag".
+ * If IDR, the skipped bits are just "pic_output_flag"
+ * (separate_colour_plane_flag is not supported).
+ */
+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
+
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+#endif
diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
index 8a0246fa21..2867cb2e16 100644
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@@ -416,6 +416,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
CONFIG_HEVC_NVDEC_HWACCEL + \
+ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
CONFIG_HEVC_VAAPI_HWACCEL + \
CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
CONFIG_HEVC_VDPAU_HWACCEL)
@@ -442,6 +443,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
#endif
#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
*fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+#endif
+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
+ *fmt++ = AV_PIX_FMT_DRM_PRIME;
#endif
break;
case AV_PIX_FMT_YUV420P10:
@@ -463,6 +467,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
#endif
#if CONFIG_HEVC_NVDEC_HWACCEL
*fmt++ = AV_PIX_FMT_CUDA;
+#endif
+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
+ *fmt++ = AV_PIX_FMT_DRM_PRIME;
#endif
break;
case AV_PIX_FMT_YUV444P:
@@ -3749,6 +3756,9 @@ const FFCodec ff_hevc_decoder = {
#if CONFIG_HEVC_NVDEC_HWACCEL
HWACCEL_NVDEC(hevc),
#endif
+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
+ HWACCEL_V4L2REQUEST(hevc),
+#endif
#if CONFIG_HEVC_VAAPI_HWACCEL
HWACCEL_VAAPI(hevc),
#endif
diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
index aca55831f3..f32d1c4ec4 100644
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
extern const AVHWAccel ff_hevc_dxva2_hwaccel;
extern const AVHWAccel ff_hevc_nvdec_hwaccel;
+extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
extern const AVHWAccel ff_hevc_vaapi_hwaccel;
extern const AVHWAccel ff_hevc_vdpau_hwaccel;
extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
index c43ad55245..b8aa383071 100644
--- a/libavcodec/hwconfig.h
+++ b/libavcodec/hwconfig.h
@@ -71,6 +71,8 @@ typedef struct AVCodecHWConfigInternal {
HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
#define HWACCEL_NVDEC(codec) \
HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
+#define HWACCEL_V4L2REQUEST(codec) \
+ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel)
#define HWACCEL_VAAPI(codec) \
HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
#define HWACCEL_VDPAU(codec) \
diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c
new file mode 100644
index 0000000000..5b3fb958fa
--- /dev/null
+++ b/libavcodec/v4l2_req_decode_q.c
@@ -0,0 +1,84 @@
+#include <memory.h>
+#include <semaphore.h>
+#include <pthread.h>
+
+#include "v4l2_req_decode_q.h"
+
+int decode_q_in_q(const req_decode_ent * const d)
+{
+ return d->in_q;
+}
+
+void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
+{
+ pthread_mutex_lock(&q->q_lock);
+ if (!q->head) {
+ q->head = d;
+ q->tail = d;
+ d->prev = NULL;
+ }
+ else {
+ q->tail->next = d;
+ d->prev = q->tail;
+ q->tail = d;
+ }
+ d->next = NULL;
+ d->in_q = 1;
+ pthread_mutex_unlock(&q->q_lock);
+}
+
+// Remove entry from Q - if it was the head, wake up anything that was waiting
+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
+{
+ int try_signal = 0;
+
+ if (!d->in_q)
+ return;
+
+ pthread_mutex_lock(&q->q_lock);
+ if (d->prev)
+ d->prev->next = d->next;
+ else {
+ try_signal = 1; // Only need to signal if we were head
+ q->head = d->next;
+ }
+
+ if (d->next)
+ d->next->prev = d->prev;
+ else
+ q->tail = d->prev;
+
+ // Not strictly needed but makes debug easier
+ d->next = NULL;
+ d->prev = NULL;
+ d->in_q = 0;
+ pthread_mutex_unlock(&q->q_lock);
+
+ if (try_signal)
+ pthread_cond_broadcast(&q->q_cond);
+}
+
+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
+{
+ pthread_mutex_lock(&q->q_lock);
+
+ while (q->head != d)
+ pthread_cond_wait(&q->q_cond, &q->q_lock);
+
+ pthread_mutex_unlock(&q->q_lock);
+}
+
+void decode_q_uninit(req_decode_q * const q)
+{
+ pthread_mutex_destroy(&q->q_lock);
+ pthread_cond_destroy(&q->q_cond);
+}
+
+void decode_q_init(req_decode_q * const q)
+{
+ memset(q, 0, sizeof(*q));
+ pthread_mutex_init(&q->q_lock, NULL);
+ pthread_cond_init(&q->q_cond, NULL);
+}
+
+
diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h
new file mode 100644
index 0000000000..af7bbe1de4
--- /dev/null
+++ b/libavcodec/v4l2_req_decode_q.h
@@ -0,0 +1,25 @@
+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
+#define AVCODEC_V4L2_REQ_DECODE_Q_H
+
+typedef struct req_decode_ent {
+ struct req_decode_ent * next;
+ struct req_decode_ent * prev;
+ int in_q;
+} req_decode_ent;
+
+typedef struct req_decode_q {
+ pthread_mutex_t q_lock;
+ pthread_cond_t q_cond;
+ req_decode_ent * head;
+ req_decode_ent * tail;
+} req_decode_q;
+
+int decode_q_in_q(const req_decode_ent * const d);
+void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
+void decode_q_uninit(req_decode_q * const q);
+void decode_q_init(req_decode_q * const q);
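+
+// Usage sketch (inferred from the implementation in v4l2_req_decode_q.c,
+// not a formal contract): the queue serialises completion in submission
+// order - add an entry, do the work, wait until the entry reaches the head,
+// then remove it to wake the next waiter:
+//
+//   decode_q_add(&q, &ent);
+//   // ... submit / perform the decode ...
+//   decode_q_wait(&q, &ent);    // blocks until ent is at the head of q
+//   // ... in-order completion work ...
+//   decode_q_remove(&q, &ent);  // wakes the next waiter, if any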
+
+#endif
+
diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c
new file mode 100644
index 0000000000..cfa94d55c4
--- /dev/null
+++ b/libavcodec/v4l2_req_devscan.c
@@ -0,0 +1,449 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <libudev.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/sysmacros.h>
+
+#include <linux/media.h>
+#include <linux/videodev2.h>
+
+#include "v4l2_req_devscan.h"
+#include "v4l2_req_utils.h"
+
+struct decdev {
+ enum v4l2_buf_type src_type;
+ uint32_t src_fmt_v4l2;
+ const char * vname;
+ const char * mname;
+};
+
+struct devscan {
+ struct decdev env;
+ unsigned int dev_size;
+ unsigned int dev_count;
+ struct decdev *devs;
+};
+
+static int video_src_pixfmt_supported(uint32_t fmt)
+{
+ return 1;
+}
+
+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
+ unsigned int width, unsigned int height,
+ unsigned int pixelformat)
+{
+ unsigned int sizeimage;
+
+ memset(format, 0, sizeof(*format));
+ format->type = type;
+
+ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
+ format->fmt.pix_mp.width = width;
+ format->fmt.pix_mp.height = height;
+ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
+ format->fmt.pix_mp.pixelformat = pixelformat;
+ } else {
+ format->fmt.pix.width = width;
+ format->fmt.pix.height = height;
+ format->fmt.pix.sizeimage = sizeimage;
+ format->fmt.pix.pixelformat = pixelformat;
+ }
+}
+
+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
+ unsigned int width, unsigned int height)
+{
+ struct v4l2_format format;
+
+ v4l2_setup_format(&format, type, width, height, pixelformat);
+
+ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
+}
+
+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
+{
+ struct v4l2_capability capability = { 0 };
+ int rc;
+
+ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
+ if (rc < 0)
+ return -errno;
+
+ if (capabilities != NULL) {
+ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
+ *capabilities = capability.device_caps;
+ else
+ *capabilities = capability.capabilities;
+ }
+
+ return 0;
+}
+
+static int devscan_add(struct devscan *const scan,
+ enum v4l2_buf_type src_type,
+ uint32_t src_fmt_v4l2,
+ const char * vname,
+ const char * mname)
+{
+ struct decdev *d;
+
+ if (scan->dev_size <= scan->dev_count) {
+ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2;
+ d = realloc(scan->devs, n * sizeof(*d));
+ if (!d)
+ return -ENOMEM;
+ scan->devs = d;
+ scan->dev_size = n;
+ }
+
+ d = scan->devs + scan->dev_count;
+ d->src_type = src_type;
+ d->src_fmt_v4l2 = src_fmt_v4l2;
+ d->vname = strdup(vname);
+ if (!d->vname)
+ return -ENOMEM;
+ d->mname = strdup(mname);
+ if (!d->mname) {
+ free((char *)d->vname);
+ return -ENOMEM;
+ }
+ ++scan->dev_count;
+ return 0;
+}
+
+void devscan_delete(struct devscan **const pScan)
+{
+ unsigned int i;
+ struct devscan * const scan = *pScan;
+
+ if (!scan)
+ return;
+ *pScan = NULL;
+
+ for (i = 0; i < scan->dev_count; ++i) {
+ free((char*)scan->devs[i].mname);
+ free((char*)scan->devs[i].vname);
+ }
+ free(scan->devs);
+ free(scan);
+}
+
+#define REQ_BUF_CAPS (\
+ V4L2_BUF_CAP_SUPPORTS_DMABUF |\
+ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
+ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
+
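+// Enumerate the OUTPUT (bitstream) side formats for the given queue type and
+// record, via devscan_add(), those whose buffer queue reports the DMABUF,
+// request and M2M-hold-capture-buf capabilities needed here (REQ_BUF_CAPS).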
+static void probe_formats(void * const dc,
+ struct devscan *const scan,
+ const int fd,
+ const unsigned int type_v4l2,
+ const char *const mpath,
+ const char *const vpath)
+{
+ unsigned int i;
+ for (i = 0;; ++i) {
+ struct v4l2_fmtdesc fmtdesc = {
+ .index = i,
+ .type = type_v4l2
+ };
+ struct v4l2_requestbuffers rbufs = {
+ .count = 0,
+ .type = type_v4l2,
+ .memory = V4L2_MEMORY_MMAP
+ };
+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
+ if (errno == EINTR)
+ continue;
+ if (errno != EINVAL)
+ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
+ return;
+ }
+ if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
+ continue;
+
+ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
+ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
+ continue;
+ }
+
+ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {
+ if (errno != EINTR) {
+ request_debug(dc, "%s: Reqbufs failed\n", vpath);
+ continue;
+ }
+ }
+
+ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
+ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
+ continue;
+ }
+
+ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
+ mpath, vpath, fmtdesc.pixelformat, type_v4l2);
+ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);
+ }
+}
+
+
+static int probe_video_device(void * const dc,
+ struct udev_device *const device,
+ struct devscan *const scan,
+ const char *const mpath)
+{
+ int ret;
+ unsigned int capabilities = 0;
+ int video_fd = -1;
+
+ const char *path = udev_device_get_devnode(device);
+ if (!path) {
+ request_err(dc, "%s: get video device devnode failed\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ video_fd = open(path, O_RDWR, 0);
+ if (video_fd == -1) {
+ ret = -errno;
+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
+ goto fail;
+ }
+
+ ret = v4l2_query_capabilities(video_fd, &capabilities);
+ if (ret < 0) {
+ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
+
+ if (!(capabilities & V4L2_CAP_STREAMING)) {
+ request_debug(dc, "%s: missing required streaming capability\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
+ request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Should check capture formats too... */
+ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
+ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
+
+ close(video_fd);
+ return 0;
+
+fail:
+ if (video_fd >= 0)
+ close(video_fd);
+ return ret;
+}
+
+static int probe_media_device(void * const dc,
+ struct udev_device *const device,
+ struct devscan *const scan)
+{
+ int ret;
+ int rv;
+ struct media_device_info device_info = { 0 };
+ struct media_v2_topology topology = { 0 };
+ struct media_v2_interface *interfaces = NULL;
+ struct udev *udev = udev_device_get_udev(device);
+ struct udev_device *video_device;
+ dev_t devnum;
+ int media_fd = -1;
+
+ const char *path = udev_device_get_devnode(device);
+ if (!path) {
+ request_err(dc, "%s: get media device devnode failed\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ media_fd = open(path, O_RDWR, 0);
+ if (media_fd < 0) {
+ ret = -errno;
+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
+ if (rv < 0) {
+ ret = -errno;
+ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
+ if (rv < 0) {
+ ret = -errno;
+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ if (topology.num_interfaces <= 0) {
+ request_err(dc, "%s: media device has no interfaces\n", __func__);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
+ if (!interfaces) {
+ request_err(dc, "%s: allocating media interface struct failed\n", __func__);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
+ if (rv < 0) {
+ ret = -errno;
+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
+ goto fail;
+ }
+
+ for (int i = 0; i < topology.num_interfaces; i++) {
+ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
+ continue;
+
+ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
+ video_device = udev_device_new_from_devnum(udev, 'c', devnum);
+ if (!video_device) {
+ ret = -errno;
+ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
+ continue;
+ }
+
+ ret = probe_video_device(dc, video_device, scan, path);
+ udev_device_unref(video_device);
+
+ if (ret != 0)
+ goto fail;
+ }
+
+fail:
+ free(interfaces);
+ if (media_fd != -1)
+ close(media_fd);
+ return ret;
+}
+
+const char *decdev_media_path(const struct decdev *const dev)
+{
+ return !dev ? NULL : dev->mname;
+}
+
+const char *decdev_video_path(const struct decdev *const dev)
+{
+ return !dev ? NULL : dev->vname;
+}
+
+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
+{
+ return !dev ? 0 : dev->src_type;
+}
+
+uint32_t decdev_src_pixelformat(const struct decdev *const dev)
+{
+ return !dev ? 0 : dev->src_fmt_v4l2;
+}
+
+
+const struct decdev *devscan_find(struct devscan *const scan,
+ const uint32_t src_fmt_v4l2)
+{
+ unsigned int i;
+
+ if (scan->env.mname && scan->env.vname)
+ return &scan->env;
+
+ if (!src_fmt_v4l2)
+ return scan->dev_count ? scan->devs + 0 : NULL;
+
+ for (i = 0; i != scan->dev_count; ++i) {
+ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
+ return scan->devs + i;
+ }
+ return NULL;
+}
+
+int devscan_build(void * const dc, struct devscan **pscan)
+{
+ int ret;
+ struct udev *udev;
+ struct udev_enumerate *enumerate;
+ struct udev_list_entry *devices;
+ struct udev_list_entry *entry;
+ struct udev_device *device;
+ struct devscan * scan;
+
+ *pscan = NULL;
+
+ scan = calloc(1, sizeof(*scan));
+ if (!scan) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
+ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
+ if (scan->env.mname && scan->env.vname) {
+ request_info(dc, "Media/video device env overrides found: %s,%s\n",
+ scan->env.mname, scan->env.vname);
+ *pscan = scan;
+ return 0;
+ }
+
+ udev = udev_new();
+ if (!udev) {
+ request_err(dc, "%s: allocating udev context failed\n", __func__);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ enumerate = udev_enumerate_new(udev);
+ if (!enumerate) {
+ request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ udev_enumerate_add_match_subsystem(enumerate, "media");
+ udev_enumerate_scan_devices(enumerate);
+
+ devices = udev_enumerate_get_list_entry(enumerate);
+ udev_list_entry_foreach(entry, devices) {
+ const char *path = udev_list_entry_get_name(entry);
+ if (!path)
+ continue;
+
+ device = udev_device_new_from_syspath(udev, path);
+ if (!device)
+ continue;
+
+ probe_media_device(dc, device, scan);
+ udev_device_unref(device);
+ }
+
+ udev_enumerate_unref(enumerate);
+
+ *pscan = scan;
+ return 0;
+
+fail:
+ udev_unref(udev);
+ devscan_delete(&scan);
+ return ret;
+}
+
diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
new file mode 100644
index 0000000000..956d9234f1
--- /dev/null
+++ b/libavcodec/v4l2_req_devscan.h
@@ -0,0 +1,23 @@
+#ifndef _DEVSCAN_H_
+#define _DEVSCAN_H_
+
+#include <stdint.h>
+
+struct devscan;
+struct decdev;
+enum v4l2_buf_type;
+
+/* These return pointers to data in the devscan structure and so are valid
+ * for the lifetime of that structure
+ */
+const char *decdev_media_path(const struct decdev *const dev);
+const char *decdev_video_path(const struct decdev *const dev);
+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
+uint32_t decdev_src_pixelformat(const struct decdev *const dev);
+
+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
+
+int devscan_build(void * const dc, struct devscan **pscan);
+void devscan_delete(struct devscan **const pScan);
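+
+// Usage sketch (open_decoder and logctx are hypothetical caller-side names):
+//
+//   struct devscan *scan = NULL;
+//   if (devscan_build(logctx, &scan) == 0) {
+//       const struct decdev *dev = devscan_find(scan, V4L2_PIX_FMT_HEVC_SLICE);
+//       if (dev)
+//           open_decoder(decdev_video_path(dev), decdev_media_path(dev));
+//       devscan_delete(&scan);
+//   }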
+
+#endif
diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
new file mode 100644
index 0000000000..ae6c648369
--- /dev/null
+++ b/libavcodec/v4l2_req_dmabufs.c
@@ -0,0 +1,266 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-heap.h>
+
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_utils.h"
+
+#define DMABUF_NAME1 "/dev/dma_heap/linux,cma"
+#define DMABUF_NAME2 "/dev/dma_heap/reserved"
+
+#define TRACE_ALLOC 0
+
+struct dmabufs_ctl {
+ int fd;
+ size_t page_size;
+};
+
+struct dmabuf_h {
+ int fd;
+ size_t size;
+ size_t len;
+ void * mapptr;
+};
+
+#if TRACE_ALLOC
+static unsigned int total_bufs = 0;
+static size_t total_size = 0;
+#endif
+
+struct dmabuf_h * dmabuf_import(int fd, size_t size)
+{
+ struct dmabuf_h *dh;
+
+ fd = dup(fd);
+ if (fd < 0 || size == 0)
+ return NULL;
+
+ dh = malloc(sizeof(*dh));
+ if (!dh) {
+ close(fd);
+ return NULL;
+ }
+
+ *dh = (struct dmabuf_h) {
+ .fd = fd,
+ .size = size,
+ .mapptr = MAP_FAILED
+ };
+
+#if TRACE_ALLOC
+ ++total_bufs;
+ total_size += dh->size;
+ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
+#endif
+
+ return dh;
+}
+
+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
+{
+ struct dmabuf_h * dh;
+ struct dma_heap_allocation_data data = {
+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
+ .fd = 0,
+ .fd_flags = O_RDWR,
+ .heap_flags = 0
+ };
+
+ if (old != NULL) {
+ if (old->size == data.len) {
+ return old;
+ }
+ dmabuf_free(old);
+ }
+
+ if (size == 0 ||
+ (dh = malloc(sizeof(*dh))) == NULL)
+ return NULL;
+
+ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
+ int err = errno;
+ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
+ (uint64_t)data.len,
+ dbsc->fd,
+ err,
+ strerror(err));
+ if (err == EINTR)
+ continue;
+ goto fail;
+ }
+
+ *dh = (struct dmabuf_h){
+ .fd = data.fd,
+ .size = (size_t)data.len,
+ .mapptr = MAP_FAILED
+ };
+
+#if TRACE_ALLOC
+ ++total_bufs;
+ total_size += dh->size;
+ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
+#endif
+
+ return dh;
+
+fail:
+ free(dh);
+ return NULL;
+}
+
+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
+{
+ struct dma_buf_sync sync = {
+ .flags = flags
+ };
+ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
+ const int err = errno;
+ if (errno == EINTR)
+ continue;
+ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
+ return -err;
+ }
+ return 0;
+}
+
+int dmabuf_write_start(struct dmabuf_h * const dh)
+{
+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
+}
+
+int dmabuf_write_end(struct dmabuf_h * const dh)
+{
+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
+}
+
+int dmabuf_read_start(struct dmabuf_h * const dh)
+{
+ if (!dmabuf_map(dh))
+ return -1;
+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
+}
+
+int dmabuf_read_end(struct dmabuf_h * const dh)
+{
+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
+}
+
+
+void * dmabuf_map(struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return NULL;
+ if (dh->mapptr != MAP_FAILED)
+ return dh->mapptr;
+ dh->mapptr = mmap(NULL, dh->size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ dh->fd, 0);
+ if (dh->mapptr == MAP_FAILED) {
+ request_log("%s: Map failed\n", __func__);
+ return NULL;
+ }
+ return dh->mapptr;
+}
+
+int dmabuf_fd(const struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return -1;
+ return dh->fd;
+}
+
+size_t dmabuf_size(const struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return 0;
+ return dh->size;
+}
+
+size_t dmabuf_len(const struct dmabuf_h * const dh)
+{
+ if (!dh)
+ return 0;
+ return dh->len;
+}
+
+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
+{
+ dh->len = len;
+}
+
+
+
+void dmabuf_free(struct dmabuf_h * dh)
+{
+ if (!dh)
+ return;
+
+#if TRACE_ALLOC
+ --total_bufs;
+ total_size -= dh->size;
+ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
+#endif
+
+ if (dh->mapptr != MAP_FAILED)
+ munmap(dh->mapptr, dh->size);
+ while (close(dh->fd) == -1 && errno == EINTR)
+ /* loop */;
+ free(dh);
+}
+
+struct dmabufs_ctl * dmabufs_ctl_new(void)
+{
+ struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
+
+ if (!dbsc)
+ return NULL;
+
+ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
+ errno == EINTR)
+ /* Loop */;
+
+ if (dbsc->fd == -1) {
+ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
+ errno == EINTR)
+ /* Loop */;
+ if (dbsc->fd == -1) {
+ request_log("Unable to open either %s or %s\n",
+ DMABUF_NAME1, DMABUF_NAME2);
+ goto fail;
+ }
+ }
+
+ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
+
+ return dbsc;
+
+fail:
+ free(dbsc);
+ return NULL;
+}
+
+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
+{
+ struct dmabufs_ctl * const dbsc = *pDbsc;
+
+ if (!dbsc)
+ return;
+ *pDbsc = NULL;
+
+ while (close(dbsc->fd) == -1 && errno == EINTR)
+ /* loop */;
+
+ free(dbsc);
+}
+
+
diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
new file mode 100644
index 0000000000..cfb17e801d
--- /dev/null
+++ b/libavcodec/v4l2_req_dmabufs.h
@@ -0,0 +1,40 @@
+#ifndef DMABUFS_H
+#define DMABUFS_H
+
+#include <stddef.h>
+
+struct dmabufs_ctl;
+struct dmabuf_h;
+
+struct dmabufs_ctl * dmabufs_ctl_new(void);
+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
+
+// Need not preserve old contents
+// On NULL return old buffer is freed
+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
+
+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
+ return dmabuf_realloc(dbsc, NULL, size);
+}
+/* Create from existing fd - dups(fd) */
+struct dmabuf_h * dmabuf_import(int fd, size_t size);
+void * dmabuf_map(struct dmabuf_h * const dh);
+
+/* flags from linux/dma-buf.h DMA_BUF_SYNC_xxx */
+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
+
+int dmabuf_write_start(struct dmabuf_h * const dh);
+int dmabuf_write_end(struct dmabuf_h * const dh);
+int dmabuf_read_start(struct dmabuf_h * const dh);
+int dmabuf_read_end(struct dmabuf_h * const dh);
+
+int dmabuf_fd(const struct dmabuf_h * const dh);
+/* Allocated size */
+size_t dmabuf_size(const struct dmabuf_h * const dh);
+/* Bytes in use */
+size_t dmabuf_len(const struct dmabuf_h * const dh);
+/* Set bytes in use */
+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
+void dmabuf_free(struct dmabuf_h * dh);
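+
+// Usage sketch (an assumption based on the sync wrappers above, not a
+// documented requirement): CPU writes into a mapped buffer are bracketed by
+// the write sync calls so the kernel can do any cache maintenance needed:
+//
+//   void * p = dmabuf_map(dh);
+//   if (p) {
+//       dmabuf_write_start(dh);
+//       memcpy(p, src, n);       // src/n are the caller's payload
+//       dmabuf_len_set(dh, n);
+//       dmabuf_write_end(dh);
+//   }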
+
+#endif
diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c
new file mode 100644
index 0000000000..169b532832
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v1.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 1
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c
new file mode 100644
index 0000000000..42af98e156
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v2.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 2
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
new file mode 100644
index 0000000000..0ae03b10c4
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -0,0 +1,1213 @@
+// File included by v4l2_req_hevc_v* - not compiled on its own
+
+#include "decode.h"
+#include "hevcdec.h"
+#include "hwconfig.h"
+#include "internal.h"
+#include "thread.h"
+
+#include "v4l2_request_hevc.h"
+
+#if HEVC_CTRLS_VERSION == 1
+#include "hevc-ctrls-v1.h"
+
+// Fixup renamed entries
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
+
+#elif HEVC_CTRLS_VERSION == 2
+#include "hevc-ctrls-v2.h"
+#else
+#error Unknown HEVC_CTRLS_VERSION
+#endif
+
+#include "libavutil/hwcontext_drm.h"
+
+#include <semaphore.h>
+#include <pthread.h>
+
+#include "v4l2_req_devscan.h"
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_media.h"
+#include "v4l2_req_utils.h"
+
+// Attached to buf[0] in frame
+// Pooled in hwcontext so generally create once - 1/frame
+typedef struct V4L2MediaReqDescriptor {
+ AVDRMFrameDescriptor drm;
+
+ // Media
+ uint64_t timestamp;
+ struct qent_dst * qe_dst;
+
+ // Decode only - should be NULL by the time we emit the frame
+ struct req_decode_ent decode_ent;
+
+ struct media_request *req;
+ struct qent_src *qe_src;
+
+#if HEVC_CTRLS_VERSION >= 2
+ struct v4l2_ctrl_hevc_decode_params dec;
+#endif
+
+ size_t num_slices;
+ size_t alloced_slices;
+ struct v4l2_ctrl_hevc_slice_params * slice_params;
+ struct slice_info * slices;
+
+} V4L2MediaReqDescriptor;
+
+struct slice_info {
+ const uint8_t * ptr;
+ size_t len; // bytes
+};
+
+// Handy container for accumulating controls before setting
+struct req_controls {
+ int has_scaling;
+ struct timeval tv;
+ struct v4l2_ctrl_hevc_sps sps;
+ struct v4l2_ctrl_hevc_pps pps;
+ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
+};
+
+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
+
+
+// Get an FFmpeg format from the v4l2 format
+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
+{
+ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
+ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) {
+ case V4L2_PIX_FMT_YUV420:
+ return AV_PIX_FMT_YUV420P;
+ case V4L2_PIX_FMT_NV12:
+ return AV_PIX_FMT_NV12;
+#if CONFIG_SAND
+ case V4L2_PIX_FMT_NV12_COL128:
+ return AV_PIX_FMT_RPI4_8;
+ case V4L2_PIX_FMT_NV12_10_COL128:
+ return AV_PIX_FMT_RPI4_10;
+#endif
+ default:
+ break;
+ }
+ return AV_PIX_FMT_NONE;
+}
+
+static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
+{
+ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
+ return rd->timestamp;
+}
+
+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
+{
+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
+ rd->timestamp = dpb_stamp;
+}
+
+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
+{
+ int32_t luma_weight_denom, chroma_weight_denom;
+ const SliceHeader *sh = &h->sh;
+
+ if (sh->slice_type == HEVC_SLICE_I ||
+ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
+ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
+ return;
+
+ table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
+
+ if (h->ps.sps->chroma_format_idc)
+ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
+
+ luma_weight_denom = (1 << sh->luma_log2_weight_denom);
+ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
+
+ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
+ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
+ table->luma_offset_l0[i] = sh->luma_offset_l0[i];
+ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
+ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
+ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
+ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
+ }
+
+ if (sh->slice_type != HEVC_SLICE_B)
+ return;
+
+ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
+ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
+ table->luma_offset_l1[i] = sh->luma_offset_l1[i];
+ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
+ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
+ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
+ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
+ }
+}
+
+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
+{
+ const HEVCFrame *frame;
+ int i;
+
+ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
+ frame = h->rps[ST_CURR_BEF].ref[i];
+ if (frame && timestamp == frame_capture_dpb(frame->frame))
+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE;
+ }
+
+ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
+ frame = h->rps[ST_CURR_AFT].ref[i];
+ if (frame && timestamp == frame_capture_dpb(frame->frame))
+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER;
+ }
+
+ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) {
+ frame = h->rps[LT_CURR].ref[i];
+ if (frame && timestamp == frame_capture_dpb(frame->frame))
+ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
+ }
+
+ return 0;
+}
+
+static unsigned int
+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
+ const struct v4l2_hevc_dpb_entry * const entries,
+ const unsigned int num_entries)
+{
+ uint64_t timestamp;
+
+ if (!frame)
+ return 0;
+
+ timestamp = frame_capture_dpb(frame->frame);
+
+ for (unsigned int i = 0; i < num_entries; i++) {
+ if (entries[i].timestamp == timestamp)
+ return i;
+ }
+
+ return 0;
+}
+
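+// Step idx bytes forward through escaped NAL data, skipping the 0x03
+// emulation-prevention byte that follows any 00 00 pair - i.e. map an offset
+// counted in unescaped (RBSP) bytes to the matching position in the raw
+// bitstream.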
+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
+{
+ unsigned int z = 0;
+ while (idx--) {
+ if (*b++ == 0) {
+ ++z;
+ if (z >= 2 && *b == 3) {
+ ++b;
+ z = 0;
+ }
+ }
+ else {
+ z = 0;
+ }
+ }
+ return b;
+}
+
+static int slice_add(V4L2MediaReqDescriptor * const rd)
+{
+ if (rd->num_slices >= rd->alloced_slices) {
+ struct v4l2_ctrl_hevc_slice_params * p2;
+ struct slice_info * s2;
+ size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2;
+
+ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
+ if (p2 == NULL)
+ return AVERROR(ENOMEM);
+ rd->slice_params = p2;
+
+ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
+ if (s2 == NULL)
+ return AVERROR(ENOMEM);
+ rd->slices = s2;
+
+ rd->alloced_slices = n2;
+ }
+ ++rd->num_slices;
+ return 0;
+}
+
+static unsigned int
+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
+{
+ unsigned int i;
+ unsigned int n = 0;
+ const HEVCFrame * const pic = h->ref;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
+ const HEVCFrame * const frame = &h->DPB[i];
+ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
+ struct v4l2_hevc_dpb_entry * const entry = entries + n++;
+
+ entry->timestamp = frame_capture_dpb(frame->frame);
+ entry->rps = find_frame_rps_type(h, entry->timestamp);
+ entry->field_pic = frame->frame->interlaced_frame;
+
+ /* TODO: Interleaved: Get the POC for each field. */
+ entry->pic_order_cnt[0] = frame->poc;
+ entry->pic_order_cnt[1] = frame->poc;
+ }
+ }
+ return n;
+}
+
+static void fill_slice_params(const HEVCContext * const h,
+#if HEVC_CTRLS_VERSION >= 2
+ const struct v4l2_ctrl_hevc_decode_params * const dec,
+#endif
+ struct v4l2_ctrl_hevc_slice_params *slice_params,
+ uint32_t bit_size, uint32_t bit_offset)
+{
+ const SliceHeader * const sh = &h->sh;
+#if HEVC_CTRLS_VERSION >= 2
+ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
+ const unsigned int dpb_n = dec->num_active_dpb_entries;
+#else
+ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
+ unsigned int dpb_n;
+#endif
+ unsigned int i;
+ RefPicList *rpl;
+
+ *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
+ .bit_size = bit_size,
+ .data_bit_offset = bit_offset,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ .slice_segment_addr = sh->slice_segment_addr,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ .nal_unit_type = h->nal_unit_type,
+ .nuh_temporal_id_plus1 = h->temporal_id + 1,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ .slice_type = sh->slice_type,
+ .colour_plane_id = sh->colour_plane_id,
+ .slice_pic_order_cnt = h->ref->poc,
+ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
+ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
+ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
+ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
+ .slice_qp_delta = sh->slice_qp_delta,
+ .slice_cb_qp_offset = sh->slice_cb_qp_offset,
+ .slice_cr_qp_offset = sh->slice_cr_qp_offset,
+ .slice_act_y_qp_offset = 0,
+ .slice_act_cb_qp_offset = 0,
+ .slice_act_cr_qp_offset = 0,
+ .slice_beta_offset_div2 = sh->beta_offset / 2,
+ .slice_tc_offset_div2 = sh->tc_offset / 2,
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ .pic_struct = h->sei.picture_timing.picture_struct,
+
+#if HEVC_CTRLS_VERSION < 2
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
+ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
+ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
+#endif
+ };
+
+ if (sh->slice_sample_adaptive_offset_flag[0])
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
+
+ if (sh->slice_sample_adaptive_offset_flag[1])
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
+
+ if (sh->slice_temporal_mvp_enabled_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
+
+ if (sh->mvd_l1_zero_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
+
+ if (sh->cabac_init_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
+
+ if (sh->collocated_list == L0)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
+
+ if (sh->disable_deblocking_filter_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
+
+ if (sh->slice_loop_filter_across_slices_enabled_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
+
+ if (sh->dependent_slice_segment_flag)
+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
+
+#if HEVC_CTRLS_VERSION < 2
+ dpb_n = fill_dpb_entries(h, dpb);
+ slice_params->num_active_dpb_entries = dpb_n;
+#endif
+
+ if (sh->slice_type != HEVC_SLICE_I) {
+ rpl = &h->ref->refPicList[0];
+ for (i = 0; i < rpl->nb_refs; i++)
+ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
+ }
+
+ if (sh->slice_type == HEVC_SLICE_B) {
+ rpl = &h->ref->refPicList[1];
+ for (i = 0; i < rpl->nb_refs; i++)
+ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
+ }
+
+ fill_pred_table(h, &slice_params->pred_weight_table);
+
+ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
+ if (slice_params->num_entry_point_offsets > 256) {
+ slice_params->num_entry_point_offsets = 256;
+ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
+ }
+
+ for (i = 0; i < slice_params->num_entry_point_offsets; i++)
+ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
+}
+
+#if HEVC_CTRLS_VERSION >= 2
+static void
+fill_decode_params(const HEVCContext * const h,
+ struct v4l2_ctrl_hevc_decode_params * const dec)
+{
+ unsigned int i;
+
+ *dec = (struct v4l2_ctrl_hevc_decode_params){
+ .pic_order_cnt_val = h->poc,
+ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
+ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
+ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
+ };
+
+ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
+
+    // The documentation does seem to ask that we fit our 32-bit signed POC
+    // into a U8 so... (to be fair, 16 bits would be enough)
+    // Luckily we (Pi) don't use these fields
+ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
+ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
+ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
+ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
+ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
+ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
+
+ if (IS_IRAP(h))
+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
+ if (IS_IDR(h))
+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
+ if (h->sh.no_output_of_prior_pics_flag)
+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
+
+}
+#endif
+
+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
+{
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ *ctrl = (struct v4l2_ctrl_hevc_sps) {
+ .chroma_format_idc = sps->chroma_format_idc,
+ .pic_width_in_luma_samples = sps->width,
+ .pic_height_in_luma_samples = sps->height,
+ .bit_depth_luma_minus8 = sps->bit_depth - 8,
+ .bit_depth_chroma_minus8 = sps->bit_depth - 8,
+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
+ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
+ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
+ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
+ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
+ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
+ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
+ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
+ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
+ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
+ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
+ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
+ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
+ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
+ .num_short_term_ref_pic_sets = sps->nb_st_rps,
+ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
+ .chroma_format_idc = sps->chroma_format_idc,
+ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
+ };
+
+ if (sps->separate_colour_plane_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
+
+ if (sps->scaling_list_enable_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
+
+ if (sps->amp_enabled_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
+
+ if (sps->sao_enabled)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
+
+ if (sps->pcm_enabled_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
+
+ if (sps->pcm.loop_filter_disable_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
+
+ if (sps->long_term_ref_pics_present_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
+
+ if (sps->sps_temporal_mvp_enabled_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
+
+ if (sps->sps_strong_intra_smoothing_enable_flag)
+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
+}
+
+static void fill_scaling_matrix(const ScalingList * const sl,
+ struct v4l2_ctrl_hevc_scaling_matrix * const sm)
+{
+ unsigned int i;
+
+ for (i = 0; i < 6; i++) {
+ unsigned int j;
+
+ for (j = 0; j < 16; j++)
+ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
+ for (j = 0; j < 64; j++) {
+ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j];
+ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
+ if (i < 2)
+ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
+ }
+ sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
+ if (i < 2)
+ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
+ }
+}
+
+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
+{
+ uint64_t flags = 0;
+
+ if (pps->dependent_slice_segments_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
+
+ if (pps->output_flag_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
+
+ if (pps->sign_data_hiding_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
+
+ if (pps->cabac_init_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
+
+ if (pps->constrained_intra_pred_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
+
+ if (pps->transform_skip_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
+
+ if (pps->cu_qp_delta_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
+
+ if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
+
+ if (pps->weighted_pred_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
+
+ if (pps->weighted_bipred_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
+
+ if (pps->transquant_bypass_enable_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
+
+ if (pps->tiles_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
+
+ if (pps->entropy_coding_sync_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
+
+ if (pps->loop_filter_across_tiles_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
+
+ if (pps->seq_loop_filter_across_slices_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
+
+ if (pps->deblocking_filter_override_enabled_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
+
+ if (pps->disable_dbf)
+ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
+
+ if (pps->lists_modification_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
+
+ if (pps->slice_header_extension_present_flag)
+ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ *ctrl = (struct v4l2_ctrl_hevc_pps) {
+ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
+ .init_qp_minus26 = pps->pic_init_qp_minus26,
+ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
+ .pps_cb_qp_offset = pps->cb_qp_offset,
+ .pps_cr_qp_offset = pps->cr_qp_offset,
+ .pps_beta_offset_div2 = pps->beta_offset / 2,
+ .pps_tc_offset_div2 = pps->tc_offset / 2,
+ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
+ .flags = flags
+ };
+
+
+ if (pps->tiles_enabled_flag) {
+ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
+ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
+
+ for (int i = 0; i < pps->num_tile_columns; i++)
+ ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
+
+ for (int i = 0; i < pps->num_tile_rows; i++)
+ ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
+ }
+}
+
+// Called before finally returning the frame to the user
+// Set corrupt flag here as this is actually the frame structure that
+// is going to the user (in MT land each thread has its own pool)
+static int frame_post_process(void *logctx, AVFrame *frame)
+{
+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0];
+
+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
+ frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
+ if (rd->qe_dst) {
+ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst);
+ if (stat != MEDIABUFS_STATUS_SUCCESS) {
+ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
+ frame->flags |= AV_FRAME_FLAG_CORRUPT;
+ }
+ }
+
+ return 0;
+}
+
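+// DPB timestamps are held as 64-bit nanosecond values; V4L2 buffer timestamps
+// are a struct timeval (sec/usec), so conversion is a factor of 1000.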
+static inline struct timeval cvt_dpb_to_tv(uint64_t t)
+{
+ t /= 1000;
+ return (struct timeval){
+ .tv_usec = t % 1000000,
+ .tv_sec = t / 1000000
+ };
+}
+
+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
+{
+ return (uint64_t)t * 1000;
+}
+
+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
+ av_unused const uint8_t *buffer,
+ av_unused uint32_t size)
+{
+ const HEVCContext *h = avctx->priv_data;
+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
+ decode_q_add(&ctx->decode_q, &rd->decode_ent);
+
+ rd->num_slices = 0;
+ ctx->timestamp++;
+ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
+
+ {
+ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
+ fdd->post_process = frame_post_process;
+ }
+
+    // qe_dst needs to be bound to the data buffer and only returned when that is freed
+ if (!rd->qe_dst)
+ {
+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+ }
+
+    ff_thread_finish_setup(avctx); // Allow next thread to enter v4l2_request_hevc_start_frame
+
+ return 0;
+}
+
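+// Build the AVDRMFrameDescriptor (format, modifier and plane layout) from the
+// V4L2 capture format.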
+// Object fd & size will be zapped by this & need setting later
+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
+{
+ AVDRMLayerDescriptor *layer = &desc->layers[0];
+ unsigned int width;
+ unsigned int height;
+ unsigned int bpl;
+ uint32_t pixelformat;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
+ width = format->fmt.pix_mp.width;
+ height = format->fmt.pix_mp.height;
+ pixelformat = format->fmt.pix_mp.pixelformat;
+ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline;
+ }
+ else {
+ width = format->fmt.pix.width;
+ height = format->fmt.pix.height;
+ pixelformat = format->fmt.pix.pixelformat;
+ bpl = format->fmt.pix.bytesperline;
+ }
+
+ switch (pixelformat) {
+ case V4L2_PIX_FMT_NV12:
+ layer->format = DRM_FORMAT_NV12;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#if CONFIG_SAND
+ case V4L2_PIX_FMT_NV12_COL128:
+ layer->format = DRM_FORMAT_NV12;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
+ break;
+ case V4L2_PIX_FMT_NV12_10_COL128:
+ layer->format = DRM_FORMAT_P030;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
+ break;
+#endif
+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
+ case V4L2_PIX_FMT_SUNXI_TILED_NV12:
+ layer->format = DRM_FORMAT_NV12;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
+ break;
+#endif
+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
+ case V4L2_PIX_FMT_NV15:
+ layer->format = DRM_FORMAT_NV15;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#endif
+ case V4L2_PIX_FMT_NV16:
+ layer->format = DRM_FORMAT_NV16;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
+ case V4L2_PIX_FMT_NV20:
+ layer->format = DRM_FORMAT_NV20;
+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ desc->nb_objects = 1;
+ desc->objects[0].fd = -1;
+ desc->objects[0].size = 0;
+
+ desc->nb_layers = 1;
+ layer->nb_planes = 2;
+
+ layer->planes[0].object_index = 0;
+ layer->planes[0].offset = 0;
+ layer->planes[0].pitch = bpl;
+#if CONFIG_SAND
+ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = height * 128;
+ layer->planes[0].pitch = width;
+ layer->planes[1].pitch = width;
+ }
+ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = height * 128;
+ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
+ layer->planes[1].pitch = width * 2;
+ }
+ else
+#endif
+ {
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = layer->planes[0].pitch * height;
+ layer->planes[1].pitch = layer->planes[0].pitch;
+ }
+
+ return 0;
+}
+
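+// Attach the SPS, PPS, (V2+) decode params, slice params and, when present,
+// the scaling matrix controls to the media request.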
+static int
+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
+ struct req_controls *const controls,
+#if HEVC_CTRLS_VERSION >= 2
+ struct v4l2_ctrl_hevc_decode_params * const dec,
+#endif
+ struct v4l2_ctrl_hevc_slice_params * const slices,
+ const unsigned int slice_no,
+ const unsigned int slice_count)
+{
+ int rv;
+
+ struct v4l2_ext_control control[] = {
+ {
+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
+ .ptr = &controls->sps,
+ .size = sizeof(controls->sps),
+ },
+ {
+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS,
+ .ptr = &controls->pps,
+ .size = sizeof(controls->pps),
+ },
+#if HEVC_CTRLS_VERSION >= 2
+ {
+ .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
+ .ptr = dec,
+ .size = sizeof(*dec),
+ },
+#endif
+ {
+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
+ .ptr = slices + slice_no,
+ .size = sizeof(*slices) * slice_count,
+ },
+ // Optional
+ {
+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
+ .ptr = &controls->scaling_matrix,
+ .size = sizeof(controls->scaling_matrix),
+ },
+ };
+
+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control,
+ controls->has_scaling ?
+ FF_ARRAY_ELEMS(control) :
+ FF_ARRAY_ELEMS(control) - 1);
+
+ return rv;
+}
+
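+// Per-slice hook: record the slice data, extend slice 0 to cover this slice
+// in multi-slice mode, and fill in the slice parameter (and, on the first
+// slice with V2+ controls, decode parameter) structures.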
+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+ const HEVCContext * const h = avctx->priv_data;
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
+ int bcount = get_bits_count(&h->HEVClc->gb);
+ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
+
+ int rv;
+ struct slice_info * si;
+
+ if ((rv = slice_add(rd)) != 0)
+ return rv;
+
+ si = rd->slices + rd->num_slices - 1;
+ si->ptr = buffer;
+ si->len = size;
+
+ if (ctx->multi_slice && rd->num_slices > 1) {
+ struct slice_info *const si0 = rd->slices;
+ const size_t offset = (buffer - si0->ptr);
+ boff += offset * 8;
+ size += offset;
+ si0->len = si->len + offset;
+ }
+
+#if HEVC_CTRLS_VERSION >= 2
+ if (rd->num_slices == 1)
+ fill_decode_params(h, &rd->dec);
+ fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff);
+#else
+ fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff);
+#endif
+
+ return 0;
+}
+
+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
+{
+ const HEVCContext * const h = avctx->priv_data;
+ if (h->ref != NULL) {
+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+ media_request_abort(&rd->req);
+ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
+
+ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
+ }
+}
+
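+// Build and start one media request: attach the controls, copy the slice data
+// into a src buffer and queue it. The dst buffer is only attached to the
+// request that carries slice 0.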
+static int send_slice(AVCodecContext * const avctx,
+ V4L2MediaReqDescriptor * const rd,
+ struct req_controls *const controls,
+ const unsigned int i, const unsigned int j)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+ struct slice_info *const si = rd->slices + i;
+ struct media_request * req = NULL;
+ struct qent_src * src = NULL;
+ MediaBufsStatus stat;
+
+ if ((req = media_request_get(ctx->mpool)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+
+ if (set_req_ctls(ctx, req,
+ controls,
+#if HEVC_CTRLS_VERSION >= 2
+ &rd->dec,
+#endif
+ rd->slice_params,
+ i, j - i)) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
+ goto fail1;
+ }
+
+ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
+ goto fail1;
+ }
+
+ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
+ goto fail2;
+ }
+
+ if (qent_src_params_set(src, &controls->tv)) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
+ goto fail2;
+ }
+
+#warning ANNEX_B start code
+// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
+// }
+
+ stat = mediabufs_start_request(ctx->mbufs, &req, &src,
+ i == 0 ? rd->qe_dst : NULL,
+ j == rd->num_slices);
+
+ if (stat != MEDIABUFS_STATUS_SUCCESS) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
+ return AVERROR_UNKNOWN;
+ }
+ return 0;
+
+fail2:
+ mediabufs_src_qent_abort(ctx->mbufs, &src);
+fail1:
+ media_request_abort(&req);
+ return AVERROR_UNKNOWN;
+}
+
+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
+{
+ const HEVCContext * const h = avctx->priv_data;
+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+ struct req_controls rc;
+ unsigned int i;
+ int rv;
+
+ // It is possible, though maybe a bug, to get an end_frame without
+ // a previous start_frame. If we do then give up.
+ if (!decode_q_in_q(&rd->decode_ent)) {
+ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
+ return AVERROR_INVALIDDATA;
+ }
+
+ {
+ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
+ &h->ps.pps->scaling_list :
+ h->ps.sps->scaling_list_enable_flag ?
+ &h->ps.sps->scaling_list : NULL;
+
+
+ memset(&rc, 0, sizeof(rc));
+ rc.tv = cvt_dpb_to_tv(rd->timestamp);
+ fill_sps(&rc.sps, h->ps.sps);
+ fill_pps(&rc.pps, h->ps.pps);
+ if (sl) {
+ rc.has_scaling = 1;
+ fill_scaling_matrix(sl, &rc.scaling_matrix);
+ }
+ }
+
+ decode_q_wait(&ctx->decode_q, &rd->decode_ent);
+
+    // qe_dst needs to be bound to the data buffer and only returned when that is freed
+ // Alloc almost certainly wants to be serialised if there is any chance of blocking
+ // so we get the next frame to be free in the thread that needs it for decode first.
+ //
+ // In our current world this probably isn't a concern but put it here anyway
+ if (!rd->qe_dst)
+ {
+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
+ rv = AVERROR(ENOMEM);
+ goto fail;
+ }
+ }
+
+ // Send as slices
+ if (ctx->multi_slice)
+ {
+ if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0)
+ goto fail;
+ }
+ else
+ {
+ for (i = 0; i != rd->num_slices; ++i) {
+ if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0)
+ goto fail;
+ }
+ }
+
+    // Set the drm_prime descriptor
+ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
+ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
+ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
+
+ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
+ return 0;
+
+fail:
+ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
+ return rv;
+}
+
+// Initial check & init
+static int
+probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
+{
+ const HEVCContext *h = avctx->priv_data;
+ const HEVCSPS * const sps = h->ps.sps;
+ struct v4l2_ctrl_hevc_sps ctrl_sps;
+ unsigned int i;
+
+ // Check for var slice array
+ struct v4l2_query_ext_ctrl qc[] = {
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS },
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS },
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS },
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX },
+#if HEVC_CTRLS_VERSION >= 2
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS },
+#endif
+ };
+ // Order & size must match!
+ static const size_t ctrl_sizes[] = {
+ sizeof(struct v4l2_ctrl_hevc_slice_params),
+ sizeof(struct v4l2_ctrl_hevc_sps),
+ sizeof(struct v4l2_ctrl_hevc_pps),
+ sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
+#if HEVC_CTRLS_VERSION >= 2
+ sizeof(struct v4l2_ctrl_hevc_decode_params),
+#endif
+ };
+ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
+
+ if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
+ return AVERROR(EINVAL);
+ }
+ for (i = 0; i != noof_ctrls; ++i) {
+ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
+ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
+ return AVERROR(EINVAL);
+ }
+ }
+
+ fill_sps(&ctrl_sps, sps);
+
+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
+ return AVERROR(EINVAL);
+ }
+
+ ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
+ return 0;
+}
+
+// Final init
+static int
+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
+{
+ int ret;
+
+ struct v4l2_query_ext_ctrl querys[] = {
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, },
+ };
+
+ struct v4l2_ext_control ctrls[] = {
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
+ };
+
+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
+
+ ctx->decode_mode = querys[0].default_value;
+
+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED &&
+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
+ return AVERROR(EINVAL);
+ }
+
+ ctx->start_code = querys[1].default_value;
+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE &&
+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
+ return AVERROR(EINVAL);
+ }
+
+ ctx->max_slices = querys[2].elems;
+ if (ctx->max_slices > MAX_SLICES) {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
+ return AVERROR(EINVAL);
+ }
+
+ ctrls[0].value = ctx->decode_mode;
+ ctrls[1].value = ctx->start_code;
+
+ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
+ return !ret ? 0 : AVERROR(-ret);
+}
+
+static void v4l2_req_frame_free(void *opaque, uint8_t *data)
+{
+ AVCodecContext *avctx = opaque;
+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
+
+ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
+
+ qent_dst_unref(&rd->qe_dst);
+
+ // We don't expect req or qe_src to be set
+ if (rd->req || rd->qe_src)
+ av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src);
+
+ av_freep(&rd->slices);
+ av_freep(&rd->slice_params);
+
+ av_free(rd);
+}
+
+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
+{
+ AVCodecContext *avctx = opaque;
+// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+// V4L2MediaReqDescriptor *req;
+ AVBufferRef *ref;
+ uint8_t *data;
+// int ret;
+
+ data = av_mallocz(size);
+ if (!data)
+ return NULL;
+
+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
+ ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
+ if (!ref) {
+ av_freep(&data);
+ return NULL;
+ }
+ return ref;
+}
+
+#if 0
+static void v4l2_req_pool_free(void *opaque)
+{
+ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
+}
+
+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
+{
+ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
+
+ av_buffer_pool_uninit(&hwfc->pool);
+}
+#endif
+
+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
+{
+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
+ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
+
+ hwfc->format = AV_PIX_FMT_DRM_PRIME;
+ hwfc->sw_format = pixel_format_from_format(vfmt);
+ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
+ hwfc->width = vfmt->fmt.pix_mp.width;
+ hwfc->height = vfmt->fmt.pix_mp.height;
+ } else {
+ hwfc->width = vfmt->fmt.pix.width;
+ hwfc->height = vfmt->fmt.pix.height;
+ }
+#if 0
+ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
+ if (!hwfc->pool)
+ return AVERROR(ENOMEM);
+
+ hwfc->free = v4l2_req_hwframe_ctx_free;
+
+ hwfc->initial_pool_size = 1;
+
+ switch (avctx->codec_id) {
+ case AV_CODEC_ID_VP9:
+ hwfc->initial_pool_size += 8;
+ break;
+ case AV_CODEC_ID_VP8:
+ hwfc->initial_pool_size += 3;
+ break;
+ default:
+ hwfc->initial_pool_size += 2;
+ }
+#endif
+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
+
+ return 0;
+}
+
+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
+{
+ int rv;
+
+ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
+ if (!frame->buf[0])
+ return AVERROR(ENOMEM);
+
+ frame->data[0] = frame->buf[0]->data;
+
+ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
+
+ if ((rv = ff_attach_decode_data(frame)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
+ av_frame_unref(frame);
+ return rv;
+ }
+
+ return 0;
+}
+
+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {
+ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
+ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
+ .probe = probe,
+ .set_controls = set_controls,
+
+ .start_frame = v4l2_request_hevc_start_frame,
+ .decode_slice = v4l2_request_hevc_decode_slice,
+ .end_frame = v4l2_request_hevc_end_frame,
+ .abort_frame = v4l2_request_hevc_abort_frame,
+ .frame_params = frame_params,
+ .alloc_frame = alloc_frame,
+};
+
diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
new file mode 100644
index 0000000000..eb00ecb406
--- /dev/null
+++ b/libavcodec/v4l2_req_media.c
@@ -0,0 +1,1596 @@
+/*
+ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/media.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/ioctl.h>
+
+#include <linux/videodev2.h>
+
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_media.h"
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_utils.h"
+#include "weak_link.h"
+
+
+/* floor(log2(x)) */
+static unsigned int log2_size(size_t x)
+{
+ unsigned int n = 0;
+
+ if (x & ~0xffff) {
+ n += 16;
+ x >>= 16;
+ }
+ if (x & ~0xff) {
+ n += 8;
+ x >>= 8;
+ }
+ if (x & ~0xf) {
+ n += 4;
+ x >>= 4;
+ }
+ if (x & ~3) {
+ n += 2;
+ x >>= 2;
+ }
+ return (x & ~1) ? n + 1 : n;
+}
+
+static size_t round_up_size(const size_t x)
+{
+ /* Admit no size < 256 */
+ const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
+
+ return x >= (3 << n) ? 4 << n : (3 << n);
+}
+
+struct media_request;
+
+struct media_pool {
+ int fd;
+ sem_t sem;
+ pthread_mutex_t lock;
+ struct media_request * free_reqs;
+ struct pollqueue * pq;
+};
+
+struct media_request {
+ struct media_request * next;
+ struct media_pool * mp;
+ int fd;
+ struct polltask * pt;
+};
+
+
+static inline int do_trywait(sem_t *const sem)
+{
+ while (sem_trywait(sem)) {
+ if (errno != EINTR)
+ return -errno;
+ }
+ return 0;
+}
+
+static inline int do_wait(sem_t *const sem)
+{
+ while (sem_wait(sem)) {
+ if (errno != EINTR)
+ return -errno;
+ }
+ return 0;
+}
+
+static int request_buffers(int video_fd, unsigned int type,
+ enum v4l2_memory memory, unsigned int buffers_count)
+{
+ struct v4l2_requestbuffers buffers;
+ int rc;
+
+ memset(&buffers, 0, sizeof(buffers));
+ buffers.type = type;
+ buffers.memory = memory;
+ buffers.count = buffers_count;
+
+ rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
+ if (rc < 0) {
+ rc = -errno;
+ request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
+
+static int set_stream(int video_fd, unsigned int type, bool enable)
+{
+ enum v4l2_buf_type buf_type = type;
+ int rc;
+
+ rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF,
+ &buf_type);
+ if (rc < 0) {
+ rc = -errno;
+ request_log("Unable to %sable stream: %s\n",
+ enable ? "en" : "dis", strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
+
+
+struct media_request * media_request_get(struct media_pool * const mp)
+{
+ struct media_request *req = NULL;
+
+ /* Timeout handled by poll code */
+ if (do_wait(&mp->sem))
+ return NULL;
+
+ pthread_mutex_lock(&mp->lock);
+ req = mp->free_reqs;
+ if (req) {
+ mp->free_reqs = req->next;
+ req->next = NULL;
+ }
+ pthread_mutex_unlock(&mp->lock);
+ return req;
+}
+
+int media_request_fd(const struct media_request * const req)
+{
+ return req->fd;
+}
+
+int media_request_start(struct media_request * const req)
+{
+ while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1)
+ {
+ const int err = errno;
+ if (err == EINTR)
+ continue;
+ request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
+ return -err;
+ }
+
+ pollqueue_add_task(req->pt, 2000);
+ return 0;
+}
+
+static void media_request_done(void *v, short revents)
+{
+ struct media_request *const req = v;
+ struct media_pool *const mp = req->mp;
+
+ /* ** Not sure what to do about timeout */
+
+ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
+ request_log("Unable to reinit media request: %s\n",
+ strerror(errno));
+
+ pthread_mutex_lock(&mp->lock);
+ req->next = mp->free_reqs;
+ mp->free_reqs = req;
+ pthread_mutex_unlock(&mp->lock);
+ sem_post(&mp->sem);
+}
+
+int media_request_abort(struct media_request ** const preq)
+{
+ struct media_request * const req = *preq;
+
+ if (req == NULL)
+ return 0;
+ *preq = NULL;
+
+ media_request_done(req, 0);
+ return 0;
+}
+
+static void delete_req_chain(struct media_request * const chain)
+{
+ struct media_request * next = chain;
+ while (next) {
+ struct media_request * const req = next;
+ next = req->next;
+ if (req->pt)
+ polltask_delete(&req->pt);
+ if (req->fd != -1)
+ close(req->fd);
+ free(req);
+ }
+}
+
+struct media_pool * media_pool_new(const char * const media_path,
+ struct pollqueue * const pq,
+ const unsigned int n)
+{
+ struct media_pool * const mp = calloc(1, sizeof(*mp));
+ unsigned int i;
+
+ if (!mp)
+ goto fail0;
+
+ mp->pq = pq;
+ pthread_mutex_init(&mp->lock, NULL);
+ mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
+ if (mp->fd == -1) {
+ request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
+ goto fail1;
+ }
+
+ for (i = 0; i != n; ++i) {
+ struct media_request * req = malloc(sizeof(*req));
+ if (!req)
+ goto fail4;
+
+ *req = (struct media_request){
+ .next = mp->free_reqs,
+ .mp = mp,
+ .fd = -1
+ };
+ mp->free_reqs = req;
+
+ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
+ request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
+ goto fail4;
+ }
+
+ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
+ if (!req->pt)
+ goto fail4;
+ }
+
+ sem_init(&mp->sem, 0, n);
+
+ return mp;
+
+fail4:
+ delete_req_chain(mp->free_reqs);
+ close(mp->fd);
+ pthread_mutex_destroy(&mp->lock);
+fail1:
+ free(mp);
+fail0:
+ return NULL;
+}
+
+void media_pool_delete(struct media_pool ** pMp)
+{
+ struct media_pool * const mp = *pMp;
+
+ if (!mp)
+ return;
+ *pMp = NULL;
+
+ delete_req_chain(mp->free_reqs);
+ close(mp->fd);
+ sem_destroy(&mp->sem);
+ pthread_mutex_destroy(&mp->lock);
+ free(mp);
+}
+
+
+#define INDEX_UNSET (~(uint32_t)0)
+
+enum qent_status {
+ QENT_NEW = 0, // Initial state - shouldn't last
+ QENT_FREE, // On free chain
+ QENT_PENDING, // User has ent
+ QENT_WAITING, // On inuse
+ QENT_DONE, // Frame rx
+ QENT_ERROR, // Error
+ QENT_IMPORT
+};
+
+struct qent_base {
+ atomic_int ref_count;
+ struct qent_base *next;
+ struct qent_base *prev;
+ enum qent_status status;
+ uint32_t index;
+ struct dmabuf_h *dh[VIDEO_MAX_PLANES];
+ struct timeval timestamp;
+};
+
+struct qent_src {
+ struct qent_base base;
+ int fixed_size;
+};
+
+struct qent_dst {
+ struct qent_base base;
+ bool waiting;
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ struct ff_weak_link_client * mbc_wl;
+};
+
+struct qe_list_head {
+ struct qent_base *head;
+ struct qent_base *tail;
+};
+
+struct buf_pool {
+ pthread_mutex_t lock;
+ sem_t free_sem;
+ enum v4l2_buf_type buf_type;
+ struct qe_list_head free;
+ struct qe_list_head inuse;
+};
+
+
+static inline struct qent_dst *base_to_dst(struct qent_base *be)
+{
+ return (struct qent_dst *)be;
+}
+
+static inline struct qent_src *base_to_src(struct qent_base *be)
+{
+ return (struct qent_src *)be;
+}
+
+
+#define QENT_BASE_INITIALIZER {\
+ .ref_count = ATOMIC_VAR_INIT(0),\
+ .status = QENT_NEW,\
+ .index = INDEX_UNSET\
+}
+
+static void qe_base_uninit(struct qent_base *const be)
+{
+ unsigned int i;
+ for (i = 0; i != VIDEO_MAX_PLANES; ++i) {
+ dmabuf_free(be->dh[i]);
+ be->dh[i] = NULL;
+ }
+}
+
+static void qe_src_free(struct qent_src *const be_src)
+{
+ if (!be_src)
+ return;
+ qe_base_uninit(&be_src->base);
+ free(be_src);
+}
+
+static struct qent_src * qe_src_new(void)
+{
+ struct qent_src *const be_src = malloc(sizeof(*be_src));
+ if (!be_src)
+ return NULL;
+ *be_src = (struct qent_src){
+ .base = QENT_BASE_INITIALIZER
+ };
+ return be_src;
+}
+
+static void qe_dst_free(struct qent_dst *const be_dst)
+{
+ if (!be_dst)
+ return;
+
+ ff_weak_link_unref(&be_dst->mbc_wl);
+ pthread_cond_destroy(&be_dst->cond);
+ pthread_mutex_destroy(&be_dst->lock);
+ qe_base_uninit(&be_dst->base);
+ free(be_dst);
+}
+
+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
+{
+ struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
+ if (!be_dst)
+ return NULL;
+ *be_dst = (struct qent_dst){
+ .base = QENT_BASE_INITIALIZER,
+ .lock = PTHREAD_MUTEX_INITIALIZER,
+ .cond = PTHREAD_COND_INITIALIZER,
+ .mbc_wl = ff_weak_link_ref(wl)
+ };
+ return be_dst;
+}
+
+static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
+{
+ if (ql->tail)
+ ql->tail->next = be;
+ else
+ ql->head = be;
+ be->prev = ql->tail;
+ be->next = NULL;
+ ql->tail = be;
+}
+
+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
+{
+ if (!be)
+ return NULL;
+
+ if (be->next)
+ be->next->prev = be->prev;
+ else
+ ql->tail = be->prev;
+ if (be->prev)
+ be->prev->next = be->next;
+ else
+ ql->head = be->next;
+ be->next = NULL;
+ be->prev = NULL;
+ return be;
+}
+
+
+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
+{
+ ql_add_tail(&bp->free, be);
+}
+
+static struct qent_base * bq_get_free(struct buf_pool *const bp)
+{
+ return ql_extract(&bp->free, bp->free.head);
+}
+
+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
+{
+ return ql_extract(&bp->inuse, be);
+}
+
+static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
+{
+ return ql_extract(&bp->inuse, bp->inuse.head);
+}
+
+static void bq_free_all_free_src(struct buf_pool *const bp)
+{
+ struct qent_base *be;
+ while ((be = bq_get_free(bp)) != NULL)
+ qe_src_free(base_to_src(be));
+}
+
+static void bq_free_all_inuse_src(struct buf_pool *const bp)
+{
+ struct qent_base *be;
+ while ((be = bq_get_inuse(bp)) != NULL)
+ qe_src_free(base_to_src(be));
+}
+
+static void bq_free_all_free_dst(struct buf_pool *const bp)
+{
+ struct qent_base *be;
+ while ((be = bq_get_free(bp)) != NULL)
+ qe_dst_free(base_to_dst(be));
+}
+
+static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
+{
+ unsigned int i;
+
+ pthread_mutex_lock(&bp->lock);
+ /* Clear out state vars */
+ be->timestamp.tv_sec = 0;
+ be->timestamp.tv_usec = 0;
+ be->status = QENT_FREE;
+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i)
+ dmabuf_len_set(be->dh[i], 0);
+ bq_put_free(bp, be);
+ pthread_mutex_unlock(&bp->lock);
+ sem_post(&bp->free_sem);
+}
+
+static bool queue_is_inuse(const struct buf_pool *const bp)
+{
+ return bp->inuse.tail != NULL;
+}
+
+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
+{
+ if (!be)
+ return;
+ pthread_mutex_lock(&bp->lock);
+ ql_add_tail(&bp->inuse, be);
+ be->status = QENT_WAITING;
+ pthread_mutex_unlock(&bp->lock);
+}
+
+static struct qent_base *queue_get_free(struct buf_pool *const bp)
+{
+ struct qent_base *buf;
+
+ if (do_wait(&bp->free_sem))
+ return NULL;
+ pthread_mutex_lock(&bp->lock);
+ buf = bq_get_free(bp);
+ pthread_mutex_unlock(&bp->lock);
+ return buf;
+}
+
+static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
+{
+ struct qent_base *buf;
+
+ if (do_trywait(&bp->free_sem))
+ return NULL;
+ pthread_mutex_lock(&bp->lock);
+ buf = bq_get_free(bp);
+ pthread_mutex_unlock(&bp->lock);
+ return buf;
+}
+
+static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
+{
+ struct qent_base *be;
+
+ pthread_mutex_lock(&bp->lock);
+ /* Expect 1st in Q, but allow anywhere */
+ for (be = bp->inuse.head; be; be = be->next) {
+ if (dmabuf_fd(be->dh[0]) == fd) {
+ bq_extract_inuse(bp, be);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&bp->lock);
+
+ return be;
+}
+
+static void queue_delete(struct buf_pool *const bp)
+{
+ sem_destroy(&bp->free_sem);
+ pthread_mutex_destroy(&bp->lock);
+ free(bp);
+}
+
+static struct buf_pool* queue_new(const int vfd)
+{
+ struct buf_pool *bp = calloc(1, sizeof(*bp));
+ if (!bp)
+ return NULL;
+ pthread_mutex_init(&bp->lock, NULL);
+ sem_init(&bp->free_sem, 0, 0);
+ return bp;
+}
+
+
+struct mediabufs_ctl {
+ atomic_int ref_count; /* 0 is single ref for easier atomics */
+ void * dc;
+ int vfd;
+ bool stream_on;
+ bool polling;
+ bool dst_fixed; // Dst Q is fixed size
+ pthread_mutex_t lock;
+ struct buf_pool * src;
+ struct buf_pool * dst;
+ struct polltask * pt;
+ struct pollqueue * pq;
+ struct ff_weak_link_master * this_wlm;
+
+ struct v4l2_format src_fmt;
+ struct v4l2_format dst_fmt;
+};
+
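+// Queue a buffer to V4L2 (VIDIOC_QBUF). For src buffers the media request is
+// attached and, if hold_flag is set, the M2M hold-capture flag; dst buffers
+// have their plane lengths and timestamp reset before queuing.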
+static int qe_v4l2_queue(struct qent_base *const be,
+ const int vfd, struct media_request *const mreq,
+ const struct v4l2_format *const fmt,
+ const bool is_dst, const bool hold_flag)
+{
+ struct v4l2_buffer buffer = {
+ .type = fmt->type,
+ .memory = V4L2_MEMORY_DMABUF,
+ .index = be->index
+ };
+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ unsigned int i;
+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
+ if (is_dst)
+ dmabuf_len_set(be->dh[i], 0);
+
+ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
+ planes[i].length = dmabuf_size(be->dh[i]);
+ planes[i].bytesused = dmabuf_len(be->dh[i]);
+ planes[i].m.fd = dmabuf_fd(be->dh[i]);
+ }
+ buffer.m.planes = planes;
+ buffer.length = i;
+ }
+ else {
+ if (is_dst)
+ dmabuf_len_set(be->dh[0], 0);
+
+ buffer.bytesused = dmabuf_len(be->dh[0]);
+ buffer.length = dmabuf_size(be->dh[0]);
+ buffer.m.fd = dmabuf_fd(be->dh[0]);
+ }
+
+ if (!is_dst && mreq) {
+ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
+ buffer.request_fd = media_request_fd(mreq);
+ if (hold_flag)
+ buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
+ }
+
+ if (is_dst)
+ be->timestamp = (struct timeval){0,0};
+
+ buffer.timestamp = be->timestamp;
+
+ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
+ const int err = errno;
+ if (err != EINTR) {
+ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
+ return -err;
+ }
+ }
+ return 0;
+}
+
+static struct qent_base * qe_dequeue(struct buf_pool *const bp,
+ const int vfd,
+ const struct v4l2_format * const f)
+{
+ int fd;
+ struct qent_base *be;
+ int rc;
+ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
+ struct v4l2_buffer buffer = {
+ .type = f->type,
+ .memory = V4L2_MEMORY_DMABUF
+ };
+ if (mp) {
+ buffer.length = f->fmt.pix_mp.num_planes;
+ buffer.m.planes = planes;
+ }
+
+ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
+ errno == EINTR)
+ /* Loop */;
+ if (rc) {
+ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
+ return NULL;
+ }
+
+ fd = mp ? planes[0].m.fd : buffer.m.fd;
+ be = queue_find_extract_fd(bp, fd);
+ if (!be) {
+ request_log("Failed to find fd %d in Q\n", fd);
+ return NULL;
+ }
+
+ be->timestamp = buffer.timestamp;
+ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
+ return be;
+}
+
+static void qe_dst_done(struct qent_dst * dst_be)
+{
+ pthread_mutex_lock(&dst_be->lock);
+ dst_be->waiting = false;
+ pthread_cond_broadcast(&dst_be->cond);
+ pthread_mutex_unlock(&dst_be->lock);
+
+ qent_dst_unref(&dst_be);
+}
+
+static bool qe_dst_waiting(struct qent_dst *const dst_be)
+{
+ bool waiting;
+ pthread_mutex_lock(&dst_be->lock);
+ waiting = dst_be->waiting;
+ dst_be->waiting = true;
+ pthread_mutex_unlock(&dst_be->lock);
+ return waiting;
+}
+
+
+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
+{
+ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
+}
+
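+// Poll completion callback: dequeue any finished src/dst buffers, re-arm the
+// poll while buffers remain in flight, then release the src entry and signal
+// the dst waiter outside the lock.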
+static void mediabufs_poll_cb(void * v, short revents)
+{
+ struct mediabufs_ctl *mbc = v;
+ struct qent_src *src_be = NULL;
+ struct qent_dst *dst_be = NULL;
+
+ if (!revents)
+ request_err(mbc->dc, "%s: Timeout\n", __func__);
+
+ pthread_mutex_lock(&mbc->lock);
+ mbc->polling = false;
+
+ if ((revents & POLLOUT) != 0)
+ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
+ if ((revents & POLLIN) != 0)
+ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
+
+ /* Reschedule */
+ if (mediabufs_wants_poll(mbc)) {
+ mbc->polling = true;
+ pollqueue_add_task(mbc->pt, 2000);
+ }
+ pthread_mutex_unlock(&mbc->lock);
+
+ if (src_be)
+ queue_put_free(mbc->src, &src_be->base);
+ if (dst_be)
+ qe_dst_done(dst_be);
+}
+
+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
+{
+ struct qent_base *const be = &be_src->base;
+
+ be->timestamp = *timestamp;
+ return 0;
+}
+
+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
+{
+ return be_dst->base.timestamp;
+}
+
+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
+{
+ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
+ size_t newsize = round_up_size(len);
+ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
+ if (!dbsc) {
+            request_log("%s: No dmabufs_ctl for realloc\n", __func__);
+ return -ENOMEM;
+ }
+ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
+ request_log("%s: Realloc %zd failed\n", __func__, newsize);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
+{
+ struct qent_base *const be = &be_src->base;
+ return qent_base_realloc(be, len, dbsc);
+}
+
+
+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
+{
+ void * dst;
+ struct qent_base *const be = &be_src->base;
+ int rv;
+
+ // Realloc doesn't copy so don't alloc if offset != 0
+ if ((rv = qent_base_realloc(be, offset + len,
+ be_src->fixed_size || offset ? NULL : dbsc)) != 0)
+ return rv;
+
+ dmabuf_write_start(be->dh[0]);
+ dst = dmabuf_map(be->dh[0]);
+ if (!dst)
+ return -1;
+ memcpy((char*)dst + offset, src, len);
+ dmabuf_len_set(be->dh[0], len);
+ dmabuf_write_end(be->dh[0]);
+ return 0;
+}
+
+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
+{
+ const struct qent_base *const be = &be_dst->base;
+
+ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane];
+}
+
+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
+{
+ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
+}
+
+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
+ struct media_request **const pmreq,
+ struct qent_src **const psrc_be,
+ struct qent_dst *const dst_be,
+ const bool is_final)
+{
+ struct media_request * mreq = *pmreq;
+ struct qent_src *const src_be = *psrc_be;
+
+ // Req & src are always both "consumed"
+ *pmreq = NULL;
+ *psrc_be = NULL;
+
+ pthread_mutex_lock(&mbc->lock);
+
+ if (!src_be)
+ goto fail1;
+
+ if (dst_be) {
+ if (qe_dst_waiting(dst_be)) {
+ request_info(mbc->dc, "Request buffer already waiting on start\n");
+ goto fail1;
+ }
+ dst_be->base.timestamp = (struct timeval){0,0};
+ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
+ goto fail1;
+
+ qent_dst_ref(dst_be);
+ queue_put_inuse(mbc->dst, &dst_be->base);
+ }
+
+ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
+ goto fail1;
+ queue_put_inuse(mbc->src, &src_be->base);
+
+ if (!mbc->polling && mediabufs_wants_poll(mbc)) {
+ mbc->polling = true;
+ pollqueue_add_task(mbc->pt, 2000);
+ }
+ pthread_mutex_unlock(&mbc->lock);
+
+ if (media_request_start(mreq))
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ return MEDIABUFS_STATUS_SUCCESS;
+
+fail1:
+ media_request_abort(&mreq);
+ if (src_be)
+ queue_put_free(mbc->src, &src_be->base);
+
+// *** TODO: If src Q fails this doesn't unwind properly - separate dst Q from src Q
+ if (dst_be) {
+ dst_be->base.status = QENT_ERROR;
+ qe_dst_done(dst_be);
+ }
+ pthread_mutex_unlock(&mbc->lock);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+
+static int qe_alloc_from_fmt(struct qent_base *const be,
+ struct dmabufs_ctl *const dbsc,
+ const struct v4l2_format *const fmt)
+{
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ unsigned int i;
+ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
+ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
+ fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
+ /* On failure tidy up and die */
+ if (!be->dh[i]) {
+ while (i--) {
+ dmabuf_free(be->dh[i]);
+ be->dh[i] = NULL;
+ }
+ return -1;
+ }
+ }
+ }
+ else {
+// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
+ size_t size = fmt->fmt.pix.sizeimage;
+ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
+ if (!be->dh[0])
+ return -1;
+ }
+ return 0;
+}
+
+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
+ const enum v4l2_buf_type buftype,
+ uint32_t pixfmt,
+ const unsigned int width, const unsigned int height,
+ const size_t bufsize)
+{
+ *fmt = (struct v4l2_format){.type = buftype};
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
+ fmt->fmt.pix_mp.width = width;
+ fmt->fmt.pix_mp.height = height;
+ fmt->fmt.pix_mp.pixelformat = pixfmt;
+ if (bufsize) {
+ fmt->fmt.pix_mp.num_planes = 1;
+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
+ }
+ }
+ else {
+ fmt->fmt.pix.width = width;
+ fmt->fmt.pix.height = height;
+ fmt->fmt.pix.pixelformat = pixfmt;
+ fmt->fmt.pix.sizeimage = bufsize;
+ }
+
+ while (ioctl(fd, VIDIOC_S_FMT, fmt))
+ if (errno != EINTR)
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ // Treat anything where we don't get at least what we asked for as a fail
+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
+ if (fmt->fmt.pix_mp.width < width ||
+ fmt->fmt.pix_mp.height < height ||
+ fmt->fmt.pix_mp.pixelformat != pixfmt) {
+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
+ }
+ }
+ else {
+ if (fmt->fmt.pix.width < width ||
+ fmt->fmt.pix.height < height ||
+ fmt->fmt.pix.pixelformat != pixfmt) {
+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
+ }
+ }
+
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
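+// Walk the driver's format enumeration (VIDIOC_ENUM_FMT), skipping formats
+// that fail the flag constraints or the caller's accept callback, and take
+// the first one that VIDIOC_S_FMT will actually give us at the requested size.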
+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
+ const int fd,
+ const unsigned int type_v4l2,
+ const uint32_t flags_must,
+ const uint32_t flags_not,
+ const unsigned int width,
+ const unsigned int height,
+ mediabufs_dst_fmt_accept_fn *const accept_fn,
+ void *const accept_v)
+{
+ unsigned int i;
+
+ for (i = 0;; ++i) {
+ struct v4l2_fmtdesc fmtdesc = {
+ .index = i,
+ .type = type_v4l2
+ };
+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
+ if (errno != EINTR)
+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
+ }
+ if ((fmtdesc.flags & flags_must) != flags_must ||
+ (fmtdesc.flags & flags_not))
+ continue;
+ if (!accept_fn(accept_v, &fmtdesc))
+ continue;
+
+ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
+ width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
+ return MEDIABUFS_STATUS_SUCCESS;
+ }
+ return 0;
+}
+
+
+/* Wait for qent done */
+
+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
+{
+ struct qent_base *const be = &be_dst->base;
+ enum qent_status estat;
+
+ pthread_mutex_lock(&be_dst->lock);
+ while (be_dst->waiting &&
+ !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
+ /* Loop */;
+ estat = be->status;
+ pthread_mutex_unlock(&be_dst->lock);
+
+ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
+ estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR :
+ MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
+{
+ struct qent_base *const be = &be_dst->base;
+ return dmabuf_map(be->dh[buf_no]);
+}
+
+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
+{
+ struct qent_base *const be = &be_dst->base;
+ unsigned int i;
+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
+ if (dmabuf_read_start(be->dh[i])) {
+ while (i--)
+ dmabuf_read_end(be->dh[i]);
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+ }
+ }
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
+{
+ struct qent_base *const be = &be_dst->base;
+ unsigned int i;
+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
+
+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
+ if (dmabuf_read_end(be->dh[i]))
+ status = MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ return status;
+}
+
+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
+{
+ if (be_dst)
+ atomic_fetch_add(&be_dst->base.ref_count, 1);
+ return be_dst;
+}
+
+void qent_dst_unref(struct qent_dst ** const pbe_dst)
+{
+ struct qent_dst * const be_dst = *pbe_dst;
+ struct mediabufs_ctl * mbc;
+ if (!be_dst)
+ return;
+ *pbe_dst = NULL;
+
+ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
+ return;
+
+ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
+ queue_put_free(mbc->dst, &be_dst->base);
+ ff_weak_link_unlock(be_dst->mbc_wl);
+ }
+ else {
+ qe_dst_free(be_dst);
+ }
+}
+
+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
+ unsigned int plane,
+ int fd, size_t size)
+{
+ struct qent_base *const be = &be_dst->base;
+ struct dmabuf_h * dh;
+
+ if (be->status != QENT_IMPORT || be->dh[plane])
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ dh = dmabuf_import(fd, size);
+ if (!dh)
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+
+ be->dh[plane] = dh;
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+// Returns the number of buffers created, -ve for error
+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
+{
+ unsigned int i;
+
+ struct v4l2_create_buffers cbuf = {
+ .count = n,
+ .memory = V4L2_MEMORY_DMABUF,
+ .format = mbc->dst_fmt,
+ };
+
+ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
+ const int err = -errno;
+ if (err != EINTR) {
+ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
+ return -err;
+ }
+ }
+
+ if (cbuf.count != n)
+ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
+
+ for (i = 0; i != cbuf.count; ++i)
+ qes[i]->base.index = cbuf.index + i;
+
+ return cbuf.count;
+}
+
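+// Get a dst queue entry. With no mediabufs_ctl this creates a bare entry for
+// fd import; with a fixed-size dst queue it waits for a free entry; otherwise
+// a new V4L2 buffer is created on demand when the free queue is empty.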
+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
+{
+ struct qent_dst * be_dst;
+
+ if (mbc == NULL) {
+ be_dst = qe_dst_new(NULL);
+ if (be_dst)
+ be_dst->base.status = QENT_IMPORT;
+ return be_dst;
+ }
+
+ if (mbc->dst_fixed) {
+ be_dst = base_to_dst(queue_get_free(mbc->dst));
+ if (!be_dst)
+ return NULL;
+ }
+ else {
+ be_dst = base_to_dst(queue_tryget_free(mbc->dst));
+ if (!be_dst) {
+ be_dst = qe_dst_new(mbc->this_wlm);
+ if (!be_dst)
+ return NULL;
+
+ if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
+ qe_dst_free(be_dst);
+ return NULL;
+ }
+ }
+ }
+
+ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
+        /* Given how buffer creation (VIDIOC_CREATE_BUFS) works we can't
+         * uncreate it on alloc failure; all we can do is put it on the free Q
+         */
+ queue_put_free(mbc->dst, &be_dst->base);
+ return NULL;
+ }
+
+ be_dst->base.status = QENT_PENDING;
+ atomic_store(&be_dst->base.ref_count, 0);
+ return be_dst;
+}
+
+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
+{
+ return &mbc->dst_fmt;
+}
+
+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
+ const unsigned int width,
+ const unsigned int height,
+ mediabufs_dst_fmt_accept_fn *const accept_fn,
+ void *const accept_v)
+{
+ MediaBufsStatus status;
+ unsigned int i;
+ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
+ static const struct {
+ unsigned int flags_must;
+ unsigned int flags_not;
+ } trys[] = {
+ {0, V4L2_FMT_FLAG_EMULATED},
+ {V4L2_FMT_FLAG_EMULATED, 0},
+ };
+ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
+ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
+ buf_type,
+ trys[i].flags_must,
+ trys[i].flags_not,
+ width, height, accept_fn, accept_v);
+ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)
+ return status;
+ }
+
+ if (status != MEDIABUFS_STATUS_SUCCESS)
+ return status;
+
+ /* Try to create a buffer - don't alloc */
+ return status;
+}
+
+// ** This is a mess if we get a partial alloc, but without any way to remove
+// individual V4L2 Q members we are somewhat stuffed
+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
+{
+ unsigned int i;
+ int a = 0;
+ unsigned int qc;
+ struct qent_dst * qes[32];
+
+ if (n > 32)
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+
+ // Create qents first as it is hard to get rid of the V4L2 buffers on error
+ for (qc = 0; qc != n; ++qc)
+ {
+ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
+ goto fail;
+ }
+
+ if ((a = create_dst_bufs(mbc, n, qes)) < 0)
+ goto fail;
+
+ for (i = 0; i != a; ++i)
+ queue_put_free(mbc->dst, &qes[i]->base);
+
+ if (a != n)
+ goto fail;
+
+ mbc->dst_fixed = fixed;
+ return MEDIABUFS_STATUS_SUCCESS;
+
+fail:
+ for (i = (a < 0 ? 0 : a); i != qc; ++i)
+ qe_dst_free(qes[i]);
+
+ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+}
+
+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
+{
+ struct qent_base * buf = queue_get_free(mbc->src);
+ buf->status = QENT_PENDING;
+ return base_to_src(buf);
+}
+
+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
+{
+ struct qent_src *const qe_src = *pqe_src;
+ if (!qe_src)
+ return;
+ *pqe_src = NULL;
+ queue_put_free(mbc->src, &qe_src->base);
+}
+
+/* src format must have been set up before this */
+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
+ struct dmabufs_ctl * const dbsc,
+ unsigned int n)
+{
+ unsigned int i;
+ struct v4l2_requestbuffers req = {
+ .count = n,
+ .type = mbc->src_fmt.type,
+ .memory = V4L2_MEMORY_DMABUF
+ };
+
+ bq_free_all_free_src(mbc->src);
+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
+ if (errno != EINTR) {
+ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ }
+
+ if (n > req.count) {
+ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
+ n = req.count;
+ }
+
+ for (i = 0; i != n; ++i) {
+ struct qent_src *const be_src = qe_src_new();
+ if (!be_src) {
+ request_err(mbc->dc, "Failed to create src be %d\n", i);
+ goto fail;
+ }
+ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
+ qe_src_free(be_src);
+ goto fail;
+ }
+ be_src->base.index = i;
+ be_src->fixed_size = !mediabufs_src_resizable(mbc);
+
+ queue_put_free(mbc->src, &be_src->base);
+ }
+
+ return MEDIABUFS_STATUS_SUCCESS;
+
+fail:
+ bq_free_all_free_src(mbc->src);
+ req.count = 0;
+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
+ errno == EINTR)
+ /* Loop */;
+
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+
+
+/*
+ * Set stuff order:
+ * Set src fmt
+ * Set parameters (sps) on vfd
+ * Negotiate dst format (dst_fmt_set)
+ * Create src buffers
+ * Alloc a dst buffer or Create dst slots
+*/
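+/* A minimal sketch of that order using the calls in this API (illustrative
+ * only - error handling omitted; dc, video_path, pq, dbufs, src_buf_type,
+ * src_pixfmt, ctrl_id, sps, sps_size, w, h, src_size, n_slots, accept_cb and
+ * accept_v are placeholder names):
+ *
+ *   struct mediabufs_ctl *mbc = mediabufs_ctl_new(dc, video_path, pq);
+ *   mediabufs_src_fmt_set(mbc, src_buf_type, src_pixfmt, w, h, src_size);
+ *   mediabufs_set_ext_ctrl(mbc, NULL, ctrl_id, sps, sps_size);
+ *   mediabufs_dst_fmt_set(mbc, w, h, accept_cb, accept_v);
+ *   mediabufs_src_pool_create(mbc, dbufs, 6);
+ *   mediabufs_dst_slots_create(mbc, n_slots, false);
+ *   mediabufs_stream_on(mbc);
+ */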
+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
+{
+ if (mbc->stream_on)
+ return MEDIABUFS_STATUS_SUCCESS;
+
+ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
+ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
+ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
+ set_stream(mbc->vfd, mbc->src_fmt.type, false);
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ mbc->stream_on = true;
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
+{
+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
+
+ if (!mbc->stream_on)
+ return MEDIABUFS_STATUS_SUCCESS;
+
+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
+ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
+ status = MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
+ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
+ status = MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ mbc->stream_on = false;
+ return status;
+}
+
+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
+{
+ struct v4l2_ext_controls controls = {
+ .controls = control_array,
+ .count = n
+ };
+
+ if (mreq) {
+ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
+ controls.request_fd = media_request_fd(mreq);
+ }
+
+ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))
+ {
+ const int err = errno;
+ if (err != EINTR) {
+ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
+ return -err;
+ }
+ }
+
+ return 0;
+}
+
+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
+ struct media_request * const mreq,
+ unsigned int id, void *data,
+ unsigned int size)
+{
+ struct v4l2_ext_control control = {
+ .id = id,
+ .ptr = data,
+ .size = size
+ };
+
+ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
+ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
+ enum v4l2_buf_type buf_type,
+ const uint32_t pixfmt,
+ const uint32_t width, const uint32_t height,
+ const size_t bufsize)
+{
+ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
+ if (rv != MEDIABUFS_STATUS_SUCCESS)
+ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
+
+ return rv;
+}
+
+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
+{
+ int rv = 0;
+ while (n--) {
+ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {
+ const int err = errno;
+ if (err != EINTR) {
+ // Often used for probing - errors are to be expected
+ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
+ ctrls->type = 0; // 0 is invalid
+ rv = -err;
+ break;
+ }
+ }
+ ++ctrls;
+ }
+ return rv;
+}
+
+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
+{
+ // Single planar OUTPUT can only take exact size buffers
+ // Multiplanar will take larger than negotiated
+ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
+}
+
+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
+{
+ if (!mbc)
+ return;
+
+ // Break the weak link first
+ ff_weak_link_break(&mbc->this_wlm);
+
+ polltask_delete(&mbc->pt);
+
+ mediabufs_stream_off(mbc);
+
+ // Empty v4l2 buffer stash
+ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
+ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
+
+ bq_free_all_free_src(mbc->src);
+ bq_free_all_inuse_src(mbc->src);
+ bq_free_all_free_dst(mbc->dst);
+
+ {
+ struct qent_dst *dst_be;
+ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
+ dst_be->base.timestamp = (struct timeval){0};
+ dst_be->base.status = QENT_ERROR;
+ qe_dst_done(dst_be);
+ }
+ }
+
+ queue_delete(mbc->dst);
+ queue_delete(mbc->src);
+ close(mbc->vfd);
+ pthread_mutex_destroy(&mbc->lock);
+
+ free(mbc);
+}
+
+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
+{
+ atomic_fetch_add(&mbc->ref_count, 1);
+ return mbc;
+}
+
+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
+{
+ struct mediabufs_ctl *const mbc = *pmbc;
+ int n;
+
+ if (!mbc)
+ return;
+ *pmbc = NULL;
+ n = atomic_fetch_sub(&mbc->ref_count, 1);
+ if (n)
+ return;
+ mediabufs_ctl_delete(mbc);
+}
+
+static int set_capabilities(struct mediabufs_ctl *const mbc)
+{
+ struct v4l2_capability capability = { 0 };
+ uint32_t caps;
+
+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) {
+ int err = errno;
+ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
+ return -err;
+ }
+
+ caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
+ capability.device_caps :
+ capability.capabilities;
+
+ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+ }
+ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+ }
+ else {
+ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* One of these per context */
+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
+{
+ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
+
+ if (!mbc)
+ return NULL;
+
+ mbc->dc = dc;
+ // Default mono planar
+ mbc->pq = pq;
+ pthread_mutex_init(&mbc->lock, NULL);
+
+ /* Pick a default - could we scan for this? */
+ if (vpath == NULL)
+ vpath = "/dev/media0";
+
+ while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
+ {
+ const int err = errno;
+ if (err != EINTR) {
+ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
+ goto fail0;
+ }
+ }
+
+ if (set_capabilities(mbc)) {
+ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
+ goto fail1;
+ }
+
+ mbc->src = queue_new(mbc->vfd);
+ if (!mbc->src)
+ goto fail1;
+ mbc->dst = queue_new(mbc->vfd);
+ if (!mbc->dst)
+ goto fail2;
+ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
+ if (!mbc->pt)
+ goto fail3;
+ mbc->this_wlm = ff_weak_link_new(mbc);
+ if (!mbc->this_wlm)
+ goto fail4;
+
+ /* Cannot add polltask now - polling with nothing pending
+ * generates infinite error polls
+ */
+ return mbc;
+
+fail4:
+ polltask_delete(&mbc->pt);
+fail3:
+ queue_delete(mbc->dst);
+fail2:
+ queue_delete(mbc->src);
+fail1:
+ close(mbc->vfd);
+fail0:
+ free(mbc);
+ request_info(dc, "%s: FAILED\n", __func__);
+ return NULL;
+}
+
+
+
diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
new file mode 100644
index 0000000000..2f826cfb14
--- /dev/null
+++ b/libavcodec/v4l2_req_media.h
@@ -0,0 +1,151 @@
+/*
+ * v4l2_req_media.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _MEDIA_H_
+#define _MEDIA_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+struct v4l2_format;
+struct v4l2_fmtdesc;
+struct v4l2_query_ext_ctrl;
+
+struct pollqueue;
+struct media_request;
+struct media_pool;
+
+typedef enum media_buf_status {
+ MEDIABUFS_STATUS_SUCCESS = 0,
+ MEDIABUFS_ERROR_OPERATION_FAILED,
+ MEDIABUFS_ERROR_DECODING_ERROR,
+ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
+ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
+ MEDIABUFS_ERROR_ALLOCATION_FAILED,
+} MediaBufsStatus;
+
+struct media_pool * media_pool_new(const char * const media_path,
+ struct pollqueue * const pq,
+ const unsigned int n);
+void media_pool_delete(struct media_pool ** pmp);
+
+// Obtain a media request
+// Will block if none available - has a 2 sec timeout
+struct media_request * media_request_get(struct media_pool * const mp);
+int media_request_fd(const struct media_request * const req);
+
+// Start this request
+// Request structure is returned to pool once done
+int media_request_start(struct media_request * const req);
+
+// Return an *unstarted* media_request to the pool
+// May later be upgraded to allow for aborting a started req
+int media_request_abort(struct media_request ** const preq);
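+// Illustrative lifecycle sketch (not normative; mp and req are placeholders):
+//   struct media_request *req = media_request_get(mp); // may block for a slot
+//   ... queue controls/buffers against media_request_fd(req) ...
+//   media_request_start(req);   // req returns to the pool once completed
+// or, if the request was never started:
+//   media_request_abort(&req);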
+
+
+struct mediabufs_ctl;
+struct qent_src;
+struct qent_dst;
+struct dmabuf_h;
+struct dmabufs_ctl;
+
+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
+
+// prealloc
+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
+// dbsc may be NULL if realloc not required
+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
+MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
+void qent_dst_delete(struct qent_dst *const be);
+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
+void qent_dst_unref(struct qent_dst ** const pbe_dst);
+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
+
+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
+/* Import an fd unattached to any mediabuf */
+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
+ unsigned int plane,
+ int fd, size_t size);
+
+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
+ struct media_request **const pmreq,
+ struct qent_src **const psrc_be,
+ struct qent_dst *const dst_be,
+ const bool is_final);
+// Get / alloc a dst buffer & associate with a slot
+// If the dst pool is empty then behaviour depends on the fixed flag passed to
+// dst_slots_create. Default is !fixed = unlimited alloc
+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
+ struct dmabufs_ctl *const dbsc);
+// Create dst slots without alloc
+// If fixed true then qent_alloc will only get slots from this pool and will
+// block until a qent has been unrefed
+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
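+// Rough per-frame decode flow inferred from these declarations (illustrative
+// only - error handling omitted; mbc, dbufs, mp, bitstream, ctrls etc. are
+// placeholder names):
+//   struct media_request *mreq = media_request_get(mp);
+//   struct qent_dst *dst = mediabufs_dst_qent_alloc(mbc, dbufs);
+//   struct qent_src *src = mediabufs_src_qent_get(mbc);
+//   qent_src_data_copy(src, 0, bitstream, bitstream_len, dbufs);
+//   mediabufs_ctl_set_ext_ctrls(mbc, mreq, ctrls, n_ctrls);
+//   mediabufs_start_request(mbc, &mreq, &src, dst, true);
+//   qent_dst_wait(dst);
+//   qent_dst_read_start(dst); ... qent_dst_data(dst, 0) ...; qent_dst_read_stop(dst);
+//   qent_dst_unref(&dst);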
+
+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
+
+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
+
+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
+ const unsigned int width,
+ const unsigned int height,
+ mediabufs_dst_fmt_accept_fn *const accept_fn,
+ void *const accept_v);
+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
+
+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
+ struct v4l2_ext_control control_array[], unsigned int n);
+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
+ struct media_request * const mreq,
+ unsigned int id, void *data,
+ unsigned int size);
+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
+
+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
+
+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
+ enum v4l2_buf_type buf_type,
+ const uint32_t pixfmt,
+ const uint32_t width, const uint32_t height,
+ const size_t bufsize);
+
+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
+ struct dmabufs_ctl * const dbsc,
+ unsigned int n);
+
+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
+ const char *vpath, struct pollqueue *const pq);
+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
+
+
+#endif
diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c
new file mode 100644
index 0000000000..cc8a5d4001
--- /dev/null
+++ b/libavcodec/v4l2_req_pollqueue.c
@@ -0,0 +1,361 @@
+#include <errno.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_utils.h"
+
+
+struct pollqueue;
+
+enum polltask_state {
+ POLLTASK_UNQUEUED = 0,
+ POLLTASK_QUEUED,
+ POLLTASK_RUNNING,
+ POLLTASK_Q_KILL,
+ POLLTASK_RUN_KILL,
+};
+
+struct polltask {
+ struct polltask *next;
+ struct polltask *prev;
+ struct pollqueue *q;
+ enum polltask_state state;
+
+ int fd;
+ short events;
+
+ void (*fn)(void *v, short revents);
+ void * v;
+
+ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
+ sem_t kill_sem;
+};
+
+struct pollqueue {
+ atomic_int ref_count;
+ pthread_mutex_t lock;
+
+ struct polltask *head;
+ struct polltask *tail;
+
+ bool kill;
+ bool no_prod;
+ int prod_fd;
+ struct polltask *prod_pt;
+ pthread_t worker;
+};
+
+struct polltask *polltask_new(struct pollqueue *const pq,
+ const int fd, const short events,
+ void (*const fn)(void *v, short revents),
+ void *const v)
+{
+ struct polltask *pt;
+
+ if (!events)
+ return NULL;
+
+ pt = malloc(sizeof(*pt));
+ if (!pt)
+ return NULL;
+
+ *pt = (struct polltask){
+ .next = NULL,
+ .prev = NULL,
+ .q = pollqueue_ref(pq),
+ .fd = fd,
+ .events = events,
+ .fn = fn,
+ .v = v
+ };
+
+ sem_init(&pt->kill_sem, 0, 0);
+
+ return pt;
+}
+
+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
+{
+ if (pt->prev)
+ pt->prev->next = pt->next;
+ else
+ pq->head = pt->next;
+ if (pt->next)
+ pt->next->prev = pt->prev;
+ else
+ pq->tail = pt->prev;
+ pt->next = NULL;
+ pt->prev = NULL;
+}
+
+static void polltask_free(struct polltask * const pt)
+{
+ sem_destroy(&pt->kill_sem);
+ free(pt);
+}
+
+static int pollqueue_prod(const struct pollqueue *const pq)
+{
+ static const uint64_t one = 1;
+ return write(pq->prod_fd, &one, sizeof(one));
+}
+
+void polltask_delete(struct polltask **const ppt)
+{
+ struct polltask *const pt = *ppt;
+ struct pollqueue * pq;
+ enum polltask_state state;
+ bool prodme;
+
+ if (!pt)
+ return;
+
+ pq = pt->q;
+ pthread_mutex_lock(&pq->lock);
+ state = pt->state;
+ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
+ prodme = !pq->no_prod;
+ pthread_mutex_unlock(&pq->lock);
+
+ if (state != POLLTASK_UNQUEUED) {
+ if (prodme)
+ pollqueue_prod(pq);
+ while (sem_wait(&pt->kill_sem) && errno == EINTR)
+ /* loop */;
+ }
+
+ // Leave zapping the ref until we have DQed the PT as it might well be
+ // legitimately used in it
+ *ppt = NULL;
+ polltask_free(pt);
+ pollqueue_unref(&pq);
+}
+
+static uint64_t pollqueue_now(int timeout)
+{
+ struct timespec now;
+ uint64_t now_ms;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &now))
+ return 0;
+ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
+ return now_ms ? now_ms : (uint64_t)1;
+}
+
+void pollqueue_add_task(struct polltask *const pt, const int timeout)
+{
+ bool prodme = false;
+ struct pollqueue * const pq = pt->q;
+
+ pthread_mutex_lock(&pq->lock);
+ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
+ if (pq->tail)
+ pq->tail->next = pt;
+ else
+ pq->head = pt;
+ pt->prev = pq->tail;
+ pt->next = NULL;
+ pt->state = POLLTASK_QUEUED;
+ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
+ pq->tail = pt;
+ prodme = !pq->no_prod;
+ }
+ pthread_mutex_unlock(&pq->lock);
+ if (prodme)
+ pollqueue_prod(pq);
+}
+
+static void *poll_thread(void *v)
+{
+ struct pollqueue *const pq = v;
+ struct pollfd *a = NULL;
+ size_t asize = 0;
+
+ pthread_mutex_lock(&pq->lock);
+ do {
+ unsigned int i;
+ unsigned int n = 0;
+ struct polltask *pt;
+ struct polltask *pt_next;
+ uint64_t now = pollqueue_now(0);
+ int timeout = -1;
+ int rv;
+
+ for (pt = pq->head; pt; pt = pt_next) {
+ int64_t t;
+
+ pt_next = pt->next;
+
+ if (pt->state == POLLTASK_Q_KILL) {
+ pollqueue_rem_task(pq, pt);
+ sem_post(&pt->kill_sem);
+ continue;
+ }
+
+ if (n >= asize) {
+ asize = asize ? asize * 2 : 4;
+ a = realloc(a, asize * sizeof(*a));
+ if (!a) {
+ request_log("Failed to realloc poll array to %zd\n", asize);
+ goto fail_locked;
+ }
+ }
+
+ a[n++] = (struct pollfd){
+ .fd = pt->fd,
+ .events = pt->events
+ };
+
+ t = (int64_t)(pt->timeout - now);
+ if (pt->timeout && t < INT_MAX &&
+ (timeout < 0 || (int)t < timeout))
+ timeout = (t < 0) ? 0 : (int)t;
+ }
+ pthread_mutex_unlock(&pq->lock);
+
+ if ((rv = poll(a, n, timeout)) == -1) {
+ if (errno != EINTR) {
+ request_log("Poll error: %s\n", strerror(errno));
+ goto fail_unlocked;
+ }
+ }
+
+ pthread_mutex_lock(&pq->lock);
+ now = pollqueue_now(0);
+
+ /* Prodding in this loop is pointless and might lead to
+ * infinite looping
+ */
+ pq->no_prod = true;
+ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
+ pt_next = pt->next;
+
+ /* Pending? */
+ if (a[i].revents ||
+ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
+ pollqueue_rem_task(pq, pt);
+ if (pt->state == POLLTASK_QUEUED)
+ pt->state = POLLTASK_RUNNING;
+ if (pt->state == POLLTASK_Q_KILL)
+ pt->state = POLLTASK_RUN_KILL;
+ pthread_mutex_unlock(&pq->lock);
+
+ /* This can add new entries to the Q but as
+ * those are added to the tail our existing
+ * chain remains intact
+ */
+ pt->fn(pt->v, a[i].revents);
+
+ pthread_mutex_lock(&pq->lock);
+ if (pt->state == POLLTASK_RUNNING)
+ pt->state = POLLTASK_UNQUEUED;
+ if (pt->state == POLLTASK_RUN_KILL)
+ sem_post(&pt->kill_sem);
+ }
+ }
+ pq->no_prod = false;
+
+ } while (!pq->kill);
+
+fail_locked:
+ pthread_mutex_unlock(&pq->lock);
+fail_unlocked:
+ free(a);
+ return NULL;
+}
+
+static void prod_fn(void *v, short revents)
+{
+ struct pollqueue *const pq = v;
+ char buf[8];
+ if (revents)
+ read(pq->prod_fd, buf, 8);
+ if (!pq->kill)
+ pollqueue_add_task(pq->prod_pt, -1);
+}
+
+struct pollqueue * pollqueue_new(void)
+{
+ struct pollqueue *pq = malloc(sizeof(*pq));
+ if (!pq)
+ return NULL;
+ *pq = (struct pollqueue){
+ .ref_count = ATOMIC_VAR_INIT(0),
+ .lock = PTHREAD_MUTEX_INITIALIZER,
+ .head = NULL,
+ .tail = NULL,
+ .kill = false,
+ .prod_fd = -1
+ };
+
+ pq->prod_fd = eventfd(0, EFD_NONBLOCK);
+ if (pq->prod_fd == -1)
+ goto fail1;
+ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
+ if (!pq->prod_pt)
+ goto fail2;
+ pollqueue_add_task(pq->prod_pt, -1);
+ if (pthread_create(&pq->worker, NULL, poll_thread, pq))
+ goto fail3;
+ // Reset ref count which will have been incremented by the add_task
+ atomic_store(&pq->ref_count, 0);
+ return pq;
+
+fail3:
+ polltask_free(pq->prod_pt);
+fail2:
+ close(pq->prod_fd);
+fail1:
+ free(pq);
+ return NULL;
+}
+
+static void pollqueue_free(struct pollqueue *const pq)
+{
+ void *rv;
+
+ pthread_mutex_lock(&pq->lock);
+ pq->kill = true;
+ pollqueue_prod(pq);
+ pthread_mutex_unlock(&pq->lock);
+
+ pthread_join(pq->worker, &rv);
+ polltask_free(pq->prod_pt);
+ pthread_mutex_destroy(&pq->lock);
+ close(pq->prod_fd);
+ free(pq);
+}
+
+struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
+{
+ atomic_fetch_add(&pq->ref_count, 1);
+ return pq;
+}
+
+void pollqueue_unref(struct pollqueue **const ppq)
+{
+ struct pollqueue * const pq = *ppq;
+
+ if (!pq)
+ return;
+ *ppq = NULL;
+
+ if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
+ return;
+
+ pollqueue_free(pq);
+}
+
+
+
diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h
new file mode 100644
index 0000000000..e1182cb2fc
--- /dev/null
+++ b/libavcodec/v4l2_req_pollqueue.h
@@ -0,0 +1,18 @@
+#ifndef POLLQUEUE_H_
+#define POLLQUEUE_H_
+
+struct polltask;
+struct pollqueue;
+
+struct polltask *polltask_new(struct pollqueue *const pq,
+ const int fd, const short events,
+ void (*const fn)(void *v, short revents),
+ void *const v);
+void polltask_delete(struct polltask **const ppt);
+
+void pollqueue_add_task(struct polltask *const pt, const int timeout);
+struct pollqueue * pollqueue_new(void);
+void pollqueue_unref(struct pollqueue **const ppq);
+struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
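+// Usage sketch (based on the implementation in v4l2_req_pollqueue.c; fd,
+// my_cb and my_v are placeholders). Note that a polltask fires at most once
+// per pollqueue_add_task() call and must be re-added from its callback if it
+// is to keep watching the fd:
+//   struct pollqueue *pq = pollqueue_new();
+//   struct polltask *pt = polltask_new(pq, fd, POLLIN, my_cb, my_v);
+//   pollqueue_add_task(pt, 2000);  // timeout in ms, <0 for no timeout
+//   ...
+//   polltask_delete(&pt);
+//   pollqueue_unref(&pq);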
+
+#endif /* POLLQUEUE_H_ */
diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h
new file mode 100644
index 0000000000..a31cc1f4ec
--- /dev/null
+++ b/libavcodec/v4l2_req_utils.h
@@ -0,0 +1,27 @@
+#ifndef AVCODEC_V4L2_REQ_UTILS_H
+#define AVCODEC_V4L2_REQ_UTILS_H
+
+#include <stdint.h>
+#include "libavutil/log.h"
+
+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
+
+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
+#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
+
+static inline char safechar(char c) {
+ return c > 0x20 && c < 0x7f ? c : '.';
+}
+
+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
+ tbuf[0] = safechar((fcc >> 0) & 0xff);
+ tbuf[1] = safechar((fcc >> 8) & 0xff);
+ tbuf[2] = safechar((fcc >> 16) & 0xff);
+ tbuf[3] = safechar((fcc >> 24) & 0xff);
+ tbuf[4] = '\0';
+ return tbuf;
+}
+
+#endif
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
new file mode 100644
index 0000000000..b0a5930844
--- /dev/null
+++ b/libavcodec/v4l2_request_hevc.c
@@ -0,0 +1,297 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+
+#include "decode.h"
+#include "hevcdec.h"
+#include "hwconfig.h"
+#include "internal.h"
+
+#include "v4l2_request_hevc.h"
+
+#include "libavutil/hwcontext_drm.h"
+
+#include "v4l2_req_devscan.h"
+#include "v4l2_req_dmabufs.h"
+#include "v4l2_req_pollqueue.h"
+#include "v4l2_req_media.h"
+#include "v4l2_req_utils.h"
+
+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
+{
+ const size_t wxh = w * h;
+ size_t bits_alloc;
+
+ /* Annex A gives a min compression of 2 @ lvl 3.1
+ * (wxh <= 983040) and min 4 thereafter but avoid
+ * the oddity of 983041 having a lower limit than
+ * 983040.
+ * Multiply by 3/2 for 4:2:0
+ */
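+ /* Worked example (for illustration only): 1920x1080 8-bit gives
+ * wxh = 2073600, which falls in the third case below, so
+ * 2073600 * 3 / 8 + 0x4000 = 793984 bytes
+ */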
+ bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
+ wxh < 983040 * 2 ? 983040 * 3 / 4 :
+ wxh * 3 / 8;
+ /* Allow for bit depth */
+ bits_alloc += (bits_alloc * bits_minus8) / 8;
+ /* Add a few bytes (16k) for overhead */
+ bits_alloc += 0x4000;
+ return bits_alloc;
+}
+
+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
+ av_unused const uint8_t *buffer,
+ av_unused uint32_t size)
+{
+ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->start_frame(avctx, buffer, size);
+}
+
+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->decode_slice(avctx, buffer, size);
+}
+
+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
+{
+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->end_frame(avctx);
+}
+
+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ ctx->fns->abort_frame(avctx);
+}
+
+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->frame_params(avctx, hw_frames_ctx);
+}
+
+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ return ctx->fns->alloc_frame(avctx, frame);
+}
+
+
+static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
+{
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+
+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode
+
+ mediabufs_ctl_unref(&ctx->mbufs);
+ media_pool_delete(&ctx->mpool);
+ pollqueue_unref(&ctx->pq);
+ dmabufs_ctl_delete(&ctx->dbufs);
+ devscan_delete(&ctx->devscan);
+
+ decode_q_uninit(&ctx->decode_q);
+
+// if (avctx->hw_frames_ctx) {
+// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+// av_buffer_pool_flush(hwfc->pool);
+// }
+ return 0;
+}
+
+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
+{
+ AVCodecContext *const avctx = v;
+ const HEVCContext *const h = avctx->priv_data;
+
+ if (h->ps.sps->bit_depth == 8) {
+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
+ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) {
+ return 1;
+ }
+ }
+ else if (h->ps.sps->bit_depth == 10) {
+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int v4l2_request_hevc_init(AVCodecContext *avctx)
+{
+ const HEVCContext *h = avctx->priv_data;
+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ const HEVCSPS * const sps = h->ps.sps;
+ int ret;
+ const struct decdev * decdev;
+ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
+ size_t src_size;
+
+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
+ return (AVERROR(-ret));
+ }
+ ret = AVERROR(ENOMEM); // Assume mem fail by default for these
+
+ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
+ {
+ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
+ ret = AVERROR(ENODEV);
+ goto fail0;
+ }
+ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
+ decdev_media_path(decdev), decdev_video_path(decdev));
+
+ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
+ goto fail0;
+ }
+
+ if ((ctx->pq = pollqueue_new()) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
+ goto fail1;
+ }
+
+ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
+ goto fail2;
+ }
+
+ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
+ goto fail3;
+ }
+
+ // Ask for an initial bitbuf size of max size / 4
+ // We will realloc if we need more
+ // Must use sps->h/w as avctx contains cropped size
+ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
+ if (mediabufs_src_resizable(ctx->mbufs))
+ src_size /= 4;
+ // Kludge for conformance tests which break Annex A limits
+ else if (src_size < 0x40000)
+ src_size = 0x40000;
+
+ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
+ sps->width, sps->height, src_size)) {
+ char tbuf1[5];
+ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
+ goto fail4;
+ }
+
+ if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 2);
+ }
+ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 1);
+ }
+ else {
+ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
+ ret = AVERROR(EINVAL);
+ goto fail4;
+ }
+
+ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
+ char tbuf1[5];
+ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
+ goto fail4;
+ }
+
+ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
+ goto fail4;
+ }
+
+ {
+ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
+ avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
+ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
+ avctx->thread_count, avctx->extra_hw_frames);
+
+ // extra_hw_frames is -1 if unset
+ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
+ goto fail4;
+ }
+ }
+
+ if (mediabufs_stream_on(ctx->mbufs)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
+ goto fail4;
+ }
+
+ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
+ goto fail4;
+ }
+
+ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
+ goto fail5;
+ }
+
+ decode_q_init(&ctx->decode_q);
+
+ // Set our s/w format
+ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
+
+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
+ ctx->fns->name,
+ decdev_media_path(decdev), decdev_video_path(decdev));
+
+ return 0;
+
+fail5:
+ av_buffer_unref(&avctx->hw_frames_ctx);
+fail4:
+ mediabufs_ctl_unref(&ctx->mbufs);
+fail3:
+ media_pool_delete(&ctx->mpool);
+fail2:
+ pollqueue_unref(&ctx->pq);
+fail1:
+ dmabufs_ctl_delete(&ctx->dbufs);
+fail0:
+ devscan_delete(&ctx->devscan);
+ return ret;
+}
+
+const AVHWAccel ff_hevc_v4l2request_hwaccel = {
+ .name = "hevc_v4l2request",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .id = AV_CODEC_ID_HEVC,
+ .pix_fmt = AV_PIX_FMT_DRM_PRIME,
+ .alloc_frame = v4l2_req_hevc_alloc_frame,
+ .start_frame = v4l2_req_hevc_start_frame,
+ .decode_slice = v4l2_req_hevc_decode_slice,
+ .end_frame = v4l2_req_hevc_end_frame,
+ .abort_frame = v4l2_req_hevc_abort_frame,
+ .init = v4l2_request_hevc_init,
+ .uninit = v4l2_request_hevc_uninit,
+ .priv_data_size = sizeof(V4L2RequestContextHEVC),
+ .frame_params = v4l2_req_hevc_frame_params,
+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
+};
diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
new file mode 100644
index 0000000000..f14f594564
--- /dev/null
+++ b/libavcodec/v4l2_request_hevc.h
@@ -0,0 +1,102 @@
+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
+#define AVCODEC_V4L2_REQUEST_HEVC_H
+
+#include <stdint.h>
+#include <drm_fourcc.h>
+#include "v4l2_req_decode_q.h"
+
+#ifndef DRM_FORMAT_NV15
+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
+#endif
+
+#ifndef DRM_FORMAT_NV20
+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
+#endif
+
+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
+// in the future but until then...
+#ifndef DRM_FORMAT_P030
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
+#endif
+
+#include <linux/videodev2.h>
+#ifndef V4L2_CID_CODEC_BASE
+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
+#endif
+
+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
+// in videodev2.h and hopefully will be sometime in the future but until then...
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
+#endif
+
+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800
+#endif
+
+#define MAX_SLICES 128
+
+#define VCAT(name, version) name##_v##version
+#define V2(n,v) VCAT(n, v)
+#define V(n) V2(n, HEVC_CTRLS_VERSION)
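+// For illustration: these macros just paste tokens, so V2(ff_v4l2_req_hevc, 2)
+// expands to ff_v4l2_req_hevc_v2, and V(n) picks the variant for whatever
+// HEVC_CTRLS_VERSION is defined at the point of use.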
+
+#define S2(x) #x
+#define STR(x) S2(x)
+
+// 1 per decoder
+struct v4l2_req_decode_fns;
+
+typedef struct V4L2RequestContextHEVC {
+// V4L2RequestContext base;
+ const struct v4l2_req_decode_fns * fns;
+
+ unsigned int timestamp; // ?? maybe uint64_t
+
+ int multi_slice;
+ int decode_mode;
+ int start_code;
+ int max_slices;
+
+ req_decode_q decode_q;
+
+ struct devscan *devscan;
+ struct dmabufs_ctl *dbufs;
+ struct pollqueue *pq;
+ struct media_pool * mpool;
+ struct mediabufs_ctl *mbufs;
+} V4L2RequestContextHEVC;
+
+typedef struct v4l2_req_decode_fns {
+ int src_pix_fmt_v4l2;
+ const char * name;
+
+ // Init setup
+ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
+ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
+
+ // Passthrough of hwaccel fns
+ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
+ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
+ int (*end_frame)(AVCodecContext *avctx);
+ void (*abort_frame)(AVCodecContext *avctx);
+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
+ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
+} v4l2_req_decode_fns;
+
+
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
+
+#endif
From c99a0fe4d59212079de9bed222114abf95f7c989 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 27 Apr 2021 19:30:36 +0100
Subject: [PATCH 013/161] Add no_cvt_hw option to ffmpeg
---
fftools/ffmpeg.c | 6 ++++--
fftools/ffmpeg.h | 2 ++
fftools/ffmpeg_opt.c | 3 +++
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index 15e084f0b2..5dc2cd73c1 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -2005,6 +2005,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref
(ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
need_reinit = 1;
+ if (no_cvt_hw && fg->graph)
+ need_reinit = 0;
+
if (sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX)) {
if (!ifilter->displaymatrix || memcmp(sd->data, ifilter->displaymatrix, sizeof(int32_t) * 9))
need_reinit = 1;
@@ -2274,8 +2277,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_
decoded_frame->top_field_first = ist->top_field_first;
ist->frames_decoded++;
-
- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
+ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
if (err < 0)
goto fail;
diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index f1412f6446..8f478619b3 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -729,6 +729,8 @@ extern enum VideoSyncMethod video_sync_method;
extern float frame_drop_threshold;
extern int do_benchmark;
extern int do_benchmark_all;
+extern int no_cvt_hw;
+extern int do_deinterlace;
extern int do_hex_dump;
extern int do_pkt_dump;
extern int copy_ts;
diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
index 055275d813..761db36588 100644
--- a/fftools/ffmpeg_opt.c
+++ b/fftools/ffmpeg_opt.c
@@ -71,6 +71,7 @@ enum VideoSyncMethod video_sync_method = VSYNC_AUTO;
float frame_drop_threshold = 0;
int do_benchmark = 0;
int do_benchmark_all = 0;
+int no_cvt_hw = 0;
int do_hex_dump = 0;
int do_pkt_dump = 0;
int copy_ts = 0;
@@ -1427,6 +1428,8 @@ const OptionDef options[] = {
"add timings for benchmarking" },
{ "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all },
"add timings for each task" },
+ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw },
+ "do not auto-convert hw frames to sw" },
{ "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress },
"write program-readable progress information", "url" },
{ "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction },
From 27e0c78a2df53fb2337bee4c383cdb58cbbc717e Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 28 Apr 2021 10:16:39 +0100
Subject: [PATCH 014/161] Add vout_drm
---
configure | 4 +
libavdevice/Makefile | 1 +
libavdevice/alldevices.c | 1 +
libavdevice/drm_vout.c | 638 +++++++++++++++++++++++++++++++++++++++
4 files changed, 644 insertions(+)
create mode 100644 libavdevice/drm_vout.c
diff --git a/configure b/configure
index 199aa2b3d5..49744cab19 100755
--- a/configure
+++ b/configure
@@ -346,6 +346,7 @@ External library support:
--enable-libnpp enable Nvidia Performance Primitives-based code [no]
--enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
--enable-sand enable sand video formats [rpi]
+ --enable-vout-drm enable the vout_drm module - for internal testing only [no]
--disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
--disable-nvenc disable Nvidia video encoding code [autodetect]
--enable-omx enable OpenMAX IL code [no]
@@ -1940,6 +1941,7 @@ FEATURE_LIST="
small
static
swscale_alpha
+ vout_drm
"
# this list should be kept in linking order
@@ -3559,8 +3561,10 @@ sndio_indev_deps="sndio"
sndio_outdev_deps="sndio"
v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
v4l2_indev_suggest="libv4l2"
+v4l2_outdev_deps="libdrm"
v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
v4l2_outdev_suggest="libv4l2"
+vout_drm_outdev_deps="libdrm vout_drm"
vfwcap_indev_deps="vfw32 vfwcap_defines"
xcbgrab_indev_deps="libxcb"
xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
diff --git a/libavdevice/Makefile b/libavdevice/Makefile
index 8a62822b69..36aac30186 100644
--- a/libavdevice/Makefile
+++ b/libavdevice/Makefile
@@ -48,6 +48,7 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o
OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o
OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o
OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o
+OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o
OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o
OBJS-$(CONFIG_XV_OUTDEV) += xv.o
diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
index 8a90fcb5d7..e2a8669f27 100644
--- a/libavdevice/alldevices.c
+++ b/libavdevice/alldevices.c
@@ -52,6 +52,7 @@ extern const FFOutputFormat ff_sndio_muxer;
extern const AVInputFormat ff_v4l2_demuxer;
extern const FFOutputFormat ff_v4l2_muxer;
extern const AVInputFormat ff_vfwcap_demuxer;
+extern const FFOutputFormat ff_vout_drm_muxer;
extern const AVInputFormat ff_xcbgrab_demuxer;
extern const FFOutputFormat ff_xv_muxer;
diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
new file mode 100644
index 0000000000..cfb33ce7c3
--- /dev/null
+++ b/libavdevice/drm_vout.c
@@ -0,0 +1,638 @@
+/*
+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+// *** This module is a work in progress and its utility is strictly
+// limited to testing.
+
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavformat/mux.h"
+#include "avdevice.h"
+
+#include "pthread.h"
+#include <semaphore.h>
+#include <unistd.h>
+
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+
+#define TRACE_ALL 0
+
+#define DRM_MODULE "vc4"
+
+#define ERRSTR strerror(errno)
+
+struct drm_setup {
+ int conId;
+ uint32_t crtcId;
+ int crtcIdx;
+ uint32_t planeId;
+ unsigned int out_fourcc;
+ struct {
+ int x, y, width, height;
+ } compose;
+};
+
+typedef struct drm_aux_s {
+ unsigned int fb_handle;
+ uint32_t bo_handles[AV_DRM_MAX_PLANES];
+ AVFrame * frame;
+} drm_aux_t;
+
+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
+// we get initial flicker probably due to dodgy drm timing
+#define AUX_SIZE 3
+typedef struct drm_display_env_s
+{
+ AVClass *class;
+
+ int drm_fd;
+ uint32_t con_id;
+ struct drm_setup setup;
+ enum AVPixelFormat avfmt;
+ int show_all;
+
+ unsigned int ano;
+ drm_aux_t aux[AUX_SIZE];
+
+ pthread_t q_thread;
+ sem_t q_sem_in;
+ sem_t q_sem_out;
+ int q_terminate;
+ AVFrame * q_next;
+
+} drm_display_env_t;
+
+
+static int drm_vout_write_trailer(AVFormatContext *s)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+ return 0;
+}
+
+static int drm_vout_write_header(AVFormatContext *s)
+{
+ const AVCodecParameters * const par = s->streams[0]->codecpar;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+ if ( s->nb_streams > 1
+ || par->codec_type != AVMEDIA_TYPE_VIDEO
+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+static int find_plane(struct AVFormatContext * const avctx,
+ const int drmfd, const int crtcidx, const uint32_t format,
+ uint32_t * const pplane_id)
+{
+ drmModePlaneResPtr planes;
+ drmModePlanePtr plane;
+ unsigned int i;
+ unsigned int j;
+ int ret = 0;
+
+ planes = drmModeGetPlaneResources(drmfd);
+ if (!planes)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
+ return -1;
+ }
+
+ for (i = 0; i < planes->count_planes; ++i) {
+ plane = drmModeGetPlane(drmfd, planes->planes[i]);
+ if (!plane)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
+ break;
+ }
+
+ if (!(plane->possible_crtcs & (1 << crtcidx))) {
+ drmModeFreePlane(plane);
+ continue;
+ }
+
+ for (j = 0; j < plane->count_formats; ++j) {
+ if (plane->formats[j] == format)
+ break;
+ }
+
+ if (j == plane->count_formats) {
+ drmModeFreePlane(plane);
+ continue;
+ }
+
+ *pplane_id = plane->plane_id;
+ drmModeFreePlane(plane);
+ break;
+ }
+
+ if (i == planes->count_planes)
+ ret = -1;
+
+ drmModeFreePlaneResources(planes);
+ return ret;
+}
+
+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
+{
+ if (da->fb_handle != 0) {
+ drmModeRmFB(de->drm_fd, da->fb_handle);
+ da->fb_handle = 0;
+ }
+
+ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
+ if (da->bo_handles[i]) {
+ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
+ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+ da->bo_handles[i] = 0;
+ }
+ }
+ av_frame_free(&da->frame);
+}
+
+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
+{
+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
+ drm_aux_t * da = de->aux + de->ano;
+ const uint32_t format = desc->layers[0].format;
+ int ret = 0;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
+#endif
+
+ if (de->setup.out_fourcc != format) {
+ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
+ av_frame_free(&frame);
+ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
+ return -1;
+ }
+ de->setup.out_fourcc = format;
+ }
+
+ {
+ drmVBlank vbl = {
+ .request = {
+ .type = DRM_VBLANK_RELATIVE,
+ .sequence = 0
+ }
+ };
+
+ while (drmWaitVBlank(de->drm_fd, &vbl)) {
+ if (errno != EINTR) {
+// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
+ break;
+ }
+ }
+ }
+
+ da_uninit(de, da);
+
+ {
+ uint32_t pitches[4] = {0};
+ uint32_t offsets[4] = {0};
+ uint64_t modifiers[4] = {0};
+ uint32_t bo_handles[4] = {0};
+ int i, j, n;
+
+ da->frame = frame;
+
+ for (i = 0; i < desc->nb_objects; ++i) {
+ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
+ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
+ return -1;
+ }
+ }
+
+ n = 0;
+ for (i = 0; i < desc->nb_layers; ++i) {
+ for (j = 0; j < desc->layers[i].nb_planes; ++j) {
+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
+ pitches[n] = p->pitch;
+ offsets[n] = p->offset;
+ modifiers[n] = obj->format_modifier;
+ bo_handles[n] = da->bo_handles[p->object_index];
+ ++n;
+ }
+ }
+
+#if 1 && TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
+ av_frame_cropped_width(frame),
+ av_frame_cropped_height(frame),
+ desc->layers[0].format,
+ bo_handles[0],
+ bo_handles[1],
+ bo_handles[2],
+ bo_handles[3],
+ pitches[0],
+ pitches[1],
+ pitches[2],
+ pitches[3],
+ offsets[0],
+ offsets[1],
+ offsets[2],
+ offsets[3],
+ (long long)modifiers[0],
+ (long long)modifiers[1],
+ (long long)modifiers[2],
+ (long long)modifiers[3]
+ );
+#endif
+
+ if (drmModeAddFB2WithModifiers(de->drm_fd,
+ av_frame_cropped_width(frame),
+ av_frame_cropped_height(frame),
+ desc->layers[0].format, bo_handles,
+ pitches, offsets, modifiers,
+ &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
+ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
+ return -1;
+ }
+ }
+
+ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
+ da->fb_handle, 0,
+ de->setup.compose.x, de->setup.compose.y,
+ de->setup.compose.width,
+ de->setup.compose.height,
+ 0, 0,
+ av_frame_cropped_width(frame) << 16,
+ av_frame_cropped_height(frame) << 16);
+
+ if (ret != 0) {
+ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
+ }
+
+ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
+
+ return ret;
+}
+
+static int do_sem_wait(sem_t * const sem, const int nowait)
+{
+ while (nowait ? sem_trywait(sem) : sem_wait(sem)) {
+ if (errno != EINTR)
+ return -errno;
+ }
+ return 0;
+}
+
+static void * display_thread(void * v)
+{
+ AVFormatContext * const s = v;
+ drm_display_env_t * const de = s->priv_data;
+ int i;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+#endif
+
+ sem_post(&de->q_sem_out);
+
+ for (;;) {
+ AVFrame * frame;
+
+ do_sem_wait(&de->q_sem_in, 0);
+
+ if (de->q_terminate)
+ break;
+
+ frame = de->q_next;
+ de->q_next = NULL;
+ sem_post(&de->q_sem_out);
+
+ do_display(s, de, frame);
+ }
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+#endif
+
+ for (i = 0; i != AUX_SIZE; ++i)
+ da_uninit(de, de->aux + i);
+
+ av_frame_free(&de->q_next);
+
+ return NULL;
+}
+
+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+ const AVFrame * const src_frame = (AVFrame *)pkt->data;
+ AVFrame * frame;
+ drm_display_env_t * const de = s->priv_data;
+ int ret;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
+ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
+ return 0;
+ }
+
+ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+ frame = av_frame_alloc();
+ av_frame_ref(frame, src_frame);
+ }
+ else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+ frame = av_frame_alloc();
+ frame->format = AV_PIX_FMT_DRM_PRIME;
+ if (av_hwframe_map(frame, src_frame, 0) != 0)
+ {
+ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
+ av_frame_free(&frame);
+ return AVERROR(EINVAL);
+ }
+ }
+ else {
+ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
+ return AVERROR(EINVAL);
+ }
+
+ ret = do_sem_wait(&de->q_sem_out, !de->show_all);
+ if (ret) {
+ av_frame_free(&frame);
+ }
+ else {
+ de->q_next = frame;
+ sem_post(&de->q_sem_in);
+ }
+
+ return 0;
+}
+
+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+ unsigned flags)
+{
+ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+ return AVERROR_PATCHWELCOME;
+}
+
+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
+#endif
+ switch(type) {
+ case AV_APP_TO_DEV_WINDOW_REPAINT:
+ return 0;
+ default:
+ break;
+ }
+ return AVERROR(ENOSYS);
+}
+
+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
+{
+ int ret = -1;
+ int i;
+ drmModeRes *res = drmModeGetResources(drmfd);
+ drmModeConnector *c;
+
+ if (!res)
+ {
+ av_log(avctx, AV_LOG_ERROR, "drmModeGetResources failed: %s\n", ERRSTR);
+ return -1;
+ }
+
+ if (res->count_crtcs <= 0)
+ {
+ printf( "drm: no crts\n");
+ goto fail_res;
+ }
+
+ if (!s->conId) {
+ av_log(avctx, AV_LOG_INFO,
+ "No connector ID specified. Choosing default from list:\n");
+
+ for (i = 0; i < res->count_connectors; i++) {
+ drmModeConnector *con =
+ drmModeGetConnector(drmfd, res->connectors[i]);
+ drmModeEncoder *enc = NULL;
+ drmModeCrtc *crtc = NULL;
+
+ if (con->encoder_id) {
+ enc = drmModeGetEncoder(drmfd, con->encoder_id);
+ if (enc->crtc_id) {
+ crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
+ }
+ }
+
+ if (!s->conId && crtc) {
+ s->conId = con->connector_id;
+ s->crtcId = crtc->crtc_id;
+ }
+
+ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
+ con->connector_id,
+ crtc ? crtc->crtc_id : 0,
+ con->connector_type,
+ crtc ? crtc->width : 0,
+ crtc ? crtc->height : 0,
+ (s->conId == (int)con->connector_id ?
+ " (chosen)" : ""));
+ }
+
+ if (!s->conId) {
+ av_log(avctx, AV_LOG_ERROR,
+ "No suitable enabled connector found.\n");
+ goto fail_res;
+ }
+ }
+
+ s->crtcIdx = -1;
+
+ for (i = 0; i < res->count_crtcs; ++i) {
+ if (s->crtcId == res->crtcs[i]) {
+ s->crtcIdx = i;
+ break;
+ }
+ }
+
+ if (s->crtcIdx == -1)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
+ goto fail_res;
+ }
+
+ if (res->count_connectors <= 0)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
+ goto fail_res;
+ }
+
+ c = drmModeGetConnector(drmfd, s->conId);
+ if (!c)
+ {
+ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
+ goto fail_res;
+ }
+
+ if (!c->count_modes)
+ {
+ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
+ goto fail_conn;
+ }
+
+ {
+ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
+ s->compose.x = crtc->x;
+ s->compose.y = crtc->y;
+ s->compose.width = crtc->width;
+ s->compose.height = crtc->height;
+ drmModeFreeCrtc(crtc);
+ }
+
+ if (pConId)
+ *pConId = c->connector_id;
+ ret = 0;
+
+fail_conn:
+ drmModeFreeConnector(c);
+
+fail_res:
+ drmModeFreeResources(res);
+
+ return ret;
+}
+
+// deinit is called if init fails so no need to clean up explicitly here
+static int drm_vout_init(struct AVFormatContext * s)
+{
+ drm_display_env_t * const de = s->priv_data;
+ int rv;
+ const char * drm_module = DRM_MODULE;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->drm_fd = -1;
+ de->con_id = 0;
+ de->setup = (struct drm_setup){0};
+ de->q_terminate = 0;
+
+ if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
+ {
+ rv = AVERROR(errno);
+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
+ return rv;
+ }
+
+ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
+ {
+ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
+ rv = AVERROR(EINVAL);
+ goto fail_close;
+ }
+
+ sem_init(&de->q_sem_in, 0, 0);
+ sem_init(&de->q_sem_out, 0, 0);
+    if ((rv = pthread_create(&de->q_thread, NULL, display_thread, s)) != 0) {
+        rv = AVERROR(rv);
+        av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
+ goto fail_close;
+ }
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+ return 0;
+
+fail_close:
+ close(de->drm_fd);
+ de->drm_fd = -1;
+ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
+
+ return rv;
+}
+
+static void drm_vout_deinit(struct AVFormatContext * s)
+{
+ drm_display_env_t * const de = s->priv_data;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->q_terminate = 1;
+ sem_post(&de->q_sem_in);
+ pthread_join(de->q_thread, NULL);
+ sem_destroy(&de->q_sem_in);
+ sem_destroy(&de->q_sem_out);
+
+ for (unsigned int i = 0; i != AUX_SIZE; ++i)
+ da_uninit(de, de->aux + i);
+
+ av_frame_free(&de->q_next);
+
+ if (de->drm_fd >= 0) {
+ close(de->drm_fd);
+ de->drm_fd = -1;
+ }
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+}
+
+
+#define OFFSET(x) offsetof(drm_display_env_t, x)
+static const AVOption options[] = {
+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { NULL }
+};
+
+static const AVClass drm_vout_class = {
+ .class_name = "drm vid outdev",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+FFOutputFormat ff_vout_drm_muxer = {
+ .p = {
+ .name = "vout_drm",
+        .name           = "vout_drm",
+        .long_name      = NULL_IF_CONFIG_SMALL("DRM video output device"),
+ .audio_codec = AV_CODEC_ID_NONE,
+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+ .priv_class = &drm_vout_class,
+ },
+ .priv_data_size = sizeof(drm_display_env_t),
+ .write_header = drm_vout_write_header,
+ .write_packet = drm_vout_write_packet,
+ .write_uncoded_frame = drm_vout_write_frame,
+ .write_trailer = drm_vout_write_trailer,
+ .control_message = drm_vout_control_message,
+ .init = drm_vout_init,
+ .deinit = drm_vout_deinit,
+};
+
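Note on the VAAPI branch in drm_vout_write_packet() above: it is just the generic
libavutil hwframe mapping pattern - allocate a destination frame, set its format to
the desired target, then ask av_hwframe_map() for a zero-copy mapping. A minimal
stand-alone sketch of that step (the helper name and the AV_HWFRAME_MAP_READ flag
choice are illustrative, not taken from the patch):

    #include "libavutil/frame.h"
    #include "libavutil/hwcontext.h"

    // Map a VAAPI-backed frame to a DRM PRIME descriptor without copying.
    // Returns a new frame on success (caller frees it), NULL on failure.
    static AVFrame *map_vaapi_to_drm_prime(const AVFrame *src)
    {
        AVFrame *dst = av_frame_alloc();
        if (!dst)
            return NULL;
        dst->format = AV_PIX_FMT_DRM_PRIME;            // requested target format
        if (av_hwframe_map(dst, src, AV_HWFRAME_MAP_READ) < 0) {
            av_frame_free(&dst);
            return NULL;
        }
        return dst;                                    // data[0] is an AVDRMFrameDescriptor
    }
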
From cc536672adf4eefeaec16e9808f583c693ad7819 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 28 Apr 2021 11:34:18 +0100
Subject: [PATCH 015/161] Add vout_egl
---
configure | 6 +
libavdevice/Makefile | 1 +
libavdevice/alldevices.c | 1 +
libavdevice/egl_vout.c | 811 +++++++++++++++++++++++++++++++++++++++
4 files changed, 819 insertions(+)
create mode 100644 libavdevice/egl_vout.c
diff --git a/configure b/configure
index 49744cab19..b41663c794 100755
--- a/configure
+++ b/configure
@@ -347,6 +347,7 @@ External library support:
--enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
--enable-sand enable sand video formats [rpi]
--enable-vout-drm enable the vout_drm module - for internal testing only [no]
+ --enable-vout-egl enable the vout_egl module - for internal testing only [no]
--disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
--disable-nvenc disable Nvidia video encoding code [autodetect]
--enable-omx enable OpenMAX IL code [no]
@@ -1818,6 +1819,7 @@ EXTERNAL_LIBRARY_LIST="
libdav1d
libdc1394
libdrm
+ epoxy
libflite
libfontconfig
libfreetype
@@ -1942,6 +1944,7 @@ FEATURE_LIST="
static
swscale_alpha
vout_drm
+ vout_egl
"
# this list should be kept in linking order
@@ -3565,6 +3568,8 @@ v4l2_outdev_deps="libdrm"
v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
v4l2_outdev_suggest="libv4l2"
vout_drm_outdev_deps="libdrm vout_drm"
+vout_egl_outdev_deps="xlib"
+vout_egl_outdev_select="epoxy"
vfwcap_indev_deps="vfw32 vfwcap_defines"
xcbgrab_indev_deps="libxcb"
xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
@@ -6596,6 +6601,7 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d
enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open
enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new
enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion
+enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen ||
{ require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac &&
warn "using libfdk without pkg-config"; } }
diff --git a/libavdevice/Makefile b/libavdevice/Makefile
index 36aac30186..0989cb895f 100644
--- a/libavdevice/Makefile
+++ b/libavdevice/Makefile
@@ -49,6 +49,7 @@ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o
OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o
OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o
OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o
+OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o
OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o
OBJS-$(CONFIG_XV_OUTDEV) += xv.o
diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
index e2a8669f27..ffb410b92d 100644
--- a/libavdevice/alldevices.c
+++ b/libavdevice/alldevices.c
@@ -53,6 +53,7 @@ extern const AVInputFormat ff_v4l2_demuxer;
extern const FFOutputFormat ff_v4l2_muxer;
extern const AVInputFormat ff_vfwcap_demuxer;
extern const FFOutputFormat ff_vout_drm_muxer;
+extern const FFOutputFormat ff_vout_egl_muxer;
extern const AVInputFormat ff_xcbgrab_demuxer;
extern const FFOutputFormat ff_xv_muxer;
diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
new file mode 100644
index 0000000000..7b9c610ace
--- /dev/null
+++ b/libavdevice/egl_vout.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+// *** This module is a work in progress and its utility is strictly
+// limited to testing.
+// Amongst other issues it doesn't wait for the picture to be displayed before
+// returning the buffer, so flickering does occur.
+
+#include <epoxy/gl.h>
+#include <epoxy/egl.h>
+
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavformat/mux.h"
+#include "avdevice.h"
+
+#include "pthread.h"
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <unistd.h>
+
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+
+#include "libavutil/rpi_sand_fns.h"
+
+#define TRACE_ALL 0
+
+struct egl_setup {
+ int conId;
+
+ Display *dpy;
+ EGLDisplay egl_dpy;
+ EGLContext ctx;
+ EGLSurface surf;
+ Window win;
+
+ uint32_t crtcId;
+ int crtcIdx;
+ uint32_t planeId;
+ struct {
+ int x, y, width, height;
+ } compose;
+};
+
+typedef struct egl_aux_s {
+ int fd;
+ GLuint texture;
+
+} egl_aux_t;
+
+typedef struct egl_display_env_s
+{
+ AVClass *class;
+
+ struct egl_setup setup;
+ enum AVPixelFormat avfmt;
+
+ int show_all;
+ int window_width, window_height;
+ int window_x, window_y;
+ int fullscreen;
+
+ egl_aux_t aux[32];
+
+ pthread_t q_thread;
+ pthread_mutex_t q_lock;
+ sem_t display_start_sem;
+ sem_t q_sem;
+ int q_terminate;
+ AVFrame * q_this;
+ AVFrame * q_next;
+
+} egl_display_env_t;
+
+
+/**
+ * Remove window border/decorations.
+ */
+static void
+no_border( Display *dpy, Window w)
+{
+ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
+ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
+
+ typedef struct
+ {
+ unsigned long flags;
+ unsigned long functions;
+ unsigned long decorations;
+ long inputMode;
+ unsigned long status;
+ } PropMotifWmHints;
+
+ PropMotifWmHints motif_hints;
+ Atom prop, proptype;
+ unsigned long flags = 0;
+
+ /* setup the property */
+ motif_hints.flags = MWM_HINTS_DECORATIONS;
+ motif_hints.decorations = flags;
+
+ /* get the atom for the property */
+ prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
+ if (!prop) {
+ /* something went wrong! */
+ return;
+ }
+
+ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
+ proptype = prop;
+
+ XChangeProperty( dpy, w, /* display, window */
+ prop, proptype, /* property, type */
+ 32, /* format: 32-bit datums */
+ PropModeReplace, /* mode */
+ (unsigned char *) &motif_hints, /* data */
+ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */
+ );
+}
+
+
+/*
+ * Create an RGB, double-buffered window.
+ * Return the window and context handles.
+ */
+static int
+make_window(struct AVFormatContext * const s,
+ egl_display_env_t * const de,
+ Display *dpy, EGLDisplay egl_dpy, const char *name,
+ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
+{
+ int scrnum = DefaultScreen( dpy );
+ XSetWindowAttributes attr;
+ unsigned long mask;
+ Window root = RootWindow( dpy, scrnum );
+ Window win;
+ EGLContext ctx;
+ const int fullscreen = de->fullscreen;
+ EGLConfig config;
+ int x = de->window_x;
+ int y = de->window_y;
+ int width = de->window_width ? de->window_width : 1280;
+ int height = de->window_height ? de->window_height : 720;
+
+
+ if (fullscreen) {
+ int scrnum = DefaultScreen(dpy);
+
+ x = 0; y = 0;
+ width = DisplayWidth(dpy, scrnum);
+ height = DisplayHeight(dpy, scrnum);
+ }
+
+ {
+ EGLint num_configs;
+ static const EGLint attribs[] = {
+ EGL_RED_SIZE, 1,
+ EGL_GREEN_SIZE, 1,
+ EGL_BLUE_SIZE, 1,
+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
+ EGL_NONE
+ };
+
+ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
+ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
+ return -1;
+ }
+ }
+
+ {
+ EGLint vid;
+ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
+ return -1;
+ }
+
+ {
+ XVisualInfo visTemplate = {
+ .visualid = vid,
+ };
+ int num_visuals;
+ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
+ &visTemplate, &num_visuals);
+
+ /* window attributes */
+ attr.background_pixel = 0;
+ attr.border_pixel = 0;
+ attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
+ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
+ /* XXX this is a bad way to get a borderless window! */
+ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
+
+ win = XCreateWindow( dpy, root, x, y, width, height,
+ 0, visinfo->depth, InputOutput,
+ visinfo->visual, mask, &attr );
+ XFree(visinfo);
+ }
+ }
+
+ if (fullscreen)
+ no_border(dpy, win);
+
+ /* set hints and properties */
+ {
+ XSizeHints sizehints;
+ sizehints.x = x;
+ sizehints.y = y;
+ sizehints.width = width;
+ sizehints.height = height;
+ sizehints.flags = USSize | USPosition;
+ XSetNormalHints(dpy, win, &sizehints);
+ XSetStandardProperties(dpy, win, name, name,
+ None, (char **)NULL, 0, &sizehints);
+ }
+
+ eglBindAPI(EGL_OPENGL_ES_API);
+
+ {
+ static const EGLint ctx_attribs[] = {
+ EGL_CONTEXT_CLIENT_VERSION, 2,
+ EGL_NONE
+ };
+ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
+ if (!ctx) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
+ return -1;
+ }
+ }
+
+
+ XMapWindow(dpy, win);
+
+ {
+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
+ if (!surf) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
+ return -1;
+ }
+
+ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
+            av_log(s, AV_LOG_ERROR, "Error: eglMakeCurrent failed\n");
+ return -1;
+ }
+
+ *winRet = win;
+ *ctxRet = ctx;
+ *surfRet = surf;
+ }
+
+ return 0;
+}
+
+static GLint
+compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
+{
+ GLuint s = glCreateShader(target);
+
+ if (s == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
+ return 0;
+ }
+
+ glShaderSource(s, 1, (const GLchar **) &source, NULL);
+ glCompileShader(s);
+
+ {
+ GLint ok;
+ glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
+
+ if (!ok) {
+ GLchar *info;
+ GLint size;
+
+ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
+ info = malloc(size);
+
+ glGetShaderInfoLog(s, size, NULL, info);
+ av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
+
+ return 0;
+ }
+ }
+
+ return s;
+}
+
+static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
+{
+ GLuint prog = glCreateProgram();
+
+ if (prog == 0) {
+ av_log(s, AV_LOG_ERROR, "Failed to create program\n");
+ return 0;
+ }
+
+ glAttachShader(prog, vs);
+ glAttachShader(prog, fs);
+ glLinkProgram(prog);
+
+ {
+ GLint ok;
+ glGetProgramiv(prog, GL_LINK_STATUS, &ok);
+ if (!ok) {
+ /* Some drivers return a size of 1 for an empty log. This is the size
+ * of a log that contains only a terminating NUL character.
+ */
+ GLint size;
+ GLchar *info = NULL;
+ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
+ if (size > 1) {
+ info = malloc(size);
+ glGetProgramInfoLog(prog, size, NULL, info);
+ }
+
+ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
+ (info != NULL) ? info : "<empty log>");
+ return 0;
+ }
+ }
+
+ return prog;
+}
+
+static int
+gl_setup(struct AVFormatContext * const s)
+{
+ const char *vs =
+ "attribute vec4 pos;\n"
+ "varying vec2 texcoord;\n"
+ "\n"
+ "void main() {\n"
+ " gl_Position = pos;\n"
+ " texcoord.x = (pos.x + 1.0) / 2.0;\n"
+ " texcoord.y = (-pos.y + 1.0) / 2.0;\n"
+ "}\n";
+ const char *fs =
+ "#extension GL_OES_EGL_image_external : enable\n"
+ "precision mediump float;\n"
+ "uniform samplerExternalOES s;\n"
+ "varying vec2 texcoord;\n"
+ "void main() {\n"
+ " gl_FragColor = texture2D(s, texcoord);\n"
+ "}\n";
+
+ GLuint vs_s;
+ GLuint fs_s;
+ GLuint prog;
+
+ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
+ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
+ !(prog = link_program(s, vs_s, fs_s)))
+ return -1;
+
+ glUseProgram(prog);
+
+ {
+ static const float verts[] = {
+ -1, -1,
+ 1, -1,
+ 1, 1,
+ -1, 1,
+ };
+ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
+ }
+
+ glEnableVertexAttribArray(0);
+ return 0;
+}
+
+static int egl_vout_write_trailer(AVFormatContext *s)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+
+ return 0;
+}
+
+static int egl_vout_write_header(AVFormatContext *s)
+{
+ const AVCodecParameters * const par = s->streams[0]->codecpar;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+ if ( s->nb_streams > 1
+ || par->codec_type != AVMEDIA_TYPE_VIDEO
+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+
+static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
+{
+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
+ egl_aux_t * da = NULL;
+ unsigned int i;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
+#endif
+
+ for (i = 0; i != 32; ++i) {
+ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
+ da = de->aux + i;
+ break;
+ }
+ }
+
+ if (da == NULL) {
+ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
+ return AVERROR(EINVAL);
+ }
+
+ if (da->texture == 0) {
+ EGLint attribs[50];
+ EGLint * a = attribs;
+ int i, j;
+ static const EGLint anames[] = {
+ EGL_DMA_BUF_PLANE0_FD_EXT,
+ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE0_PITCH_EXT,
+ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE1_FD_EXT,
+ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE1_PITCH_EXT,
+ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE2_FD_EXT,
+ EGL_DMA_BUF_PLANE2_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE2_PITCH_EXT,
+ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
+ };
+ const EGLint * b = anames;
+
+ *a++ = EGL_WIDTH;
+ *a++ = av_frame_cropped_width(frame);
+ *a++ = EGL_HEIGHT;
+ *a++ = av_frame_cropped_height(frame);
+ *a++ = EGL_LINUX_DRM_FOURCC_EXT;
+ *a++ = desc->layers[0].format;
+
+ for (i = 0; i < desc->nb_layers; ++i) {
+ for (j = 0; j < desc->layers[i].nb_planes; ++j) {
+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
+ *a++ = *b++;
+ *a++ = obj->fd;
+ *a++ = *b++;
+ *a++ = p->offset;
+ *a++ = *b++;
+ *a++ = p->pitch;
+ if (obj->format_modifier == 0) {
+ b += 2;
+ }
+ else {
+ *a++ = *b++;
+ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
+ *a++ = *b++;
+ *a++ = (EGLint)(obj->format_modifier >> 32);
+ }
+ }
+ }
+
+ *a = EGL_NONE;
+
+#if TRACE_ALL
+ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
+ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
+ }
+#endif
+ {
+ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
+ EGL_NO_CONTEXT,
+ EGL_LINUX_DMA_BUF_EXT,
+ NULL, attribs);
+ if (!image) {
+ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
+ return -1;
+ }
+
+ glGenTextures(1, &da->texture);
+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
+
+ eglDestroyImageKHR(de->setup.egl_dpy, image);
+ }
+
+ da->fd = desc->objects[0].fd;
+
+#if 0
+ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
+ av_frame_cropped_width(frame),
+ av_frame_cropped_height(frame),
+ desc->layers[0].format,
+ bo_plane_handles[0],
+ bo_plane_handles[1],
+ bo_plane_handles[2],
+ bo_plane_handles[3],
+ pitches[0],
+ pitches[1],
+ pitches[2],
+ pitches[3],
+ offsets[0],
+ offsets[1],
+ offsets[2],
+ offsets[3],
+ (long long)modifiers[0],
+ (long long)modifiers[1],
+ (long long)modifiers[2],
+ (long long)modifiers[3]
+ );
+#endif
+ }
+
+ glClearColor(0.5, 0.5, 0.5, 0.5);
+ glClear(GL_COLOR_BUFFER_BIT);
+
+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
+
+ glDeleteTextures(1, &da->texture);
+ da->texture = 0;
+ da->fd = -1;
+
+ return 0;
+}
+
+static void * display_thread(void * v)
+{
+ AVFormatContext * const s = v;
+ egl_display_env_t * const de = s->priv_data;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
+#endif
+ {
+ EGLint egl_major, egl_minor;
+
+ de->setup.dpy = XOpenDisplay(NULL);
+ if (!de->setup.dpy) {
+ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
+ goto fail;
+ }
+
+ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
+ if (!de->setup.egl_dpy) {
+ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
+ goto fail;
+ }
+
+ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
+ goto fail;
+ }
+
+ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
+
+ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
+ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
+ goto fail;
+ }
+ }
+
+ if (!de->window_width || !de->window_height) {
+ de->window_width = 1280;
+ de->window_height = 720;
+ }
+ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
+ &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
+ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
+ goto fail;
+ }
+
+ if (gl_setup(s)) {
+ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
+ goto fail;
+ }
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
+#endif
+ sem_post(&de->display_start_sem);
+
+ for (;;) {
+ AVFrame * frame;
+
+ while (sem_wait(&de->q_sem) != 0) {
+ av_assert0(errno == EINTR);
+ }
+
+ if (de->q_terminate)
+ break;
+
+ pthread_mutex_lock(&de->q_lock);
+ frame = de->q_next;
+ de->q_next = NULL;
+ pthread_mutex_unlock(&de->q_lock);
+
+ do_display(s, de, frame);
+
+ av_frame_free(&de->q_this);
+ de->q_this = frame;
+ }
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
+#endif
+
+ return NULL;
+
+fail:
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
+#endif
+ de->q_terminate = 1;
+ sem_post(&de->display_start_sem);
+
+ return NULL;
+}
+
+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+ const AVFrame * const src_frame = (AVFrame *)pkt->data;
+ AVFrame * frame;
+ egl_display_env_t * const de = s->priv_data;
+
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+
+ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+ frame = av_frame_alloc();
+ av_frame_ref(frame, src_frame);
+ }
+ else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+ frame = av_frame_alloc();
+ frame->format = AV_PIX_FMT_DRM_PRIME;
+ if (av_hwframe_map(frame, src_frame, 0) != 0)
+ {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRIME\n", src_frame->format);
+ av_frame_free(&frame);
+ return AVERROR(EINVAL);
+ }
+ }
+ else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRIME\n", src_frame->format);
+ return AVERROR(EINVAL);
+ }
+
+ // Really hacky sync
+ while (de->show_all && de->q_next) {
+ usleep(3000);
+ }
+
+ pthread_mutex_lock(&de->q_lock);
+ {
+ AVFrame * const t = de->q_next;
+ de->q_next = frame;
+ frame = t;
+ }
+ pthread_mutex_unlock(&de->q_lock);
+
+ if (frame == NULL)
+ sem_post(&de->q_sem);
+ else
+ av_frame_free(&frame);
+
+ return 0;
+}
+
+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+ unsigned flags)
+{
+ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+ return AVERROR_PATCHWELCOME;
+}
+
+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
+#endif
+ switch(type) {
+ case AV_APP_TO_DEV_WINDOW_REPAINT:
+ return 0;
+ default:
+ break;
+ }
+ return AVERROR(ENOSYS);
+}
+
+// deinit is called if init fails so no need to clean up explicitly here
+static int egl_vout_init(struct AVFormatContext * s)
+{
+ egl_display_env_t * const de = s->priv_data;
+ unsigned int i;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->setup = (struct egl_setup){0};
+
+ for (i = 0; i != 32; ++i) {
+ de->aux[i].fd = -1;
+ }
+
+ de->q_terminate = 0;
+ pthread_mutex_init(&de->q_lock, NULL);
+ sem_init(&de->q_sem, 0, 0);
+ sem_init(&de->display_start_sem, 0, 0);
+ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
+
+ sem_wait(&de->display_start_sem);
+ if (de->q_terminate) {
+ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
+ return -1;
+ }
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+ return 0;
+}
+
+static void egl_vout_deinit(struct AVFormatContext * s)
+{
+ egl_display_env_t * const de = s->priv_data;
+
+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+ de->q_terminate = 1;
+ sem_post(&de->q_sem);
+ pthread_join(de->q_thread, NULL);
+ sem_destroy(&de->q_sem);
+ pthread_mutex_destroy(&de->q_lock);
+
+ av_frame_free(&de->q_next);
+ av_frame_free(&de->q_this);
+
+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+}
+
+#define OFFSET(x) offsetof(egl_display_env_t, x)
+static const AVOption options[] = {
+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { NULL }
+
+};
+
+static const AVClass egl_vout_class = {
+ .class_name = "egl vid outdev",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+FFOutputFormat ff_vout_egl_muxer = {
+ .p = {
+ .name = "vout_egl",
+        .name           = "vout_egl",
+        .long_name      = NULL_IF_CONFIG_SMALL("EGL video output device"),
+ .audio_codec = AV_CODEC_ID_NONE,
+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+ .priv_class = &egl_vout_class,
+ },
+ .priv_data_size = sizeof(egl_display_env_t),
+ .write_header = egl_vout_write_header,
+ .write_packet = egl_vout_write_packet,
+ .write_uncoded_frame = egl_vout_write_frame,
+ .write_trailer = egl_vout_write_trailer,
+ .control_message = egl_vout_control_message,
+ .init = egl_vout_init,
+ .deinit = egl_vout_deinit,
+};
+
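Note on do_display() above: the attribute-building loop handles the general
multi-plane case, but the underlying EGL_EXT_image_dma_buf_import usage is easier to
see in a condensed single-plane form. A rough sketch (helper name and parameters are
illustrative; the fd/offset/pitch/fourcc values would come from an
AVDRMFrameDescriptor as in the code above):

    #include <epoxy/egl.h>
    #include <epoxy/gl.h>
    #include <stdint.h>

    // Import one dmabuf plane as a GL external texture via an EGLImage.
    static GLuint import_dmabuf_texture(EGLDisplay dpy, int fd,
                                        int width, int height,
                                        int offset, int pitch, uint32_t fourcc)
    {
        const EGLint attribs[] = {
            EGL_WIDTH,                     width,
            EGL_HEIGHT,                    height,
            EGL_LINUX_DRM_FOURCC_EXT,      (EGLint)fourcc,
            EGL_DMA_BUF_PLANE0_FD_EXT,     fd,
            EGL_DMA_BUF_PLANE0_OFFSET_EXT, offset,
            EGL_DMA_BUF_PLANE0_PITCH_EXT,  pitch,
            EGL_NONE
        };
        GLuint tex = 0;
        EGLImage image = eglCreateImageKHR(dpy, EGL_NO_CONTEXT,
                                           EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
        if (!image)
            return 0;
        glGenTextures(1, &tex);
        glBindTexture(GL_TEXTURE_EXTERNAL_OES, tex);
        glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
        glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
        eglDestroyImageKHR(dpy, image);   // the texture keeps the import alive
        return tex;
    }
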
From 867bd7c243e66a1c1756878e20df8f35db8025ec Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 28 Apr 2021 12:51:22 +0100
Subject: [PATCH 016/161] V4L2 stateful rework
---
libavcodec/Makefile | 3 +-
libavcodec/v4l2_buffers.c | 556 +++++++++++++++++++++++++++-----------
libavcodec/v4l2_buffers.h | 28 +-
libavcodec/v4l2_context.c | 536 +++++++++++++++++++++++++++---------
libavcodec/v4l2_context.h | 20 +-
libavcodec/v4l2_m2m.c | 20 +-
libavcodec/v4l2_m2m.h | 31 +++
libavcodec/v4l2_m2m_dec.c | 446 ++++++++++++++++++++++++++----
8 files changed, 1286 insertions(+), 354 deletions(-)
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 2d440b5648..e1aa0ba014 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -169,7 +169,8 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
+OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
+ weak_link.o
OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
v4l2_req_devscan.o weak_link.o
OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 3f5471067a..a003934ca1 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -21,6 +21,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <drm_fourcc.h>
#include <linux/videodev2.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
@@ -29,12 +30,14 @@
#include <poll.h>
#include "libavcodec/avcodec.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext.h"
#include "v4l2_context.h"
#include "v4l2_buffers.h"
#include "v4l2_m2m.h"
+#include "weak_link.h"
#define USEC_PER_SEC 1000000
-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
{
@@ -51,34 +54,44 @@ static inline AVCodecContext *logger(V4L2Buffer *buf)
static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
{
V4L2m2mContext *s = buf_to_m2mctx(avbuf);
-
- if (s->avctx->pkt_timebase.num)
- return s->avctx->pkt_timebase;
- return s->avctx->time_base;
+ const AVRational tb = s->avctx->pkt_timebase.num ?
+ s->avctx->pkt_timebase :
+ s->avctx->time_base;
+ return tb.num && tb.den ? tb : v4l2_timebase;
}
-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale)
{
- int64_t v4l2_pts;
-
- if (pts == AV_NOPTS_VALUE)
- pts = 0;
-
/* convert pts to v4l2 timebase */
- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+ const int64_t v4l2_pts =
+ no_rescale ? pts :
+ pts == AV_NOPTS_VALUE ? 0 :
+ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
}
-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
+static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale)
{
- int64_t v4l2_pts;
-
/* convert pts back to encoder timebase */
- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
+ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
avbuf->buf.timestamp.tv_usec;
- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
+ return
+ no_rescale ? v4l2_pts :
+ v4l2_pts == 0 ? AV_NOPTS_VALUE :
+ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
+}
+
+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
+{
+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+ out->planes[plane].bytesused = bytesused;
+ out->planes[plane].length = length;
+ } else {
+ out->buf.bytesused = bytesused;
+ out->buf.length = length;
+ }
}
static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
@@ -209,68 +222,143 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
return AVCOL_TRC_UNSPECIFIED;
}
-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
{
- V4L2Buffer* avbuf = opaque;
- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
+ AVDRMLayerDescriptor *layer;
- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
+ /* fill the DRM frame descriptor */
+ drm_desc->nb_objects = avbuf->num_planes;
+ drm_desc->nb_layers = 1;
- if (s->reinit) {
- if (!atomic_load(&s->refcount))
- sem_post(&s->refsync);
- } else {
- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
- /* no need to queue more buffers to the driver */
- avbuf->status = V4L2BUF_AVAILABLE;
- }
- else if (avbuf->context->streamon)
- ff_v4l2_buffer_enqueue(avbuf);
- }
+ layer = &drm_desc->layers[0];
+ layer->nb_planes = avbuf->num_planes;
+
+ for (int i = 0; i < avbuf->num_planes; i++) {
+ layer->planes[i].object_index = i;
+ layer->planes[i].offset = 0;
+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
+ }
+
+ switch (avbuf->context->av_pix_fmt) {
+ case AV_PIX_FMT_YUYV422:
+
+ layer->format = DRM_FORMAT_YUYV;
+ layer->nb_planes = 1;
+
+ break;
+
+ case AV_PIX_FMT_NV12:
+ case AV_PIX_FMT_NV21:
+
+ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
+ avbuf->context->format.fmt.pix.height;
+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
+ break;
+
+ case AV_PIX_FMT_YUV420P:
+
+ layer->format = DRM_FORMAT_YUV420;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 3;
+
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
+ avbuf->context->format.fmt.pix.height;
+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
+
+ layer->planes[2].object_index = 0;
+ layer->planes[2].offset = layer->planes[1].offset +
+ ((avbuf->plane_info[0].bytesperline *
+ avbuf->context->format.fmt.pix.height) >> 2);
+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
+ break;
- av_buffer_unref(&avbuf->context_ref);
+ default:
+ drm_desc->nb_layers = 0;
+ break;
}
+
+ return (uint8_t *) drm_desc;
}
-static int v4l2_buf_increase_ref(V4L2Buffer *in)
+static void v4l2_free_bufref(void *opaque, uint8_t *data)
{
- V4L2m2mContext *s = buf_to_m2mctx(in);
+ AVBufferRef * bufref = (AVBufferRef *)data;
+ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
+ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
- if (in->context_ref)
- atomic_fetch_add(&in->context_refcount, 1);
- else {
- in->context_ref = av_buffer_ref(s->self_ref);
- if (!in->context_ref)
- return AVERROR(ENOMEM);
+ if (ctx != NULL) {
+ // Buffer still attached to context
+ V4L2m2mContext *s = buf_to_m2mctx(avbuf);
- in->context_refcount = 1;
- }
+ ff_mutex_lock(&ctx->lock);
- in->status = V4L2BUF_RET_USER;
- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
+ avbuf->status = V4L2BUF_AVAILABLE;
- return 0;
+ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
+ /* no need to queue more buffers to the driver */
+ }
+ else if (ctx->streamon) {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
+ avbuf->buf.timestamp.tv_sec = 0;
+ avbuf->buf.timestamp.tv_usec = 0;
+ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER
+ }
+ else {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
+ }
+
+ ff_mutex_unlock(&ctx->lock);
+ }
+
+ ff_weak_link_unlock(avbuf->context_wl);
+ av_buffer_unref(&bufref);
}
-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
{
- int ret;
+ struct v4l2_exportbuffer expbuf;
+ int i, ret;
- if (plane >= in->num_planes)
- return AVERROR(EINVAL);
+ for (i = 0; i < avbuf->num_planes; i++) {
+ memset(&expbuf, 0, sizeof(expbuf));
- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
- in->plane_info[plane].length, v4l2_free_buffer, in, 0);
- if (!*buf)
- return AVERROR(ENOMEM);
+ expbuf.index = avbuf->buf.index;
+ expbuf.type = avbuf->buf.type;
+ expbuf.plane = i;
- ret = v4l2_buf_increase_ref(in);
- if (ret)
- av_buffer_unref(buf);
+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
+ if (ret < 0)
+ return AVERROR(errno);
- return ret;
+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
+ /* drm frame */
+ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
+ avbuf->drm_frame.objects[i].fd = expbuf.fd;
+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ } else {
+ /* drm frame */
+ avbuf->drm_frame.objects[0].size = avbuf->buf.length;
+ avbuf->drm_frame.objects[0].fd = expbuf.fd;
+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ }
+ }
+
+ return 0;
}
static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
@@ -285,30 +373,50 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
- out->planes[plane].bytesused = bytesused;
- out->planes[plane].length = length;
- } else {
- out->buf.bytesused = bytesused;
- out->buf.length = length;
- }
+ set_buf_length(out, plane, bytesused, length);
return 0;
}
+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
+{
+ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
+ AVBufferRef * newbuf;
+
+ if (!bufref)
+ return NULL;
+
+ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
+ if (newbuf == NULL)
+ av_buffer_unref(&bufref);
+
+ avbuf->status = V4L2BUF_RET_USER;
+ return newbuf;
+}
+
static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
{
- int i, ret;
+ int i;
frame->format = avbuf->context->av_pix_fmt;
- for (i = 0; i < avbuf->num_planes; i++) {
- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
- if (ret)
- return ret;
+ frame->buf[0] = wrap_avbuf(avbuf);
+ if (frame->buf[0] == NULL)
+ return AVERROR(ENOMEM);
+
+ if (buf_to_m2mctx(avbuf)->output_drm) {
+ /* 1. get references to the actual data */
+ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
+ frame->format = AV_PIX_FMT_DRM_PRIME;
+ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
+ return 0;
+ }
+
+ /* 1. get references to the actual data */
+ for (i = 0; i < avbuf->num_planes; i++) {
+ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
frame->linesize[i] = avbuf->plane_info[i].bytesperline;
- frame->data[i] = frame->buf[i]->data;
}
/* fixup special cases */
@@ -337,68 +445,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
return 0;
}
+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
+{
+ if (dst_stride == src_stride && w + 32 >= dst_stride) {
+ memcpy(dst, src, dst_stride * h);
+ }
+ else {
+ while (--h >= 0) {
+ memcpy(dst, src, w);
+ dst += dst_stride;
+ src += src_stride;
+ }
+ }
+}
+
+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
+{
+ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
+}
+
static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
{
- int i, ret;
- struct v4l2_format fmt = out->context->format;
- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
- fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
- int is_planar_format = 0;
-
- switch (pixel_format) {
- case V4L2_PIX_FMT_YUV420M:
- case V4L2_PIX_FMT_YVU420M:
-#ifdef V4L2_PIX_FMT_YUV422M
- case V4L2_PIX_FMT_YUV422M:
-#endif
-#ifdef V4L2_PIX_FMT_YVU422M
- case V4L2_PIX_FMT_YVU422M:
-#endif
-#ifdef V4L2_PIX_FMT_YUV444M
- case V4L2_PIX_FMT_YUV444M:
-#endif
-#ifdef V4L2_PIX_FMT_YVU444M
- case V4L2_PIX_FMT_YVU444M:
-#endif
- case V4L2_PIX_FMT_NV12M:
- case V4L2_PIX_FMT_NV21M:
- case V4L2_PIX_FMT_NV12MT_16X16:
- case V4L2_PIX_FMT_NV12MT:
- case V4L2_PIX_FMT_NV16M:
- case V4L2_PIX_FMT_NV61M:
- is_planar_format = 1;
- }
-
- if (!is_planar_format) {
- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
- int planes_nb = 0;
- int offset = 0;
-
- for (i = 0; i < desc->nb_components; i++)
- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
-
- for (i = 0; i < planes_nb; i++) {
- int size, h = height;
- if (i == 1 || i == 2) {
+ int i;
+ int num_planes = 0;
+ int pel_strides[4] = {0};
+
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+
+ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
+ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
+ return -1;
+ }
+
+ for (i = 0; i != desc->nb_components; ++i) {
+ if (desc->comp[i].plane >= num_planes)
+ num_planes = desc->comp[i].plane + 1;
+ pel_strides[desc->comp[i].plane] = desc->comp[i].step;
+ }
+
+ if (out->num_planes > 1) {
+ if (num_planes != out->num_planes) {
+ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
+ return -1;
+ }
+ for (i = 0; i != num_planes; ++i) {
+ int w = frame->width;
+ int h = frame->height;
+ if (is_chroma(desc, i, num_planes)) {
+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
}
- size = frame->linesize[i] * h;
- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset);
- if (ret)
- return ret;
- offset += size;
+
+ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
+ frame->data[i], frame->linesize[i],
+ w * pel_strides[i], h);
+ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
}
- return 0;
}
+ else
+ {
+ unsigned int offset = 0;
+
+ for (i = 0; i != num_planes; ++i) {
+ int w = frame->width;
+ int h = frame->height;
+ int dst_stride = out->plane_info[0].bytesperline;
+ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
+
+ if (is_chroma(desc, i, num_planes)) {
+ // Is chroma
+ dst_stride >>= desc->log2_chroma_w;
+ offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+ }
+ else {
+ // Is luma or alpha
+ offset += dst_stride * out->context->height;
+ }
+ if (offset > out->plane_info[0].length) {
+ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length);
+ return -1;
+ }
- for (i = 0; i < out->num_planes; i++) {
- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0);
- if (ret)
- return ret;
+ cpy_2d(dst, dst_stride,
+ frame->data[i], frame->linesize[i],
+ w * pel_strides[i], h);
+ }
+ set_buf_length(out, 0, offset, out->plane_info[0].length);
}
-
return 0;
}
@@ -410,14 +545,15 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
{
- v4l2_set_pts(out, frame->pts);
+ v4l2_set_pts(out, frame->pts, 0);
return v4l2_buffer_swframe_to_buf(frame, out);
}
-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts)
{
int ret;
+ V4L2Context * const ctx = avbuf->context;
av_frame_unref(frame);
@@ -432,13 +568,22 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
frame->colorspace = v4l2_get_color_space(avbuf);
frame->color_range = v4l2_get_color_range(avbuf);
frame->color_trc = v4l2_get_color_trc(avbuf);
- frame->pts = v4l2_get_pts(avbuf);
+ frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
frame->pkt_dts = AV_NOPTS_VALUE;
/* these values are updated also during re-init in v4l2_process_driver_event */
- frame->height = avbuf->context->height;
- frame->width = avbuf->context->width;
- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
+ frame->height = ctx->height;
+ frame->width = ctx->width;
+ frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
+
+ if (ctx->selection.height && ctx->selection.width) {
+ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
+ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0;
+ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
+ frame->width - (ctx->selection.left + ctx->selection.width) : 0;
+ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
+ frame->height - (ctx->selection.top + ctx->selection.height) : 0;
+ }
/* 3. report errors upstream */
if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
@@ -451,15 +596,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
{
- int ret;
-
av_packet_unref(pkt);
- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
- if (ret)
- return ret;
+
+ pkt->buf = wrap_avbuf(avbuf);
+ if (pkt->buf == NULL)
+ return AVERROR(ENOMEM);
pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
- pkt->data = pkt->buf->data;
+ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
pkt->flags |= AV_PKT_FLAG_KEY;
@@ -469,20 +613,27 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
pkt->flags |= AV_PKT_FLAG_CORRUPT;
}
- pkt->dts = pkt->pts = v4l2_get_pts(avbuf);
+ pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0);
return 0;
}
-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
+ const void *extdata, size_t extlen, int no_rescale_pts)
{
int ret;
- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0);
+ if (extlen) {
+ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
+ if (ret)
+ return ret;
+ }
+
+ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
if (ret)
return ret;
- v4l2_set_pts(out, pkt->pts);
+ v4l2_set_pts(out, pkt->pts, no_rescale_pts);
if (pkt->flags & AV_PKT_FLAG_KEY)
out->flags = V4L2_BUF_FLAG_KEYFRAME;
@@ -490,15 +641,61 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
return 0;
}
-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
+{
+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
+}
+
+
+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
+{
+ V4L2Buffer * const avbuf = (V4L2Buffer *)data;
+ int i;
+
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
+ struct V4L2Plane_info *p = avbuf->plane_info + i;
+ if (p->mm_addr != NULL)
+ munmap(p->mm_addr, p->length);
+ }
+
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
+ if (avbuf->drm_frame.objects[i].fd != -1)
+ close(avbuf->drm_frame.objects[i].fd);
+ }
+
+ ff_weak_link_unref(&avbuf->context_wl);
+
+ av_free(avbuf);
+}
+
+
+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx)
{
- V4L2Context *ctx = avbuf->context;
int ret, i;
+ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
+ AVBufferRef * bufref;
+
+ *pbufref = NULL;
+ if (avbuf == NULL)
+ return AVERROR(ENOMEM);
+
+ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
+ if (bufref == NULL) {
+ av_free(avbuf);
+ return AVERROR(ENOMEM);
+ }
+ avbuf->context = ctx;
avbuf->buf.memory = V4L2_MEMORY_MMAP;
avbuf->buf.type = ctx->type;
avbuf->buf.index = index;
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
+ avbuf->drm_frame.objects[i].fd = -1;
+ }
+
+ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
+
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->buf.length = VIDEO_MAX_PLANES;
avbuf->buf.m.planes = avbuf->planes;
@@ -506,7 +703,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
if (ret < 0)
- return AVERROR(errno);
+ goto fail;
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->num_planes = 0;
@@ -526,25 +723,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+
+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
+ !buf_to_m2mctx(avbuf)->output_drm) {
+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+ }
} else {
avbuf->plane_info[i].length = avbuf->buf.length;
- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+
+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
+ !buf_to_m2mctx(avbuf)->output_drm) {
+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+ }
}
- if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
- return AVERROR(ENOMEM);
+ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
+ avbuf->plane_info[i].mm_addr = NULL;
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
}
avbuf->status = V4L2BUF_AVAILABLE;
- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
- return 0;
-
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->buf.m.planes = avbuf->planes;
avbuf->buf.length = avbuf->num_planes;
@@ -554,7 +759,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
avbuf->buf.length = avbuf->planes[0].length;
}
- return ff_v4l2_buffer_enqueue(avbuf);
+ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+ if (buf_to_m2mctx(avbuf)->output_drm) {
+ ret = v4l2_buffer_export_drm(avbuf);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ *pbufref = bufref;
+ return 0;
+
+fail:
+ av_buffer_unref(&bufref);
+ return ret;
}
int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
@@ -563,9 +781,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
avbuf->buf.flags = avbuf->flags;
+ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
+ avbuf->context->name, avbuf->buf.index,
+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
+ avbuf->context->q_count);
+ }
+
ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
- if (ret < 0)
- return AVERROR(errno);
+ if (ret < 0) {
+ int err = errno;
+ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
+ avbuf->context->name, avbuf->buf.index,
+ err, strerror(err));
+ return AVERROR(err);
+ }
+
+ ++avbuf->context->q_count;
+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
+ avbuf->context->name, avbuf->buf.index,
+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
+ avbuf->context->q_count);
avbuf->status = V4L2BUF_IN_DRIVER;
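Note on the timestamp handling in this file: v4l2_set_pts()/v4l2_get_pts() reduce to
one conversion, rescaling between the codec (or pkt) timebase and the fixed
1/1000000 timebase that V4L2 uses for buf.timestamp, split into tv_sec/tv_usec. A
stand-alone sketch of that round trip, with the timebase passed in explicitly
(illustrative only, not part of the patch):

    #include <stdint.h>
    #include <sys/time.h>
    #include "libavutil/mathematics.h"
    #include "libavutil/rational.h"

    static const AVRational v4l2_us_tb = { 1, 1000000 };

    // pts in the codec timebase -> V4L2 buffer timestamp (microseconds)
    static struct timeval pts_to_v4l2_timestamp(int64_t pts, AVRational tb)
    {
        const int64_t us = av_rescale_q(pts, tb, v4l2_us_tb);
        return (struct timeval){ .tv_sec = us / 1000000, .tv_usec = us % 1000000 };
    }

    // V4L2 buffer timestamp -> pts in the codec timebase (as done on dequeue)
    static int64_t v4l2_timestamp_to_pts(struct timeval tv, AVRational tb)
    {
        const int64_t us = (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
        return av_rescale_q(us, v4l2_us_tb, tb);
    }
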
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index 3d2ff1b9a5..111526aee3 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -28,27 +28,37 @@
#include <stddef.h>
#include <linux/videodev2.h>
+#include "avcodec.h"
#include "libavutil/buffer.h"
#include "libavutil/frame.h"
+#include "libavutil/hwcontext_drm.h"
#include "packet.h"
enum V4L2Buffer_status {
V4L2BUF_AVAILABLE,
V4L2BUF_IN_DRIVER,
+ V4L2BUF_IN_USE,
V4L2BUF_RET_USER,
};
/**
* V4L2Buffer (wrapper for v4l2_buffer management)
*/
+struct V4L2Context;
+struct ff_weak_link_client;
+
typedef struct V4L2Buffer {
- /* each buffer needs to have a reference to its context */
+ /* each buffer needs to have a reference to its context
+     * The pointer is good enough for most operations, but once the buffer has
+     * been passed to the user it may become orphaned, so for free ops the
+     * weak link must be used to ensure that the context is actually still
+     * there.
+ */
struct V4L2Context *context;
+ struct ff_weak_link_client *context_wl;
- /* This object is refcounted per-plane, so we need to keep track
- * of how many context-refs we are holding. */
- AVBufferRef *context_ref;
- atomic_uint context_refcount;
+ /* DRM descriptor */
+ AVDRMFrameDescriptor drm_frame;
/* keep track of the mmap address and mmap length */
struct V4L2Plane_info {
@@ -73,11 +83,12 @@ typedef struct V4L2Buffer {
*
* @param[in] frame The AVFRame to push the information to
* @param[in] buf The V4L2Buffer to get the information from
+ * @param[in] no_rescale_pts If non-zero do not rescale PTS
*
* @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
* AVERROR(ENOMEM) if the AVBufferRef can't be created.
*/
-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf);
+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts);
/**
* Extracts the data from a V4L2Buffer to an AVPacket
@@ -101,6 +112,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
*/
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
+ const void *extdata, size_t extlen, int no_rescale_pts);
+
/**
* Extracts the data from an AVFrame to a V4L2Buffer
*
@@ -119,7 +133,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
*
* @returns 0 in case of success, a negative AVERROR code otherwise
*/
-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx);
/**
* Enqueues a V4L2Buffer
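Note on the context_wl member added to V4L2Buffer above: it exists because a buffer
returned to the user can outlive the V4L2Context that owns it, so the free path must
go through a weak link and may find the context already gone. The weak_link.h helper
itself is defined elsewhere in this patch series; purely as a conceptual sketch of
the pattern (not the actual ff_weak_link_* API):

    #include <pthread.h>

    // Owner keeps the real object; clients hold a weak_target and must lock it
    // before use.  When the owner dies it breaks the link, so a late-freed buffer
    // sees NULL instead of a dangling V4L2Context.
    typedef struct weak_target {
        pthread_mutex_t mutex;
        void *ptr;                       // the owning context, or NULL once broken
    } weak_target;

    static void *weak_lock(weak_target *w)   // returns NULL if the target is gone
    {
        pthread_mutex_lock(&w->mutex);
        return w->ptr;
    }

    static void weak_unlock(weak_target *w)  // always pair with weak_lock()
    {
        pthread_mutex_unlock(&w->mutex);
    }

    static void weak_break(weak_target *w)   // owner calls this on teardown
    {
        pthread_mutex_lock(&w->mutex);
        w->ptr = NULL;
        pthread_mutex_unlock(&w->mutex);
    }
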
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index a40be94690..be76068af3 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -27,11 +27,13 @@
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
+#include "libavutil/avassert.h"
#include "libavcodec/avcodec.h"
#include "decode.h"
#include "v4l2_buffers.h"
#include "v4l2_fmt.h"
#include "v4l2_m2m.h"
+#include "weak_link.h"
struct v4l2_format_update {
uint32_t v4l2_fmt;
@@ -153,21 +155,99 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd
}
}
-static int v4l2_start_decode(V4L2Context *ctx)
+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
{
- struct v4l2_decoder_cmd cmd = {
- .cmd = V4L2_DEC_CMD_START,
- .flags = 0,
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ struct v4l2_selection selection = {
+ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
+ .target = V4L2_SEL_TGT_COMPOSE
};
- int ret;
- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd);
- if (ret)
+ memset(r, 0, sizeof(*r));
+ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
return AVERROR(errno);
+ *r = selection.r;
return 0;
}
+static int do_source_change(V4L2m2mContext * const s)
+{
+ AVCodecContext *const avctx = s->avctx;
+
+ int ret;
+ int reinit;
+ int full_reinit;
+ struct v4l2_format cap_fmt = s->capture.format;
+
+ s->resize_pending = 0;
+ s->capture.done = 0;
+
+ ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
+ if (ret) {
+ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
+ return 0;
+ }
+
+ s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
+
+ get_default_selection(&s->capture, &s->capture.selection);
+
+ reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
+ if (reinit) {
+ s->capture.height = v4l2_get_height(&cap_fmt);
+ s->capture.width = v4l2_get_width(&cap_fmt);
+ }
+ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+
+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
+ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
+ s->capture.selection.width, s->capture.selection.height,
+ s->capture.selection.left, s->capture.selection.top);
+
+ s->reinit = 1;
+
+ if (reinit) {
+ if (avctx)
+ ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
+ if (ret < 0)
+ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
+
+ ret = ff_v4l2_m2m_codec_reinit(s);
+ if (ret) {
+ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
+ return AVERROR(EINVAL);
+ }
+ goto reinit_run;
+ }
+
+ /* Buffers are OK so just stream off to ack */
+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__);
+
+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+ if (ret)
+ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
+ s->draining = 0;
+
+ /* reinit executed */
+reinit_run:
+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
+ return 1;
+}
+
+static int ctx_done(V4L2Context * const ctx)
+{
+ int rv = 0;
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+
+ ctx->done = 1;
+
+ if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type))
+ rv = do_source_change(s);
+
+ return rv;
+}
+
/**
* handle resolution change event and end of stream event
* returns 1 if reinit was successful, negative if it failed
@@ -175,8 +255,7 @@ static int v4l2_start_decode(V4L2Context *ctx)
*/
static int v4l2_handle_event(V4L2Context *ctx)
{
- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
- struct v4l2_format cap_fmt = s->capture.format;
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
struct v4l2_event evt = { 0 };
int ret;
@@ -186,44 +265,22 @@ static int v4l2_handle_event(V4L2Context *ctx)
return 0;
}
+ av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type);
+
if (evt.type == V4L2_EVENT_EOS) {
- ctx->done = 1;
+// ctx->done = 1;
+ av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name);
return 0;
}
if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
return 0;
- ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
- if (ret) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
- return 0;
- }
-
- if (v4l2_resolution_changed(&s->capture, &cap_fmt)) {
- s->capture.height = v4l2_get_height(&cap_fmt);
- s->capture.width = v4l2_get_width(&cap_fmt);
- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- } else {
- v4l2_start_decode(ctx);
+ s->resize_pending = 1;
+ if (!ctx->done)
return 0;
- }
-
- s->reinit = 1;
-
- if (s->avctx)
- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
- if (ret < 0)
- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
-
- ret = ff_v4l2_m2m_codec_reinit(s);
- if (ret) {
- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
- return AVERROR(EINVAL);
- }
- /* reinit executed */
- return 1;
+ return do_source_change(s);
}
static int v4l2_stop_decode(V4L2Context *ctx)
@@ -266,8 +323,26 @@ static int v4l2_stop_encode(V4L2Context *ctx)
return 0;
}
+static int count_in_driver(const V4L2Context * const ctx)
+{
+ int i;
+ int n = 0;
+
+ if (!ctx->bufrefs)
+ return -1;
+
+ for (i = 0; i < ctx->num_buffers; ++i) {
+ V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (avbuf->status == V4L2BUF_IN_DRIVER)
+ ++n;
+ }
+ return n;
+}
+
static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
{
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type);
struct v4l2_plane planes[VIDEO_MAX_PLANES];
struct v4l2_buffer buf = { 0 };
V4L2Buffer *avbuf;
@@ -276,50 +351,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
.fd = ctx_to_m2mctx(ctx)->fd,
};
int i, ret;
+ int no_rx_means_done = 0;
- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
+ if (is_capture && ctx->bufrefs) {
for (i = 0; i < ctx->num_buffers; i++) {
- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (avbuf->status == V4L2BUF_IN_DRIVER)
break;
}
if (i == ctx->num_buffers)
- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
+ av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to "
"userspace. Increase num_capture_buffers "
"to prevent device deadlock or dropped "
- "packets/frames.\n");
+ "packets/frames.\n", i);
}
+#if 0
+ // I think this is true but pointless
+ // we will get some other form of EOF signal
+
/* if we are draining and there are no more capture buffers queued in the driver we are done */
- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
+ if (is_capture && ctx_to_m2mctx(ctx)->draining) {
for (i = 0; i < ctx->num_buffers; i++) {
/* capture buffer initialization happens during decode hence
* detection happens at runtime
*/
- if (!ctx->buffers)
+ if (!ctx->bufrefs)
break;
- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (avbuf->status == V4L2BUF_IN_DRIVER)
goto start;
}
ctx->done = 1;
return NULL;
}
+#endif
start:
- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
- pfd.events = POLLOUT | POLLWRNORM;
- else {
+ if (is_capture) {
/* no need to listen to requests for more input while draining */
if (ctx_to_m2mctx(ctx)->draining)
pfd.events = POLLIN | POLLRDNORM | POLLPRI;
+ } else {
+ pfd.events = POLLOUT | POLLWRNORM;
}
+ no_rx_means_done = s->resize_pending && is_capture;
for (;;) {
- ret = poll(&pfd, 1, timeout);
+ // If we have a resize pending then all buffers should be Qed
+ // With a resize pending we should be in drain but evidence suggests
+ // that not all decoders do this so poll to clear
+ int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout;
+ const int e = pfd.events;
+
+ ret = poll(&pfd, 1, t2);
+
if (ret > 0)
break;
- if (errno == EINTR)
- continue;
+
+ if (ret < 0) {
+ int err = errno;
+ if (err == EINTR)
+ continue;
+ av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n",
+ err, strerror(err),
+ e, count_in_driver(ctx));
+ return NULL;
+ }
+
+ // ret == 0 (timeout)
+ if (no_rx_means_done) {
+ av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n");
+ ret = ctx_done(ctx);
+ if (ret > 0)
+ goto start;
+ }
+ if (timeout == -1)
+            av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));
return NULL;
}
@@ -329,7 +438,8 @@ start:
no need to raise a warning */
if (timeout == 0) {
for (i = 0; i < ctx->num_buffers; i++) {
- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (avbuf->status != V4L2BUF_AVAILABLE)
av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
}
}
@@ -347,22 +457,25 @@ start:
ctx->done = 1;
return NULL;
}
- if (ret) {
- /* if re-init was successful drop the buffer (if there was one)
- * since we had to reconfigure capture (unmap all buffers)
- */
- return NULL;
- }
+ if (ret > 0)
+ goto start;
}
/* 2. dequeue the buffer */
if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+ if (is_capture) {
/* there is a capture buffer ready */
if (pfd.revents & (POLLIN | POLLRDNORM))
goto dequeue;
+ // CAPTURE Q drained
+ if (no_rx_means_done) {
+ if (ctx_done(ctx) > 0)
+ goto start;
+ return NULL;
+ }
+
/* the driver is ready to accept more input; instead of waiting for the capture
* buffer to complete we return NULL so input can proceed (we are single threaded)
*/
@@ -380,37 +493,58 @@ dequeue:
buf.m.planes = planes;
}
- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
- if (ret) {
- if (errno != EAGAIN) {
- ctx->done = 1;
- if (errno != EPIPE)
+ while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) {
+ const int err = errno;
+ if (err == EINTR)
+ continue;
+ if (err != EAGAIN) {
+ // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST
+ if (err != EPIPE || !is_capture)
av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
- ctx->name, av_err2str(AVERROR(errno)));
+ ctx->name, av_err2str(AVERROR(err)));
+ if (ctx_done(ctx) > 0)
+ goto start;
}
return NULL;
}
+ --ctx->q_count;
+ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n",
+ ctx->name, buf.index,
+ buf.timestamp.tv_sec, buf.timestamp.tv_usec,
+ ctx->q_count, ++ctx->dq_count);
- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
+ avbuf->status = V4L2BUF_AVAILABLE;
+ avbuf->buf = buf;
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ memcpy(avbuf->planes, planes, sizeof(planes));
+ avbuf->buf.m.planes = avbuf->planes;
+ }
+
+ if (ctx_to_m2mctx(ctx)->draining && is_capture) {
int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
buf.m.planes[0].bytesused : buf.bytesused;
if (bytesused == 0) {
- ctx->done = 1;
+ av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n");
+
+ // Must reQ so we don't leak
+ // May not matter if the next thing we do is release all the
+ // buffers but better to be tidy.
+ ff_v4l2_buffer_enqueue(avbuf);
+
+ if (ctx_done(ctx) > 0)
+ goto start;
return NULL;
}
#ifdef V4L2_BUF_FLAG_LAST
- if (buf.flags & V4L2_BUF_FLAG_LAST)
- ctx->done = 1;
+ if (buf.flags & V4L2_BUF_FLAG_LAST) {
+ av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n");
+ avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer
+ ctx_done(ctx);
+ }
#endif
}
- avbuf = &ctx->buffers[buf.index];
- avbuf->status = V4L2BUF_AVAILABLE;
- avbuf->buf = buf;
- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
- memcpy(avbuf->planes, planes, sizeof(planes));
- avbuf->buf.m.planes = avbuf->planes;
- }
return avbuf;
}
@@ -429,8 +563,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
}
for (i = 0; i < ctx->num_buffers; i++) {
- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
- return &ctx->buffers[i];
+ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (avbuf->status == V4L2BUF_AVAILABLE)
+ return avbuf;
}
return NULL;
@@ -438,25 +573,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
static int v4l2_release_buffers(V4L2Context* ctx)
{
- struct v4l2_requestbuffers req = {
- .memory = V4L2_MEMORY_MMAP,
- .type = ctx->type,
- .count = 0, /* 0 -> unmaps buffers from the driver */
- };
- int i, j;
+ int i;
+ int ret = 0;
+ const int fd = ctx_to_m2mctx(ctx)->fd;
- for (i = 0; i < ctx->num_buffers; i++) {
- V4L2Buffer *buffer = &ctx->buffers[i];
+ // Orphan any buffers in the wild
+ ff_weak_link_break(&ctx->wl_master);
+
+ if (ctx->bufrefs) {
+ for (i = 0; i < ctx->num_buffers; i++)
+ av_buffer_unref(ctx->bufrefs + i);
+ }
+
+ if (fd != -1) {
+ struct v4l2_requestbuffers req = {
+ .memory = V4L2_MEMORY_MMAP,
+ .type = ctx->type,
+ .count = 0, /* 0 -> unmap all buffers from the driver */
+ };
+
+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
+ if (errno == EINTR)
+ continue;
+
+ ret = AVERROR(errno);
- for (j = 0; j < buffer->num_planes; j++) {
- struct V4L2Plane_info *p = &buffer->plane_info[j];
- if (p->mm_addr && p->length)
- if (munmap(p->mm_addr, p->length) < 0)
- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
+ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
+ ctx->name, av_err2str(AVERROR(errno)));
+
+ if (ctx_to_m2mctx(ctx)->output_drm)
+ av_log(logger(ctx), AV_LOG_ERROR,
+ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
+ "for all buffers: \n"
+ " 1. drmModeRmFB(..)\n"
+ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
}
}
+ ctx->q_count = 0;
- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
+ return ret;
}
static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
@@ -485,6 +640,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm
static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
{
+ V4L2m2mContext* s = ctx_to_m2mctx(ctx);
+ V4L2m2mPriv *priv = s->avctx->priv_data;
enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
struct v4l2_fmtdesc fdesc;
int ret;
@@ -503,6 +660,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
if (ret)
return AVERROR(EINVAL);
+ if (priv->pix_fmt != AV_PIX_FMT_NONE) {
+ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
+ fdesc.index++;
+ continue;
+ }
+ }
+
pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
ret = v4l2_try_raw_format(ctx, pixfmt);
if (ret){
@@ -555,18 +719,73 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
*
*****************************************************************************/
+
+static void flush_all_buffers_status(V4L2Context* const ctx)
+{
+ int i;
+ for (i = 0; i < ctx->num_buffers; ++i) {
+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (buf->status == V4L2BUF_IN_DRIVER)
+ buf->status = V4L2BUF_AVAILABLE;
+ }
+ ctx->q_count = 0;
+}
+
+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
+{
+ int i;
+ int rv;
+
+ if (!ctx->bufrefs) {
+ rv = ff_v4l2_context_init(ctx);
+ if (rv) {
+ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
+ return rv;
+ }
+ }
+
+ for (i = 0; i < ctx->num_buffers; ++i) {
+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
+ if (buf->status == V4L2BUF_AVAILABLE) {
+ rv = ff_v4l2_buffer_enqueue(buf);
+ if (rv < 0)
+ return rv;
+ }
+ }
+ return 0;
+}
+
int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
{
int type = ctx->type;
int ret;
+ AVCodecContext * const avctx = logger(ctx);
+
+ ff_mutex_lock(&ctx->lock);
+
+ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
+ stuff_all_buffers(avctx, ctx);
ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
- if (ret < 0)
- return AVERROR(errno);
+ if (ret < 0) {
+ const int err = errno;
+ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
+ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
+ ret = AVERROR(err);
+ }
+ else
+ {
+ if (cmd == VIDIOC_STREAMOFF)
+ flush_all_buffers_status(ctx);
- ctx->streamon = (cmd == VIDIOC_STREAMON);
+ ctx->streamon = (cmd == VIDIOC_STREAMON);
+ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
+ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
+ }
- return 0;
+ ff_mutex_unlock(&ctx->lock);
+
+ return ret;
}
int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
@@ -594,7 +813,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
return ff_v4l2_buffer_enqueue(avbuf);
}
-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
+ const void * extdata, size_t extlen, int no_rescale_pts)
{
V4L2m2mContext *s = ctx_to_m2mctx(ctx);
V4L2Buffer* avbuf;
@@ -602,8 +822,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
if (!pkt->size) {
ret = v4l2_stop_decode(ctx);
+ // Log but otherwise ignore stop failure
if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
+ av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
s->draining = 1;
return 0;
}
@@ -612,14 +833,14 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
if (!avbuf)
return AVERROR(EAGAIN);
- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
if (ret)
return ret;
return ff_v4l2_buffer_enqueue(avbuf);
}
-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts)
{
V4L2Buffer *avbuf;
@@ -636,7 +857,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
return AVERROR(EAGAIN);
}
- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
+ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts);
}
int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
@@ -695,54 +916,57 @@ void ff_v4l2_context_release(V4L2Context* ctx)
{
int ret;
- if (!ctx->buffers)
+ if (!ctx->bufrefs)
return;
ret = v4l2_release_buffers(ctx);
if (ret)
av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
- av_freep(&ctx->buffers);
+ av_freep(&ctx->bufrefs);
+ av_buffer_unref(&ctx->frames_ref);
+
+ ff_mutex_destroy(&ctx->lock);
}
-int ff_v4l2_context_init(V4L2Context* ctx)
+
+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers)
{
- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
struct v4l2_requestbuffers req;
- int ret, i;
-
- if (!v4l2_type_supported(ctx)) {
- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
- return AVERROR_PATCHWELCOME;
- }
-
- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
- if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
+ int ret;
+ int i;
memset(&req, 0, sizeof(req));
- req.count = ctx->num_buffers;
+ req.count = req_buffers;
req.memory = V4L2_MEMORY_MMAP;
req.type = ctx->type;
- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
- if (ret < 0) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
- return AVERROR(errno);
+ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
+ if (errno != EINTR) {
+ ret = AVERROR(errno);
+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
+ return ret;
+ }
}
ctx->num_buffers = req.count;
- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
- if (!ctx->buffers) {
+ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
+ if (!ctx->bufrefs) {
av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
- return AVERROR(ENOMEM);
+ goto fail_release;
}
- for (i = 0; i < req.count; i++) {
- ctx->buffers[i].context = ctx;
- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
- if (ret < 0) {
+ ctx->wl_master = ff_weak_link_new(ctx);
+ if (!ctx->wl_master) {
+ ret = AVERROR(ENOMEM);
+ goto fail_release;
+ }
+
+ for (i = 0; i < ctx->num_buffers; i++) {
+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx);
+ if (ret) {
av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
- goto error;
+ goto fail_release;
}
}
@@ -756,10 +980,62 @@ int ff_v4l2_context_init(V4L2Context* ctx)
return 0;
-error:
+fail_release:
v4l2_release_buffers(ctx);
+ av_freep(&ctx->bufrefs);
+ return ret;
+}
+
+int ff_v4l2_context_init(V4L2Context* ctx)
+{
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ int ret;
+
+ // It is not valid to reinit a context without a previous release
+ av_assert0(ctx->bufrefs == NULL);
+
+ if (!v4l2_type_supported(ctx)) {
+ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+ return AVERROR_PATCHWELCOME;
+ }
+
+ ff_mutex_init(&ctx->lock, NULL);
- av_freep(&ctx->buffers);
+ if (s->output_drm) {
+ AVHWFramesContext *hwframes;
+
+ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
+ if (!ctx->frames_ref) {
+ ret = AVERROR(ENOMEM);
+ goto fail_unlock;
+ }
+
+ hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
+ hwframes->format = AV_PIX_FMT_DRM_PRIME;
+ hwframes->sw_format = ctx->av_pix_fmt;
+ hwframes->width = ctx->width;
+ hwframes->height = ctx->height;
+ ret = av_hwframe_ctx_init(ctx->frames_ref);
+ if (ret < 0)
+ goto fail_unref_hwframes;
+ }
+
+ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+ if (ret) {
+ ret = AVERROR(errno);
+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
+ goto fail_unref_hwframes;
+ }
+
+ ret = create_buffers(ctx, ctx->num_buffers);
+ if (ret < 0)
+ goto fail_unref_hwframes;
+
+ return 0;
+fail_unref_hwframes:
+ av_buffer_unref(&ctx->frames_ref);
+fail_unlock:
+ ff_mutex_destroy(&ctx->lock);
return ret;
}
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 6f7460c89a..59009d11d1 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -32,6 +32,8 @@
#include "libavutil/rational.h"
#include "codec_id.h"
#include "packet.h"
+#include "libavutil/buffer.h"
+#include "libavutil/thread.h"
#include "v4l2_buffers.h"
typedef struct V4L2Context {
@@ -71,11 +73,12 @@ typedef struct V4L2Context {
*/
int width, height;
AVRational sample_aspect_ratio;
+ struct v4l2_rect selection;
/**
- * Indexed array of V4L2Buffers
+ * Indexed array of pointers to V4L2Buffers
*/
- V4L2Buffer *buffers;
+ AVBufferRef **bufrefs;
/**
* Readonly after init.
@@ -93,6 +96,12 @@ typedef struct V4L2Context {
*/
int done;
+ AVBufferRef *frames_ref;
+ int q_count;
+ int dq_count;
+ struct ff_weak_link_master *wl_master;
+
+ AVMutex lock;
} V4L2Context;
/**
@@ -157,9 +166,12 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
* @param[in] ctx The V4L2Context to dequeue from.
* @param[inout] f The AVFrame to dequeue to.
* @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
+ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as
+ * timestamp directly)
+ *
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
*/
-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts);
/**
* Enqueues a buffer to a V4L2Context from an AVPacket
@@ -171,7 +183,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
* @param[in] pkt A pointer to an AVPacket.
* @return 0 in case of success, a negative error otherwise.
*/
-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts);
/**
* Enqueues a buffer to a V4L2Context from an AVFrame
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index 602efb7a16..516e6d9858 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -216,13 +216,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
/* 2. unmap the capture buffers (v4l2 and ffmpeg):
- * we must wait for all references to be released before being allowed
- * to queue new buffers.
*/
- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
- if (atomic_load(&s->refcount))
- while(sem_wait(&s->refsync) == -1 && errno == EINTR);
-
ff_v4l2_context_release(&s->capture);
/* 3. get the new capture format */
@@ -259,6 +253,8 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
av_frame_free(&s->frame);
av_packet_unref(&s->buf_pkt);
+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
+
av_free(s);
}
@@ -270,6 +266,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
if (!s)
return 0;
+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
+
+ if (av_codec_is_decoder(s->avctx->codec))
+ av_packet_unref(&s->buf_pkt);
+
if (s->fd >= 0) {
ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
if (ret)
@@ -282,7 +283,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
ff_v4l2_context_release(&s->output);
+ close(s->fd);
+ s->fd = -1;
+
s->self_ref = NULL;
+ // This is only called on avctx close so after this point we don't have that
+ // Crash sooner if we find we are using it (can still log with avctx = NULL)
+ s->avctx = NULL;
+ priv->context = NULL;
av_buffer_unref(&priv->context_ref);
return 0;
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 04d86d7b92..24a9c94864 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -30,6 +30,7 @@
#include <linux/videodev2.h>
#include "libavcodec/avcodec.h"
+#include "libavutil/pixfmt.h"
#include "v4l2_context.h"
#define container_of(ptr, type, member) ({ \
@@ -40,6 +41,17 @@
{ "num_output_buffers", "Number of buffers in the output context",\
OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
+#define FF_V4L2_M2M_TRACK_SIZE 128
+typedef struct V4L2m2mTrackEl {
+ int discard; // If we see this buffer its been flushed, so discard
+ int pkt_size;
+ int64_t pts;
+ int64_t reordered_opaque;
+ int64_t pkt_pos;
+ int64_t pkt_duration;
+ int64_t track_pts;
+} V4L2m2mTrackEl;
+
typedef struct V4L2m2mContext {
char devname[PATH_MAX];
int fd;
@@ -53,6 +65,7 @@ typedef struct V4L2m2mContext {
sem_t refsync;
atomic_uint refcount;
int reinit;
+ int resize_pending;
/* null frame/packet received */
int draining;
@@ -66,6 +79,23 @@ typedef struct V4L2m2mContext {
/* reference back to V4L2m2mPriv */
void *priv;
+
+ AVBufferRef *device_ref;
+
+ /* generate DRM frames */
+ int output_drm;
+
+ /* Frame tracking */
+ int64_t last_pkt_dts;
+ int64_t last_opaque;
+ unsigned int track_no;
+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
+
+ /* req pkt */
+ int req_pkt;
+
+ /* Ext data sent */
+ int extdata_sent;
} V4L2m2mContext;
typedef struct V4L2m2mPriv {
@@ -76,6 +106,7 @@ typedef struct V4L2m2mPriv {
int num_output_buffers;
int num_capture_buffers;
+ enum AVPixelFormat pix_fmt;
} V4L2m2mPriv;
/**
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 4944d08511..7f6033ac2c 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -23,6 +23,10 @@
#include <linux/videodev2.h>
#include <sys/ioctl.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_drm.h"
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/opt.h"
@@ -30,26 +34,51 @@
#include "codec_internal.h"
#include "libavcodec/decode.h"
+#include "libavcodec/hwaccels.h"
+#include "libavcodec/internal.h"
+#include "libavcodec/hwconfig.h"
+
#include "v4l2_context.h"
#include "v4l2_m2m.h"
#include "v4l2_fmt.h"
+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
+{
+ int ret;
+ struct v4l2_decoder_cmd cmd = {
+ .cmd = V4L2_DEC_CMD_START,
+ .flags = 0,
+ };
+
+ if (s->output.streamon)
+ return 0;
+
+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
+ if (ret < 0)
+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
+
+ if (!s->capture.streamon || ret < 0)
+ return ret;
+
+ ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
+ if (ret < 0)
+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
+ else
+ av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n");
+
+ return ret;
+}
+
static int v4l2_try_start(AVCodecContext *avctx)
{
V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
V4L2Context *const capture = &s->capture;
- V4L2Context *const output = &s->output;
struct v4l2_selection selection = { 0 };
int ret;
/* 1. start the output process */
- if (!output->streamon) {
- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
- if (ret < 0) {
- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
- return ret;
- }
- }
+ if ((ret = check_output_streamon(avctx, s)) != 0)
+ return ret;
if (capture->streamon)
return 0;
@@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext *avctx)
}
/* 2.1 update the AVCodecContext */
- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
- capture->av_pix_fmt = avctx->pix_fmt;
+ capture->av_pix_fmt =
+ ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
+ if (s->output_drm) {
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ avctx->sw_pix_fmt = capture->av_pix_fmt;
+ }
+ else
+ avctx->pix_fmt = capture->av_pix_fmt;
/* 3. set the crop parameters */
+#if 1
+ selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+ selection.target = V4L2_SEL_TGT_CROP_DEFAULT;
+ ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+ av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
+#else
selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
selection.r.height = avctx->coded_height;
selection.r.width = avctx->coded_width;
+ av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height);
ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
- if (!ret) {
+ av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
+ if (1) {
ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
if (ret) {
av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
@@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext *avctx)
capture->width = selection.r.width;
}
}
-
- /* 4. init the capture context now that we have the capture format */
- if (!capture->buffers) {
- ret = ff_v4l2_context_init(capture);
- if (ret) {
- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
- return AVERROR(ENOMEM);
- }
- }
+#endif
/* 5. start the capture process */
ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
@@ -133,50 +168,287 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
return 0;
}
-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+{
+ return (int64_t)n;
+}
+
+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+{
+ return (unsigned int)pts;
+}
+
+// FFmpeg requires us to propagate a number of vars from the coded pkt into
+// the decoded frame. The only thing that tracks like that in V4L2 stateful
+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
+// guarantees about PTS being unique or specified for every frame so replace
+// the supplied PTS with a simple incrementing number and keep a circular
+// buffer of all the things we want preserved (including the original PTS)
+// indexed by the tracking no.
+static void
+xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt)
+{
+ int64_t track_pts;
+
+ // Avoid 0
+ if (++s->track_no == 0)
+ s->track_no = 1;
+
+ track_pts = track_to_pts(avctx, s->track_no);
+
+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no);
+ s->last_pkt_dts = avpkt->dts;
+ s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
+ .discard = 0,
+ .pkt_size = avpkt->size,
+ .pts = avpkt->pts,
+ .reordered_opaque = avctx->reordered_opaque,
+ .pkt_pos = avpkt->pos,
+ .pkt_duration = avpkt->duration,
+ .track_pts = track_pts
+ };
+ avpkt->pts = track_pts;
+}
+
+// Returns -1 if we should discard the frame
+static int
+xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame)
+{
+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
+ const V4L2m2mTrackEl *const t = s->track_els + n;
+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
+ {
+ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
+ frame->pts = AV_NOPTS_VALUE;
+ frame->pkt_dts = s->last_pkt_dts;
+ frame->reordered_opaque = s->last_opaque;
+ frame->pkt_pos = -1;
+ frame->pkt_duration = 0;
+ frame->pkt_size = -1;
+ }
+ else if (!t->discard)
+ {
+ frame->pts = t->pts;
+ frame->pkt_dts = s->last_pkt_dts;
+ frame->reordered_opaque = t->reordered_opaque;
+ frame->pkt_pos = t->pkt_pos;
+ frame->pkt_duration = t->pkt_duration;
+ frame->pkt_size = t->pkt_size;
+
+ s->last_opaque = s->track_els[n].reordered_opaque;
+ s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
+ return -1;
+ }
+
+ frame->best_effort_timestamp = frame->pts;
+ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner?
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts);
+ return 0;
+}
+
+static inline int stream_started(const V4L2m2mContext * const s) {
+ return s->capture.streamon && s->output.streamon;
+}
+
+#define NQ_OK 0
+#define NQ_Q_FULL 1
+#define NQ_SRC_EMPTY 2
+#define NQ_DRAINING 3
+#define NQ_DEAD 4
+
+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
+
+// AVERROR_EOF Flushing an already flushed stream
+// -ve Error (all errors except EOF are unexpected)
+// NQ_OK (0) OK
+// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now)
+// NQ_SRC_EMPTY Src empty (do not retry)
+// NQ_DRAINING At EOS, dQ dest until EOS there too
+// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
+
+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s)
{
- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- V4L2Context *const capture = &s->capture;
- V4L2Context *const output = &s->output;
int ret;
+ // If we don't already have a coded packet - get a new one
+ // We will already have a coded pkt if the output Q was full last time we
+ // tried to Q it
if (!s->buf_pkt.size) {
ret = ff_decode_get_packet(avctx, &s->buf_pkt);
+
+ if (ret == AVERROR(EAGAIN)) {
+ if (!stream_started(s)) {
+ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
+ return NQ_DEAD;
+ }
+ return NQ_SRC_EMPTY;
+ }
+
+ if (ret == AVERROR_EOF) {
+ // EOF - enter drain mode
+ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
+ ret, s->buf_pkt.size, stream_started(s), s->draining);
+ if (!stream_started(s)) {
+ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
+ s->draining = 1;
+ s->capture.done = 1;
+ return AVERROR_EOF;
+ }
+
+ if (!s->draining) {
+ // Calling enqueue with an empty pkt starts drain
+ av_assert0(s->buf_pkt.size == 0);
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1);
+ if (ret) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
+ return ret;
+ }
+ }
+ return NQ_DRAINING;
+ }
+
if (ret < 0) {
- if (ret == AVERROR(EAGAIN))
- return ff_v4l2_context_dequeue_frame(capture, frame, 0);
- else if (ret != AVERROR_EOF)
- return ret;
+ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
+ return ret;
}
+
+ xlat_pts_in(avctx, s, &s->buf_pkt);
}
- if (s->draining)
- goto dequeue;
+ if ((ret = check_output_streamon(avctx, s)) != 0)
+ return ret;
- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt);
- if (ret < 0 && ret != AVERROR(EAGAIN))
- goto fail;
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
+ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size,
+ 1);
- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */
- if (ret != AVERROR(EAGAIN))
+ if (ret == AVERROR(EAGAIN)) {
+ // Out of input buffers - keep packet
+ ret = NQ_Q_FULL;
+ }
+ else {
+ // In all other cases we are done with this packet
av_packet_unref(&s->buf_pkt);
+ s->extdata_sent = 1;
- if (!s->draining) {
- ret = v4l2_try_start(avctx);
if (ret) {
- /* cant recover */
- if (ret != AVERROR(ENOMEM))
- ret = 0;
- goto fail;
+ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
+ return ret;
+ }
+ }
+
+ // Start if we haven't
+ {
+ const int ret2 = v4l2_try_start(avctx);
+ if (ret2) {
+ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
+ ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
+ }
+ }
+
+ return ret;
+}
+
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+ int src_rv;
+ int dst_rv = 1; // Non-zero (done), non-negative (error) number
+
+ do {
+ src_rv = try_enqueue_src(avctx, s);
+
+ // If we got a frame last time and we have nothing to enqueue then
+ // return now. rv will be AVERROR(EAGAIN) indicating that we want more input
+ // This should mean that once decode starts we enter a stable state where
+ // we alternately ask for input and produce output
+ if (s->req_pkt && src_rv == NQ_SRC_EMPTY)
+ break;
+
+ if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) {
+            av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue failed\n");
+ src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue
+ }
+
+ // Try to get a new frame if
+ // (a) we haven't already got one AND
+ // (b) enqueue returned a status indicating that decode should be attempted
+ if (dst_rv != 0 && TRY_DQ(src_rv)) {
+ do {
+ // Dequeue frame will unref any previous contents of frame
+ // if it returns success so we don't need an explicit unref
+ // when discarding
+ // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
+ // but there is room in the input Q
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1);
+
+ if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
+ s->draining, s->capture.done);
+ else if (dst_rv && dst_rv != AVERROR(EAGAIN))
+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
+ s->draining, s->capture.done, dst_rv);
+
+ // Go again if we got a frame that we need to discard
+ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
+ }
+
+ // Continue trying to enqueue packets if either
+ // (a) we succeeded last time OR
+ // (b) enqueue failed due to input Q full AND there is now room
+ } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) );
+
+ // Ensure that the frame contains nothing if we aren't returning a frame
+ // (might happen when discarding)
+ if (dst_rv)
+ av_frame_unref(frame);
+
+ // If we got a frame this time ask for a pkt next time
+ s->req_pkt = (dst_rv == 0);
+
+#if 0
+ if (dst_rv == 0)
+ {
+ static int z = 0;
+ if (++z > 50) {
+ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
+ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+ return -1;
}
}
+#endif
+
+ return dst_rv == 0 ? 0 :
+ src_rv < 0 ? src_rv :
+ dst_rv < 0 ? dst_rv :
+ AVERROR(EAGAIN);
+}
+
+#if 0
+#include <time.h>
+static int64_t us_time(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
-dequeue:
- return ff_v4l2_context_dequeue_frame(capture, frame, -1);
-fail:
- av_packet_unref(&s->buf_pkt);
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+ int ret;
+ const int64_t now = us_time();
+ int64_t done;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ ret = v4l2_receive_frame2(avctx, frame);
+ done = us_time();
+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
return ret;
}
+#endif
static av_cold int v4l2_decode_init(AVCodecContext *avctx)
{
@@ -185,6 +457,9 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
V4L2m2mPriv *priv = avctx->priv_data;
int ret;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
@@ -205,6 +480,28 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
capture->av_pix_fmt = avctx->pix_fmt;
+ /* the client requests the codec to generate DRM frames:
+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor
+ * check the ff_v4l2_buffer_to_avframe conversion function.
+ * - the DRM frame format is passed in the DRM frame descriptor layer.
+ * check the v4l2_get_drm_frame function.
+ */
+ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) {
+ default:
+ s->output_drm = 1;
+ break;
+ }
+
+ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
+ if (!s->device_ref) {
+ ret = AVERROR(ENOMEM);
+ return ret;
+ }
+
+ ret = av_hwdevice_ctx_init(s->device_ref);
+ if (ret < 0)
+ return ret;
+
s->avctx = avctx;
ret = ff_v4l2_m2m_codec_init(priv);
if (ret) {
@@ -217,7 +514,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
static av_cold int v4l2_decode_close(AVCodecContext *avctx)
{
- return ff_v4l2_m2m_codec_end(avctx->priv_data);
+ int rv;
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
+ return rv;
+}
+
+static void v4l2_decode_flush(AVCodecContext *avctx)
+{
+    // An alternative and more drastic form of flush is to simply do this:
+ // v4l2_decode_close(avctx);
+ // v4l2_decode_init(avctx);
+ // The downside is that this keeps a decoder open until all the frames
+ // associated with it have been returned. This is a bit wasteful on
+ // possibly limited h/w resources and fails on a Pi for this reason unless
+ // more GPU mem is allocated than is the default.
+
+ V4L2m2mPriv * const priv = avctx->priv_data;
+ V4L2m2mContext * const s = priv->context;
+ V4L2Context * const output = &s->output;
+ V4L2Context * const capture = &s->capture;
+ int ret, i;
+
+ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
+
+ // Reflushing everything is benign, quick and avoids having to worry about
+ // states like EOS processing so don't try to optimize out (having got it
+ // wrong once)
+
+ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
+ if (ret < 0)
+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
+
+ // V4L2 makes no guarantees about whether decoded frames are flushed or not
+ // so mark all frames we are tracking to be discarded if they appear
+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i)
+ s->track_els[i].discard = 1;
+
+ // resend extradata
+ s->extdata_sent = 0;
+ // clear EOS status vars
+ s->draining = 0;
+ output->done = 0;
+ capture->done = 0;
+
+ // Stream on will occur when we actually submit a new frame
+ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
}
#define OFFSET(x) offsetof(V4L2m2mPriv, x)
@@ -227,9 +570,15 @@ static const AVOption options[] = {
V4L_M2M_DEFAULT_OPTS,
{ "num_capture_buffers", "Number of buffers in the capture context",
OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
+ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
{ NULL},
};
+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
+ HW_CONFIG_INTERNAL(DRM_PRIME),
+ NULL
+};
+
#define M2MDEC_CLASS(NAME) \
static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
.class_name = #NAME "_v4l2m2m_decoder", \
@@ -250,11 +599,16 @@ static const AVOption options[] = {
.init = v4l2_decode_init, \
FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \
.close = v4l2_decode_close, \
+ .flush = v4l2_decode_flush, \
.bsfs = bsf_name, \
.p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
.caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE | \
FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
.p.wrapper_name = "v4l2m2m", \
+ .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
+ AV_PIX_FMT_NV12, \
+ AV_PIX_FMT_NONE}, \
+ .hw_configs = v4l2_m2m_hw_configs, \
}
M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb");
From 12f8f12326b83dd3c22084f8922705d79a13d195 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Jun 2021 18:46:21 +0100
Subject: [PATCH 017/161] Fix crash in hw_device_default_name if type not found
(NONE)
---
fftools/ffmpeg_hw.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c
index 88fa782470..740a5e7153 100644
--- a/fftools/ffmpeg_hw.c
+++ b/fftools/ffmpeg_hw.c
@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type)
char *name;
size_t index_pos;
int index, index_limit = 1000;
+ if (!type_name)
+ return NULL;
index_pos = strlen(type_name);
name = av_malloc(index_pos + 4);
if (!name)
From 7f6bce459e683bff3a0b972922fbcc808e9177a6 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Jun 2021 18:59:18 +0100
Subject: [PATCH 018/161] Allow v4l2m2m to select non-drm_prime output formats
---
libavcodec/v4l2_buffers.c | 2 +-
libavcodec/v4l2_m2m_dec.c | 14 ++++++++++----
2 files changed, 11 insertions(+), 5 deletions(-)
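With this change a client can keep zero-copy DRM_PRIME output or ask for
software frames through the usual get_format() negotiation. A minimal sketch
(not part of the patch; the callback name is illustrative) of a client that
opts for software NV12, which the selection logic below then honours:

    #include <libavcodec/avcodec.h>

    // Illustrative only: pick NV12 from the offered list so the decoder
    // returns software frames instead of AV_PIX_FMT_DRM_PRIME.
    static enum AVPixelFormat pick_sw_nv12(AVCodecContext *avctx,
                                           const enum AVPixelFormat *fmts)
    {
        for (const enum AVPixelFormat *p = fmts; *p != AV_PIX_FMT_NONE; p++) {
            if (*p == AV_PIX_FMT_NV12)
                return *p;
        }
        return fmts[0]; // fall back to the decoder's first preference
    }

    // ... set avctx->get_format = pick_sw_nv12; before avcodec_open2()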
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index a003934ca1..1ca1128db6 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -524,7 +524,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
offset += dst_stride * out->context->height;
}
if (offset > out->plane_info[0].length) {
- av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length);
+ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
return -1;
}
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 7f6033ac2c..a4b5a4e7e9 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -455,10 +455,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
V4L2Context *capture, *output;
V4L2m2mContext *s;
V4L2m2mPriv *priv = avctx->priv_data;
+ int gf_pix_fmt;
int ret;
av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
@@ -486,10 +486,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
* - the DRM frame format is passed in the DRM frame descriptor layer.
* check the v4l2_get_drm_frame function.
*/
- switch (ff_get_format(avctx, avctx->codec->pix_fmts)) {
- default:
+
+ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
+
+ s->output_drm = 0;
+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
s->output_drm = 1;
- break;
}
s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
@@ -607,6 +612,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
.p.wrapper_name = "v4l2m2m", \
.p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
AV_PIX_FMT_NV12, \
+ AV_PIX_FMT_YUV420P, \
AV_PIX_FMT_NONE}, \
.hw_configs = v4l2_m2m_hw_configs, \
}
From 9b0d964b727d98271f7f2f4dcdbcb1b41a429e2b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Jun 2021 18:59:38 +0100
Subject: [PATCH 019/161] Fix YUV420P output from v4l2m2m
Also move the get_width/get_height inlines into the header as they are
generally useful.
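For example (figures illustrative only): with a single-plane YUV420P capture
where linesize[0] is 1920 and the format height is 1088, the corrected layout
gives linesize[1] = linesize[2] = 960, data[1] = data[0] + 1920 * 1088 and
data[2] = data[1] + 960 * 544, matching the contiguous Y/U/V buffer the driver
actually fills.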
---
libavcodec/v4l2_buffers.c | 12 ++++++------
libavcodec/v4l2_context.c | 22 ++++++----------------
libavcodec/v4l2_m2m.h | 12 ++++++++++++
3 files changed, 24 insertions(+), 22 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 1ca1128db6..f4c11ca8d0 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -425,17 +425,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
case AV_PIX_FMT_NV21:
if (avbuf->num_planes > 1)
break;
- frame->linesize[1] = avbuf->plane_info[0].bytesperline;
- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+ frame->linesize[1] = frame->linesize[0];
+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
break;
case AV_PIX_FMT_YUV420P:
if (avbuf->num_planes > 1)
break;
- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
+ frame->linesize[1] = frame->linesize[0] / 2;
+ frame->linesize[2] = frame->linesize[1];
+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
+ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
break;
default:
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index be76068af3..6fe2586627 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -55,16 +55,6 @@ static inline AVCodecContext *logger(V4L2Context *ctx)
return ctx_to_m2mctx(ctx)->avctx;
}
-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
-{
- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
-}
-
-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
-{
- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
-}
-
static AVRational v4l2_get_sar(V4L2Context *ctx)
{
struct AVRational sar = { 0, 1 };
@@ -96,8 +86,8 @@ static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2
if (ret)
av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
ctx->name,
- v4l2_get_width(fmt1), v4l2_get_height(fmt1),
- v4l2_get_width(fmt2), v4l2_get_height(fmt2));
+ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
+ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
return ret;
}
@@ -195,8 +185,8 @@ static int do_source_change(V4L2m2mContext * const s)
reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
if (reinit) {
- s->capture.height = v4l2_get_height(&cap_fmt);
- s->capture.width = v4l2_get_width(&cap_fmt);
+ s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
+ s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
}
s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
@@ -973,8 +963,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
req.count,
- v4l2_get_width(&ctx->format),
- v4l2_get_height(&ctx->format),
+ ff_v4l2_get_format_width(&ctx->format),
+ ff_v4l2_get_format_height(&ctx->format),
V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 24a9c94864..8f054f2f50 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -160,4 +160,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
*/
int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+
+static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
+}
+
+static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
+}
+
+
#endif /* AVCODEC_V4L2_M2M_H */
From 14e9b4bf1b34b3d1e1e6a4fc755cc595416e7d7b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Jun 2021 19:23:44 +0100
Subject: [PATCH 020/161] Report buffer overflows in v4l2m2m
---
libavcodec/v4l2_buffers.c | 14 ++++++++++----
libavcodec/v4l2_context.c | 5 ++++-
2 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index f4c11ca8d0..de31f7ced9 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -364,6 +364,7 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
{
unsigned int bytesused, length;
+ int rv = 0;
if (plane >= out->num_planes)
return AVERROR(EINVAL);
@@ -371,11 +372,16 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
length = out->plane_info[plane].length;
bytesused = FFMIN(size+offset, length);
- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
+ if (size > length - offset) {
+ size = length - offset;
+ rv = AVERROR(ENOMEM);
+ }
+
+ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
set_buf_length(out, plane, bytesused, length);
- return 0;
+ return rv;
}
static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
@@ -630,7 +636,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
}
ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
- if (ret)
+ if (ret && ret != AVERROR(ENOMEM))
return ret;
v4l2_set_pts(out, pkt->pts, no_rescale_pts);
@@ -638,7 +644,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
if (pkt->flags & AV_PKT_FLAG_KEY)
out->flags = V4L2_BUF_FLAG_KEYFRAME;
- return 0;
+ return ret;
}
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 6fe2586627..81aced0c2b 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -824,7 +824,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
return AVERROR(EAGAIN);
ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
- if (ret)
+ if (ret == AVERROR(ENOMEM))
+ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
+ __func__, pkt->size, avbuf->planes[0].length);
+ else if (ret)
return ret;
return ff_v4l2_buffer_enqueue(avbuf);
From 072907a7fcf160d12972997d24fdf62641687ea4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 14 Jun 2021 11:55:16 +0100
Subject: [PATCH 021/161] Increase V4L2 H264 stateful coded buffer size
Try to set a min size of frame size / 2 for bit buffers passed to V4L2.
This fixes a few streams that have large I-frames. You would hope
Annex A gave a useful minCR so an appropriate size could be calculated,
but it doesn't really: it gives good guidance for bits required over
time, but the instantaneous limits are very weak, so it is possible
that even this won't be enough. The correct long term solution would
be to have resizable dmabufs, but that is a greater rewrite than seems
sensible now.
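As a rough worked example (numbers purely illustrative): for a 1920x1088
H.264 stream the heuristic added here asks for
1920 * 1088 * 3 / 2 / 2 + 65536 = 1,632,256 bytes per coded buffer,
i.e. roughly half an uncompressed frame plus 64k of headroom.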
---
libavcodec/v4l2_context.c | 24 +++++++++++++++++++++++-
libavcodec/v4l2_context.h | 6 ++++++
libavcodec/v4l2_m2m_dec.c | 24 ++++++++++++++++++++++++
3 files changed, 53 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 81aced0c2b..a17ae027a6 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -902,7 +902,29 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
int ff_v4l2_context_set_format(V4L2Context* ctx)
{
- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+ int ret;
+
+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+ if (ret != 0)
+ return ret;
+
+ // Check returned size against min size and if smaller have another go
+ // Only worry about plane[0] as this is meant to enforce limits for
+ // encoded streams where we might know a bit more about the shape
+ // than the driver
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
+ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
+ return 0;
+ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
+ }
+ else {
+ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
+ return 0;
+ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
+ }
+
+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+ return ret;
}
void ff_v4l2_context_release(V4L2Context* ctx)
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 59009d11d1..37b0431400 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -75,6 +75,12 @@ typedef struct V4L2Context {
AVRational sample_aspect_ratio;
struct v4l2_rect selection;
+ /**
+ * If the default size of buffer is less than this then try to
+ * set to this.
+ */
+ uint32_t min_buf_size;
+
/**
* Indexed array of pointers to V4L2Buffers
*/
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index a4b5a4e7e9..1851acbc93 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -450,6 +450,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
#endif
+static uint32_t max_coded_size(const AVCodecContext * const avctx)
+{
+ uint32_t wxh = avctx->coded_width * avctx->coded_height;
+ uint32_t size;
+
+ // Currently the only thing we try to set our own limits for is H264
+ if (avctx->codec_id != AV_CODEC_ID_H264)
+ return 0;
+
+ size = wxh * 3 / 2;
+ // H.264 Annex A table A-1 gives minCR which is either 2 or 4
+ // unfortunately that doesn't yield an actually useful limit
+ // and it should be noted that frame 0 is special cased to allow
+ // a bigger number which really isn't helpful for us. So just pick
+ // frame_size / 2
+ size /= 2;
+ // Add 64k to allow for any overheads and/or encoder hopefulness
+ // with small WxH
+ return size + (1 << 16);
+}
+
static av_cold int v4l2_decode_init(AVCodecContext *avctx)
{
V4L2Context *capture, *output;
@@ -460,6 +481,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
@@ -476,9 +498,11 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
output->av_codec_id = avctx->codec_id;
output->av_pix_fmt = AV_PIX_FMT_NONE;
+ output->min_buf_size = max_coded_size(avctx);
capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
capture->av_pix_fmt = avctx->pix_fmt;
+ capture->min_buf_size = 0;
/* the client requests the codec to generate DRM frames:
* - data[0] will therefore point to the returned AVDRMFrameDescriptor
From 6087c8c054e1ff3d2e6e62d5e32705d079928b64 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 28 Jun 2021 12:13:35 +0100
Subject: [PATCH 022/161] Fix raw video s.t. it respects any remaining cropping
This fixes the long-standing CONFWIN_A conformance test failure for DRM.
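The core of the rawenc.c change, as a minimal standalone sketch (the helper
name is illustrative, not part of the patch): clone the incoming frame and let
libavutil apply any remaining crop before the output buffer is sized and
copied:

    #include <libavutil/error.h>
    #include <libavutil/frame.h>

    // Illustrative only: produce a cropped view of src without touching it.
    // The clone references the same data buffers; av_frame_apply_cropping()
    // then adjusts the data pointers, width and height for the crop_* fields.
    static int cropped_view(const AVFrame *src, AVFrame **out)
    {
        AVFrame *frame = av_frame_clone(src);
        int ret;

        if (!frame)
            return AVERROR(ENOMEM);
        ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED);
        if (ret < 0) {
            av_frame_free(&frame);
            return ret;
        }
        *out = frame; // caller releases with av_frame_free()
        return 0;
    }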
---
libavcodec/rawenc.c | 32 ++++++++---
libavutil/hwcontext_drm.c | 112 ++++++++++++++++++++++++++++++++++++--
2 files changed, 130 insertions(+), 14 deletions(-)
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index 594a77c42a..8ca0379e12 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -124,32 +124,41 @@ static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
- const AVFrame *frame, int *got_packet)
+ const AVFrame *src_frame, int *got_packet)
{
int ret;
+ AVFrame * frame = NULL;
#if CONFIG_SAND
- if (av_rpi_is_sand_frame(frame)) {
- ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) :
- av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) :
- av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1;
+ if (av_rpi_is_sand_frame(src_frame)) {
+ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
+ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
+ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
*got_packet = (ret == 0);
return ret;
}
#endif
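+    // Clone the source frame so that any remaining cropping (e.g. an HEVC
+    // conformance window) can be applied without modifying the caller's frame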
+ if ((frame = av_frame_clone(src_frame)) == NULL) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
+ goto fail;
+
ret = av_image_get_buffer_size(frame->format,
frame->width, frame->height, 1);
if (ret < 0)
- return ret;
+ goto fail;
if ((ret = ff_get_encode_buffer(avctx, pkt, ret, 0)) < 0)
- return ret;
+ goto fail;
if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
(const uint8_t **)frame->data, frame->linesize,
frame->format,
frame->width, frame->height, 1)) < 0)
- return ret;
+ goto fail;
if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
frame->format == AV_PIX_FMT_YUYV422) {
@@ -165,8 +174,15 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16);
}
}
+ pkt->flags |= AV_PKT_FLAG_KEY;
+ av_frame_free(&frame);
*got_packet = 1;
return 0;
+
+fail:
+ av_frame_free(&frame);
+ *got_packet = 0;
+ return ret;
}
const FFCodec ff_rawvideo_encoder = {
diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
index 7a9fdbd263..baf18920fa 100644
--- a/libavutil/hwcontext_drm.c
+++ b/libavutil/hwcontext_drm.c
@@ -21,6 +21,7 @@
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
+#include <sys/ioctl.h>
/* This was introduced in version 4.6. And may not exist all without an
* optional package. So to prevent a hard dependency on needing the Linux
@@ -31,6 +32,7 @@
#endif
#include <drm.h>
+#include <libdrm/drm_fourcc.h>
#include <xf86drm.h>
#include "avassert.h"
@@ -38,7 +40,9 @@
#include "hwcontext_drm.h"
#include "hwcontext_internal.h"
#include "imgutils.h"
-
+#if CONFIG_SAND
+#include "libavutil/rpi_sand_fns.h"
+#endif
static void drm_device_free(AVHWDeviceContext *hwdev)
{
@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device,
AVDRMDeviceContext *hwctx = hwdev->hwctx;
drmVersionPtr version;
+ if (device == NULL) {
+ hwctx->fd = -1;
+ return 0;
+ }
+
hwctx->fd = open(device, O_RDWR);
if (hwctx->fd < 0)
return AVERROR(errno);
@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
if (flags & AV_HWFRAME_MAP_WRITE)
mmap_prot |= PROT_WRITE;
+ if (dst->format == AV_PIX_FMT_NONE)
+ dst->format = hwfc->sw_format;
#if HAVE_LINUX_DMA_BUF_H
if (flags & AV_HWFRAME_MAP_READ)
map->sync_flags |= DMA_BUF_SYNC_READ;
@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
dst->width = src->width;
dst->height = src->height;
+ dst->crop_top = src->crop_top;
+ dst->crop_bottom = src->crop_bottom;
+ dst->crop_left = src->crop_left;
+ dst->crop_right = src->crop_right;
+
+#if CONFIG_SAND
+ // Rework for sand frames
+ if (av_rpi_is_sand_frame(dst)) {
+ // As it stands the sand formats hold stride2 in linesize[3]
+ // linesize[0] & [1] contain stride1 which is always 128 for everything we do
+ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
+ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
+ dst->linesize[0] = 128;
+ dst->linesize[1] = 128;
+ // *** Are we sure src->height is actually what we want ???
+ }
+#endif
err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
&drm_unmap_frame, map);
@@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
if (!pix_fmts)
return AVERROR(ENOMEM);
- pix_fmts[0] = ctx->sw_format;
+ // **** Offer native sand too ????
+ pix_fmts[0] =
+#if CONFIG_SAND
+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
+ AV_PIX_FMT_YUV420P :
+ ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
+ AV_PIX_FMT_YUV420P10LE :
+#endif
+ ctx->sw_format;
pix_fmts[1] = AV_PIX_FMT_NONE;
*formats = pix_fmts;
@@ -231,18 +267,79 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
map = av_frame_alloc();
if (!map)
return AVERROR(ENOMEM);
- map->format = dst->format;
+ // Map to default
+ map->format = AV_PIX_FMT_NONE;
err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
if (err)
goto fail;
- map->width = dst->width;
- map->height = dst->height;
+#if 0
+ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
+ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
+ map->width, map->height,
+ map->linesize[0],
+ map->linesize[1],
+ map->linesize[2],
+ map->linesize[3],
+ dst->width, dst->height,
+ dst->linesize[0],
+ dst->linesize[1],
+ dst->linesize[2]);
+#endif
+#if CONFIG_SAND
+ if (av_rpi_is_sand_frame(map)) {
+        // Preserve crop - later ffmpeg code assumes that we have done so, in
+        // that it overwrites any crop that we create here with the old values
+        const unsigned int w = FFMIN(dst->width, map->width);
+        const unsigned int h = FFMIN(dst->height, map->height);
+        // Sand frames carry stride2 in linesize[3] (set up in drm_map_frame above)
+        const unsigned int stride2 = map->linesize[3];
+
+ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) {
+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
+ map->data[0],
+ 128, stride2,
+ 0, 0, w, h);
+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ map->data[1],
+ 128, stride2,
+ 0, 0, w / 2, h / 2);
+ }
+ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) {
+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
+ map->data[0],
+ 128, stride2,
+ 0, 0, w, h);
+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
+ dst->data[2], dst->linesize[2],
+ map->data[1],
+ 128, stride2,
+ 0, 0, w / 2, h / 2);
+ }
+ else
+ {
+ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
+ err = AVERROR(EINVAL);
+ goto fail;
+ }
+
+ dst->width = w;
+ dst->height = h;
+ }
+ else
+#endif
+ {
+ // Kludge mapped h/w s.t. frame_copy works
+ map->width = dst->width;
+ map->height = dst->height;
+ err = av_frame_copy(dst, map);
+ }
- err = av_frame_copy(dst, map);
if (err)
+ {
+ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
goto fail;
+ }
err = 0;
fail:
@@ -257,7 +354,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc,
int err;
if (src->width > hwfc->width || src->height > hwfc->height)
+ {
+ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height);
return AVERROR(EINVAL);
+ }
map = av_frame_alloc();
if (!map)
From 597858c11fbfbe0f54c1b68d9683025929258bc1 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 13 Aug 2021 15:38:28 +0100
Subject: [PATCH 023/161] Set frame interlace from V4L2 buffer field
---
libavcodec/v4l2_buffers.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index de31f7ced9..97b8eb1db3 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -222,6 +222,16 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
return AVCOL_TRC_UNSPECIFIED;
}
+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
+{
+ return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
+}
+
+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
+{
+ return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
+}
+
static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
{
AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
@@ -576,6 +586,8 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc
frame->color_trc = v4l2_get_color_trc(avbuf);
frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
frame->pkt_dts = AV_NOPTS_VALUE;
+ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
+ frame->top_field_first = v4l2_buf_is_top_first(avbuf);
/* these values are updated also during re-init in v4l2_process_driver_event */
frame->height = ctx->height;
From 05906e2086b5087d615485ec9a09b1493dbb32e1 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 13 Aug 2021 16:11:53 +0100
Subject: [PATCH 024/161] Fix V4L2 stateful to avoid crash if flush before
start
---
libavcodec/v4l2_context.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index a17ae027a6..eb901e8fab 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -713,6 +713,10 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
static void flush_all_buffers_status(V4L2Context* const ctx)
{
int i;
+
+ if (!ctx->bufrefs)
+ return;
+
for (i = 0; i < ctx->num_buffers; ++i) {
struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
if (buf->status == V4L2BUF_IN_DRIVER)
From 7157b6032e759078a7d751e5dd5762970f3d1e8c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 9 Sep 2021 17:44:13 +0100
Subject: [PATCH 025/161] Copy properties from frame to v4l2 buffer
ff_v4l2_buffer_avframe_to_buf now copies all of the properties that
ff_v4l2_buffer_buf_to_avframe copies.
---
libavcodec/v4l2_buffers.c | 126 ++++++++++++++++++++++++++++++++++++++
1 file changed, 126 insertions(+)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 97b8eb1db3..126d2a17f4 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -128,6 +128,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
return AVCOL_PRI_UNSPECIFIED;
}
+static void v4l2_set_color(V4L2Buffer *buf,
+ const enum AVColorPrimaries avcp,
+ const enum AVColorSpace avcs,
+ const enum AVColorTransferCharacteristic avxc)
+{
+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
+
+ switch (avcp) {
+ case AVCOL_PRI_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ ycbcr = V4L2_YCBCR_ENC_709;
+ break;
+ case AVCOL_PRI_BT470M:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ ycbcr = V4L2_YCBCR_ENC_601;
+ break;
+ case AVCOL_PRI_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_PRI_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_PRI_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_PRI_BT2020:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ case AVCOL_PRI_SMPTE428:
+ case AVCOL_PRI_SMPTE431:
+ case AVCOL_PRI_SMPTE432:
+ case AVCOL_PRI_EBU3213:
+ case AVCOL_PRI_RESERVED:
+ case AVCOL_PRI_FILM:
+ case AVCOL_PRI_UNSPECIFIED:
+ default:
+ break;
+ }
+
+ switch (avcs) {
+ case AVCOL_SPC_RGB:
+ cs = V4L2_COLORSPACE_SRGB;
+ break;
+ case AVCOL_SPC_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ break;
+ case AVCOL_SPC_FCC:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ break;
+ case AVCOL_SPC_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_SPC_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_SPC_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_SPC_BT2020_CL:
+ cs = V4L2_COLORSPACE_BT2020;
+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
+ break;
+ case AVCOL_SPC_BT2020_NCL:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ default:
+ break;
+ }
+
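+    // Map the AVColorTransferCharacteristic on to a V4L2 transfer function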
+    switch (avxc) {
+ case AVCOL_TRC_BT709:
+ xfer = V4L2_XFER_FUNC_709;
+ break;
+ case AVCOL_TRC_IEC61966_2_1:
+ xfer = V4L2_XFER_FUNC_SRGB;
+ break;
+ case AVCOL_TRC_SMPTE240M:
+ xfer = V4L2_XFER_FUNC_SMPTE240M;
+ break;
+ case AVCOL_TRC_SMPTE2084:
+ xfer = V4L2_XFER_FUNC_SMPTE2084;
+ break;
+ default:
+ break;
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
+ buf->context->format.fmt.pix_mp.colorspace = cs;
+ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
+ buf->context->format.fmt.pix_mp.xfer_func = xfer;
+ } else {
+ buf->context->format.fmt.pix.colorspace = cs;
+ buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
+ buf->context->format.fmt.pix.xfer_func = xfer;
+ }
+}
+
static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
{
enum v4l2_quantization qt;
@@ -146,6 +245,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
return AVCOL_RANGE_UNSPECIFIED;
}
+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
+{
+ const enum v4l2_quantization q =
+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
+ V4L2_QUANTIZATION_DEFAULT;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
+ buf->context->format.fmt.pix_mp.quantization = q;
+ } else {
+ buf->context->format.fmt.pix.quantization = q;
+ }
+}
+
static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
{
enum v4l2_ycbcr_encoding ycbcr;
@@ -232,6 +345,12 @@ static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
}
+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
+{
+ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE :
+ is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
+}
+
static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
{
AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
@@ -561,7 +680,14 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
{
+    out->buf.flags = frame->key_frame ? (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+ // Beware that colour info is held in format rather than the actual
+ // v4l2 buffer struct so this may not be as useful as you might hope
+ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
+ v4l2_set_color_range(out, frame->color_range);
+ // PTS & interlace are buffer vars
v4l2_set_pts(out, frame->pts, 0);
+ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
return v4l2_buffer_swframe_to_buf(frame, out);
}
From 15415ab226f966fd12e70d79fde3cb80f3d09144 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 17 Nov 2021 16:49:01 +0000
Subject: [PATCH 026/161] ffmpeg: Do not inc DTS on no decode output
V4L2 H264 decode has long latency and sometimes spits out a long stream
of output without any input. In that case incrementing DTS is wrong.
There may be cases where the condition as originally written is correct,
so only "fix" it in the cases which cause problems.
---
fftools/ffmpeg.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index 5dc2cd73c1..ba0c1898cf 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -2609,7 +2609,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo
case AVMEDIA_TYPE_VIDEO:
ret = decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt,
&decode_failed);
- if (!repeating || !pkt || got_output) {
+ // Pi: Do not inc dts if no_cvt_hw set
+ // V4L2 H264 decode has long latency and sometimes spits out a long
+ // stream of output without input. In this case incrementing DTS is wrong.
+            // There may be cases where the condition as written is correct, so only
+            // "fix" it in the cases which cause problems
+ if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
if (pkt && pkt->duration) {
duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
} else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
From 7bf6c062ed8a1e635aa5722c0072724f236daf00 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 17 Nov 2021 17:32:59 +0000
Subject: [PATCH 027/161] v4l2_m2m_dec: Adjust timebase if H264
Adjust AVCodecContext time_base if H264 in the same way that the
software decoder does.
---
libavcodec/v4l2_m2m_dec.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 1851acbc93..aa1e5c1597 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -481,6 +481,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ if (avctx->codec_id == AV_CODEC_ID_H264) {
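+        // Match what the software h264 decoder does: h264 time_base ticks
+        // count fields, so double the den (halving the tick) and set
+        // ticks_per_frame to 2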
+ if (avctx->ticks_per_frame == 1) {
+ if(avctx->time_base.den < INT_MAX/2) {
+ avctx->time_base.den *= 2;
+ } else
+ avctx->time_base.num /= 2;
+ }
+ avctx->ticks_per_frame = 2;
+ }
+
av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
From 3cd23a761397ae75ed032c1687da5d6b76ddaaaa Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 17 Nov 2021 17:38:27 +0000
Subject: [PATCH 028/161] v4l2_m2m_dec: Produce best guess PTSs if none
supplied
Filter scheduling gets confused by missing PTSs and makes poor guesses
more often than not. Try to generate plausible timestamps where we are
missing them.
---
libavcodec/v4l2_m2m.h | 12 ++++++++
libavcodec/v4l2_m2m_dec.c | 64 +++++++++++++++++++++++++++++++++++++--
2 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 8f054f2f50..82feb0afdb 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -52,6 +52,16 @@ typedef struct V4L2m2mTrackEl {
int64_t track_pts;
} V4L2m2mTrackEl;
+typedef struct pts_stats_s
+{
+ void * logctx;
+ const char * name; // For debug
+ unsigned int last_count;
+ unsigned int last_interval;
+ int64_t last_pts;
+ int64_t guess;
+} pts_stats_t;
+
typedef struct V4L2m2mContext {
char devname[PATH_MAX];
int fd;
@@ -91,6 +101,8 @@ typedef struct V4L2m2mContext {
unsigned int track_no;
V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
+ pts_stats_t pts_stat;
+
/* req pkt */
int req_pkt;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index aa1e5c1597..a5a2afbd27 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -42,6 +42,62 @@
#include "v4l2_m2m.h"
#include "v4l2_fmt.h"
+// Pick 64 for max last count - that is >1sec at 60fps
+#define STATS_LAST_COUNT_MAX 64
+#define STATS_INTERVAL_MAX (1 << 30)
+
+static int64_t pts_stats_guess(const pts_stats_t * const stats)
+{
+ if (stats->last_pts == AV_NOPTS_VALUE ||
+ stats->last_interval == 0 ||
+ stats->last_count >= STATS_LAST_COUNT_MAX)
+ return AV_NOPTS_VALUE;
+ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
+}
+
+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
+{
+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
+ if (stats->last_count < STATS_LAST_COUNT_MAX)
+ ++stats->last_count;
+ return;
+ }
+
+ if (stats->last_pts != AV_NOPTS_VALUE) {
+ const int64_t interval = pts - stats->last_pts;
+
+ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
+ stats->last_count >= STATS_LAST_COUNT_MAX) {
+ if (stats->last_interval != 0)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
+ __func__, stats->name, interval, stats->last_count);
+ stats->last_interval = 0;
+ }
+ else {
+ const int64_t frame_time = interval / (int64_t)stats->last_count;
+
+ if (frame_time != stats->last_interval)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
+ stats->last_interval = frame_time;
+ }
+ }
+
+ stats->last_pts = pts;
+ stats->last_count = 1;
+}
+
+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
+{
+ *stats = (pts_stats_t){
+ .logctx = logctx,
+ .name = name,
+ .last_count = 1,
+ .last_interval = 0,
+ .last_pts = AV_NOPTS_VALUE
+ };
+}
+
static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
{
int ret;
@@ -244,9 +300,11 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons
return -1;
}
- frame->best_effort_timestamp = frame->pts;
+ pts_stats_add(&s->pts_stat, frame->pts);
+
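+    // pts_stats_guess() gives a best-effort value: the real PTS when a new
+    // one was supplied, otherwise an extrapolation from the last known PTS
+    // and frame interval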
+ frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat);
frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner?
- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts);
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
return 0;
}
@@ -496,6 +554,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
if (ret < 0)
return ret;
+ pts_stats_init(&s->pts_stat, avctx, "decoder");
+
capture = &s->capture;
output = &s->output;
From ee8be1e900f98212b6c4940980cc7a80becfc07c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 17 Nov 2021 17:59:27 +0000
Subject: [PATCH 029/161] v4l2_m2m_dec: Try harder to get an initial frame
If the input Q is full then wait on a short timeout for a capture frame
rather than stuffing yet another frame into the input just because we
could. This attempts to restrict the sometimes daft initial
buffering that ends up confusing the rest of the system.
---
libavcodec/v4l2_context.c | 2 +-
libavcodec/v4l2_m2m_dec.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index eb901e8fab..ee5dc7b8d4 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -381,7 +381,7 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
start:
if (is_capture) {
/* no need to listen to requests for more input while draining */
- if (ctx_to_m2mctx(ctx)->draining)
+ if (ctx_to_m2mctx(ctx)->draining || timeout > 0)
pfd.events = POLLIN | POLLRDNORM | POLLPRI;
} else {
pfd.events = POLLOUT | POLLWRNORM;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index a5a2afbd27..b49f470c0a 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -442,7 +442,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
// when discarding
// This returns AVERROR(EAGAIN) if there isn't a frame ready yet
// but there is room in the input Q
- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1);
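+    // If the input Q is full allow up to 100ms for a capture frame to appear
+    // rather than immediately trying to queue yet more input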
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1);
if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
From 72da14331c2160a12b69d666d493e0e74c5e8914 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 17 Nov 2021 18:04:56 +0000
Subject: [PATCH 030/161] Add a V4L2 M2M deinterlace filter
Add a V4L2 deinterlace filter that will accept DRMPRIME frames.
Multiple people have contributed to this:
Jernej Skrabec <jernej.skrabec@siol.net>
Alex Bee <knaerzche@gmail.com>
popcornmix <popcornmix@gmail.com>
John Cox <jc@kynesim.co.uk>
There is an unknown delay through the filter of typically one or three
fields which translates to 1 or 2 frames. Frames that are delayed are
lost at end of stream as the V4L2 filter has no flush control.
---
libavcodec/v4l2_context.c | 4 +-
libavfilter/Makefile | 1 +
libavfilter/allfilters.c | 1 +
libavfilter/vf_deinterlace_v4l2m2m.c | 1269 ++++++++++++++++++++++++++
4 files changed, 1273 insertions(+), 2 deletions(-)
create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index ee5dc7b8d4..440dfaaba5 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -498,10 +498,10 @@ dequeue:
return NULL;
}
--ctx->q_count;
- av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n",
+ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n",
ctx->name, buf.index,
buf.timestamp.tv_sec, buf.timestamp.tv_usec,
- ctx->q_count, ++ctx->dq_count);
+ ctx->q_count, ++ctx->dq_count, buf.field);
avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
avbuf->status = V4L2BUF_AVAILABLE;
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index c14fc995a0..0e7b5856bd 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -262,6 +262,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o
OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o
OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_vpp_qsv.o
OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o
+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o
OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o
OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o
OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index b990a00152..357ff61ca8 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -248,6 +248,7 @@ extern const AVFilter ff_vf_derain;
extern const AVFilter ff_vf_deshake;
extern const AVFilter ff_vf_deshake_opencl;
extern const AVFilter ff_vf_despill;
+extern const AVFilter ff_vf_deinterlace_v4l2m2m;
extern const AVFilter ff_vf_detelecine;
extern const AVFilter ff_vf_dilation;
extern const AVFilter ff_vf_dilation_opencl;
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
new file mode 100644
index 0000000000..1a933b7e0a
--- /dev/null
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -0,0 +1,1269 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * deinterlace video filter - V4L2 M2M
+ */
+
+#include <drm_fourcc.h>
+
+#include <linux/videodev2.h>
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavutil/internal.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/time.h"
+
+#define FF_INTERNAL_FIELDS 1
+#include "framequeue.h"
+#include "filters.h"
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+typedef struct V4L2Queue V4L2Queue;
+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
+
+typedef struct V4L2PlaneInfo {
+ int bytesperline;
+ size_t length;
+} V4L2PlaneInfo;
+
+typedef struct V4L2Buffer {
+ int enqueued;
+ int reenqueue;
+ int fd;
+ struct v4l2_buffer buffer;
+ AVFrame frame;
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ int num_planes;
+ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
+ AVDRMFrameDescriptor drm_frame;
+ V4L2Queue *q;
+} V4L2Buffer;
+
+typedef struct V4L2Queue {
+ struct v4l2_format format;
+ int num_buffers;
+ V4L2Buffer *buffers;
+ DeintV4L2M2MContextShared *ctx;
+} V4L2Queue;
+
+typedef struct pts_stats_s
+{
+ void * logctx;
+ const char * name; // For debug
+ unsigned int last_count;
+ unsigned int last_interval;
+ int64_t last_pts;
+} pts_stats_t;
+
+#define PTS_TRACK_SIZE 32
+typedef struct pts_track_el_s
+{
+ uint32_t n;
+ unsigned int interval;
+ AVFrame * props;
+} pts_track_el_t;
+
+typedef struct pts_track_s
+{
+ uint32_t n;
+ uint32_t last_n;
+ int got_2;
+ void * logctx;
+ pts_stats_t stats;
+ pts_track_el_t a[PTS_TRACK_SIZE];
+} pts_track_t;
+
+typedef struct DeintV4L2M2MContextShared {
+ void * logctx; // For logging - will be NULL when done
+
+ int fd;
+ int done;
+ int width;
+ int height;
+ int orig_width;
+ int orig_height;
+ atomic_uint refcount;
+
+ AVBufferRef *hw_frames_ctx;
+
+ unsigned int field_order;
+
+ pts_track_t track;
+
+ V4L2Queue output;
+ V4L2Queue capture;
+} DeintV4L2M2MContextShared;
+
+typedef struct DeintV4L2M2MContext {
+ const AVClass *class;
+
+ DeintV4L2M2MContextShared *shared;
+} DeintV4L2M2MContext;
+
+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
+{
+ return stats->last_interval;
+}
+
+// Pick 64 for max last count - that is >1sec at 60fps
+#define STATS_LAST_COUNT_MAX 64
+#define STATS_INTERVAL_MAX (1 << 30)
+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
+{
+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
+ if (stats->last_count < STATS_LAST_COUNT_MAX)
+ ++stats->last_count;
+ return;
+ }
+
+ if (stats->last_pts != AV_NOPTS_VALUE) {
+ const int64_t interval = pts - stats->last_pts;
+
+ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
+ stats->last_count >= STATS_LAST_COUNT_MAX) {
+ if (stats->last_interval != 0)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
+ __func__, stats->name, interval, stats->last_count);
+ stats->last_interval = 0;
+ }
+ else {
+ const int64_t frame_time = interval / (int64_t)stats->last_count;
+
+ if (frame_time != stats->last_interval)
+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
+ stats->last_interval = frame_time;
+ }
+ }
+
+ stats->last_pts = pts;
+ stats->last_count = 1;
+}
+
+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
+{
+ *stats = (pts_stats_t){
+ .logctx = logctx,
+ .name = name,
+ .last_count = 1,
+ .last_interval = 0,
+ .last_pts = AV_NOPTS_VALUE
+ };
+}
+
+static inline uint32_t pts_track_next_n(pts_track_t * const trk)
+{
+ if (++trk->n == 0)
+ trk->n = 1;
+ return trk->n;
+}
+
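+// Recover the track index that pts_track_add_frame() encoded into the V4L2
+// timestamp: n is stored as {tv_sec = n / 500000, tv_usec = (n % 500000) * 2},
+// the spare odd usecs being left for frames the deinterlacer interpolates.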
+static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
+{
+ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
+ pts_track_el_t * t;
+
+ // As a first guess assume that n==0 means last frame
+ if (n == 0) {
+ n = trk->last_n;
+ if (n == 0)
+ goto fail;
+ }
+
+ t = trk->a + (n & (PTS_TRACK_SIZE - 1));
+
+ if (t->n != n) {
+ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
+ goto fail;
+ }
+
+ // 1st frame is simple - just believe it
+ if (n != trk->last_n) {
+ trk->last_n = n;
+ trk->got_2 = 0;
+ return av_frame_copy_props(dst, t->props);
+ }
+
+ // Only believe in a single interpolated frame
+ if (trk->got_2)
+ goto fail;
+ trk->got_2 = 1;
+
+ av_frame_copy_props(dst, t->props);
+
+
+ // If we can't guess - don't
+ if (t->interval == 0) {
+ dst->best_effort_timestamp = AV_NOPTS_VALUE;
+ dst->pts = AV_NOPTS_VALUE;
+ dst->pkt_dts = AV_NOPTS_VALUE;
+ }
+ else {
+ if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
+ dst->best_effort_timestamp += t->interval / 2;
+ if (dst->pts != AV_NOPTS_VALUE)
+ dst->pts += t->interval / 2;
+ if (dst->pkt_dts != AV_NOPTS_VALUE)
+ dst->pkt_dts += t->interval / 2;
+ }
+
+ return 0;
+
+fail:
+ trk->last_n = 0;
+ trk->got_2 = 0;
+ dst->pts = AV_NOPTS_VALUE;
+ dst->pkt_dts = AV_NOPTS_VALUE;
+ return 0;
+}
+
+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
+{
+ const uint32_t n = pts_track_next_n(trk);
+ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1));
+
+ pts_stats_add(&trk->stats, src->pts);
+
+ t->n = n;
+ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
+ av_frame_unref(t->props);
+ av_frame_copy_props(t->props, src);
+
+ // We now know what the previous interval was, rather than having to guess,
+    // so set it. There is a better than decent chance that this happens before
+    // we actually need to use it.
+ if (t->interval != 0) {
+ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
+ prev_t->interval = t->interval;
+ }
+
+ // In case deinterlace interpolates frames use every other usec
+ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2};
+}
+
+static void pts_track_uninit(pts_track_t * const trk)
+{
+ unsigned int i;
+ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
+ trk->a[i].n = 0;
+ av_frame_free(&trk->a[i].props);
+ }
+}
+
+static int pts_track_init(pts_track_t * const trk, void *logctx)
+{
+ unsigned int i;
+ trk->n = 1;
+ pts_stats_init(&trk->stats, logctx, "track");
+ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
+ trk->a[i].n = 0;
+ if ((trk->a[i].props = av_frame_alloc()) == NULL) {
+ pts_track_uninit(trk);
+ return AVERROR(ENOMEM);
+ }
+ }
+ return 0;
+}
+
+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
+{
+ struct v4l2_capability cap;
+ int ret;
+
+ memset(&cap, 0, sizeof(cap));
+ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
+ if (ret < 0)
+ return ret;
+
+ if (!(cap.capabilities & V4L2_CAP_STREAMING))
+ return AVERROR(EINVAL);
+
+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+
+ return 0;
+ }
+
+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+
+ return 0;
+ }
+
+ return AVERROR(EINVAL);
+}
+
+static int deint_v4l2m2m_try_format(V4L2Queue *queue)
+{
+ struct v4l2_format *fmt = &queue->format;
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ int ret, field;
+
+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
+ if (ret)
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
+
+ if (V4L2_TYPE_IS_OUTPUT(fmt->type))
+ field = V4L2_FIELD_INTERLACED_TB;
+ else
+ field = V4L2_FIELD_NONE;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
+ fmt->fmt.pix_mp.field = field;
+ fmt->fmt.pix_mp.width = ctx->width;
+ fmt->fmt.pix_mp.height = ctx->height;
+ } else {
+ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
+ fmt->fmt.pix.field = field;
+ fmt->fmt.pix.width = ctx->width;
+ fmt->fmt.pix.height = ctx->height;
+ }
+
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
+ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
+ fmt->fmt.pix_mp.pixelformat,
+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
+
+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
+ if (ret)
+ return AVERROR(EINVAL);
+
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
+ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
+ fmt->fmt.pix_mp.pixelformat,
+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 ||
+ fmt->fmt.pix_mp.field != field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
+
+ return AVERROR(EINVAL);
+ }
+ } else {
+ if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 ||
+ fmt->fmt.pix.field != field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
+
+ return AVERROR(EINVAL);
+ }
+ }
+
+ return 0;
+}
+
+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize)
+{
+ struct v4l2_format *fmt = &queue->format;
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ int ret;
+
+ struct v4l2_selection sel = {
+ .type = fmt->type,
+ .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
+ };
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.field = field;
+ fmt->fmt.pix_mp.width = width;
+ fmt->fmt.pix_mp.height = ysize / pitch;
+ fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
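+        // sizeimage covers the whole YUV420 image: the luma plane (ysize)
+        // plus two quarter-size chroma planes, i.e. 3/2 * ysize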
+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
+ } else {
+ fmt->fmt.pix.field = field;
+ fmt->fmt.pix.width = width;
+ fmt->fmt.pix.height = height;
+ fmt->fmt.pix.sizeimage = 0;
+ fmt->fmt.pix.bytesperline = 0;
+ }
+
+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
+ if (ret)
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
+
+ ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
+ if (ret)
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret);
+
+ sel.r.width = width;
+ sel.r.height = height;
+ sel.r.left = 0;
+ sel.r.top = 0;
+ sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
+ sel.flags = V4L2_SEL_FLAG_LE;
+
+ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
+ if (ret)
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret);
+
+ return ret;
+}
+
+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
+{
+ int ret;
+
+ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
+ if (ctx->fd < 0)
+ return AVERROR(errno);
+
+ ret = deint_v4l2m2m_prepare_context(ctx);
+ if (ret)
+ goto fail;
+
+ ret = deint_v4l2m2m_try_format(&ctx->capture);
+ if (ret)
+ goto fail;
+
+ ret = deint_v4l2m2m_try_format(&ctx->output);
+ if (ret)
+ goto fail;
+
+ return 0;
+
+fail:
+ close(ctx->fd);
+ ctx->fd = -1;
+
+ return ret;
+}
+
+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
+{
+ int ret = AVERROR(EINVAL);
+ struct dirent *entry;
+ char node[PATH_MAX];
+ DIR *dirp;
+
+ dirp = opendir("/dev");
+ if (!dirp)
+ return AVERROR(errno);
+
+ for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
+
+ if (strncmp(entry->d_name, "video", 5))
+ continue;
+
+ snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
+ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
+ ret = deint_v4l2m2m_probe_device(ctx, node);
+ if (!ret)
+ break;
+ }
+
+ closedir(dirp);
+
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
+ ctx->fd = -1;
+
+ return ret;
+ }
+
+ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
+
+ return 0;
+}
+
+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
+{
+ int ret;
+
+ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
+ if (ret < 0)
+ return AVERROR(errno);
+
+ buf->enqueued = 1;
+
+ return 0;
+}
+
+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
+{
+ struct v4l2_exportbuffer expbuf;
+ int i, ret;
+
+ for (i = 0; i < avbuf->num_planes; i++) {
+ memset(&expbuf, 0, sizeof(expbuf));
+
+ expbuf.index = avbuf->buffer.index;
+ expbuf.type = avbuf->buffer.type;
+ expbuf.plane = i;
+
+ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
+ if (ret < 0)
+ return AVERROR(errno);
+
+ avbuf->fd = expbuf.fd;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
+ /* drm frame */
+ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
+ avbuf->drm_frame.objects[i].fd = expbuf.fd;
+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ } else {
+ /* drm frame */
+ avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
+ avbuf->drm_frame.objects[0].fd = expbuf.fd;
+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ }
+ }
+
+ return 0;
+}
+
+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
+{
+ struct v4l2_format *fmt = &queue->format;
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ struct v4l2_requestbuffers req;
+ int ret, i, j, multiplanar;
+ uint32_t memory;
+
+ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+
+ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
+
+ memset(&req, 0, sizeof(req));
+ req.count = queue->num_buffers;
+ req.memory = memory;
+ req.type = fmt->type;
+
+ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
+ if (ret < 0) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
+
+ return AVERROR(errno);
+ }
+
+ queue->num_buffers = req.count;
+ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
+ if (!queue->buffers) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
+
+ return AVERROR(ENOMEM);
+ }
+
+ for (i = 0; i < queue->num_buffers; i++) {
+ V4L2Buffer *buf = &queue->buffers[i];
+
+ buf->enqueued = 0;
+ buf->fd = -1;
+ buf->q = queue;
+
+ buf->buffer.type = fmt->type;
+ buf->buffer.memory = memory;
+ buf->buffer.index = i;
+
+ if (multiplanar) {
+ buf->buffer.length = VIDEO_MAX_PLANES;
+ buf->buffer.m.planes = buf->planes;
+ }
+
+ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
+ if (ret < 0) {
+ ret = AVERROR(errno);
+
+ goto fail;
+ }
+
+ if (multiplanar)
+ buf->num_planes = buf->buffer.length;
+ else
+ buf->num_planes = 1;
+
+ for (j = 0; j < buf->num_planes; j++) {
+ V4L2PlaneInfo *info = &buf->plane_info[j];
+
+ if (multiplanar) {
+ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
+ info->length = buf->buffer.m.planes[j].length;
+ } else {
+ info->bytesperline = fmt->fmt.pix.bytesperline;
+ info->length = buf->buffer.length;
+ }
+ }
+
+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
+ ret = deint_v4l2m2m_enqueue_buffer(buf);
+ if (ret)
+ goto fail;
+
+ ret = v4l2_buffer_export_drm(buf);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ for (i = 0; i < queue->num_buffers; i++)
+ if (queue->buffers[i].fd >= 0)
+ close(queue->buffers[i].fd);
+ av_free(queue->buffers);
+ queue->buffers = NULL;
+
+ return ret;
+}
+
+static int deint_v4l2m2m_streamon(V4L2Queue *queue)
+{
+ DeintV4L2M2MContextShared * const ctx = queue->ctx;
+ int type = queue->format.type;
+ int ret;
+
+ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
+ if (ret < 0)
+ return AVERROR(errno);
+
+ return 0;
+}
+
+static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
+{
+ DeintV4L2M2MContextShared * const ctx = queue->ctx;
+ int type = queue->format.type;
+ int ret;
+
+ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
+ if (ret < 0)
+ return AVERROR(errno);
+
+ return 0;
+}
+
+// timeout in ms
+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
+{
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ struct v4l2_buffer buf = { 0 };
+ V4L2Buffer* avbuf = NULL;
+ struct pollfd pfd;
+ short events;
+ int ret;
+
+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
+ events = POLLOUT | POLLWRNORM;
+ else
+ events = POLLIN | POLLRDNORM;
+
+ pfd.events = events;
+ pfd.fd = ctx->fd;
+
+ for (;;) {
+ ret = poll(&pfd, 1, timeout);
+ if (ret > 0)
+ break;
+ if (errno == EINTR)
+ continue;
+ return NULL;
+ }
+
+ if (pfd.revents & POLLERR)
+ return NULL;
+
+ if (pfd.revents & events) {
+ memset(&buf, 0, sizeof(buf));
+ buf.memory = V4L2_MEMORY_MMAP;
+ buf.type = queue->format.type;
+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
+ memset(planes, 0, sizeof(planes));
+ buf.length = VIDEO_MAX_PLANES;
+ buf.m.planes = planes;
+ }
+
+ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
+ if (ret) {
+ if (errno != EAGAIN)
+ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
+ av_err2str(AVERROR(errno)));
+ return NULL;
+ }
+
+ avbuf = &queue->buffers[buf.index];
+ avbuf->enqueued = 0;
+ avbuf->buffer = buf;
+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
+ memcpy(avbuf->planes, planes, sizeof(planes));
+ avbuf->buffer.m.planes = avbuf->planes;
+ }
+ return avbuf;
+ }
+
+ return NULL;
+}
+
+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
+{
+ int i;
+ V4L2Buffer *buf = NULL;
+
+ for (i = 0; i < queue->num_buffers; i++)
+ if (!queue->buffers[i].enqueued) {
+ buf = &queue->buffers[i];
+ break;
+ }
+ return buf;
+}
+
+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
+{
+ int i;
+ V4L2Buffer *buf = NULL;
+
+ if (!queue || !queue->buffers)
+ return;
+ for (i = 0; i < queue->num_buffers; i++) {
+ buf = &queue->buffers[i];
+ if (queue->buffers[i].enqueued)
+ av_frame_unref(&buf->frame);
+ }
+}
+
+static void recycle_q(V4L2Queue * const queue)
+{
+ V4L2Buffer* avbuf;
+ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
+ av_frame_unref(&avbuf->frame);
+ }
+}
+
+static int count_enqueued(V4L2Queue *queue)
+{
+ int i;
+ int n = 0;
+
+ if (queue->buffers == NULL)
+ return 0;
+
+ for (i = 0; i < queue->num_buffers; i++)
+ if (queue->buffers[i].enqueued)
+ ++n;
+ return n;
+}
+
+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
+{
+ DeintV4L2M2MContextShared *const ctx = queue->ctx;
+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
+ V4L2Buffer *buf;
+ int i;
+
+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
+ recycle_q(queue);
+
+ buf = deint_v4l2m2m_find_free_buf(queue);
+ if (!buf) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
+ return AVERROR(EAGAIN);
+ }
+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
+ for (i = 0; i < drm_desc->nb_objects; i++)
+ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
+ else
+ buf->buffer.m.fd = drm_desc->objects[0].fd;
+
+ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
+ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
+ V4L2_FIELD_INTERLACED_BT;
+
+ if (ctx->field_order != buf->buffer.field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
+ ctx->field_order = buf->buffer.field;
+ }
+
+ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
+
+ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
+
+ av_frame_move_ref(&buf->frame, frame);
+
+ return deint_v4l2m2m_enqueue_buffer(buf);
+}
+
+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
+{
+ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
+ V4L2Queue *capture = &ctx->capture;
+ V4L2Queue *output = &ctx->output;
+ int i;
+
+ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
+
+ if (ctx->fd >= 0) {
+ deint_v4l2m2m_streamoff(capture);
+ deint_v4l2m2m_streamoff(output);
+ }
+
+ if (capture->buffers)
+ for (i = 0; i < capture->num_buffers; i++) {
+ capture->buffers[i].q = NULL;
+ if (capture->buffers[i].fd >= 0)
+ close(capture->buffers[i].fd);
+ }
+
+ deint_v4l2m2m_unref_queued(output);
+
+ av_buffer_unref(&ctx->hw_frames_ctx);
+
+ if (capture->buffers)
+ av_free(capture->buffers);
+
+ if (output->buffers)
+ av_free(output->buffers);
+
+ if (ctx->fd >= 0) {
+ close(ctx->fd);
+ ctx->fd = -1;
+ }
+
+ av_free(ctx);
+ }
+}
+
+static void v4l2_free_buffer(void *opaque, uint8_t *unused)
+{
+ V4L2Buffer *buf = opaque;
+ DeintV4L2M2MContextShared *ctx = buf->q->ctx;
+
+ if (!ctx->done)
+ deint_v4l2m2m_enqueue_buffer(buf);
+
+ deint_v4l2m2m_destroy_context(ctx);
+}
+
+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
+{
+ int av_pix_fmt = AV_PIX_FMT_YUV420P;
+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
+ AVDRMLayerDescriptor *layer;
+
+ /* fill the DRM frame descriptor */
+ drm_desc->nb_objects = avbuf->num_planes;
+ drm_desc->nb_layers = 1;
+
+ layer = &drm_desc->layers[0];
+ layer->nb_planes = avbuf->num_planes;
+
+ for (int i = 0; i < avbuf->num_planes; i++) {
+ layer->planes[i].object_index = i;
+ layer->planes[i].offset = 0;
+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
+ }
+
+ switch (av_pix_fmt) {
+ case AV_PIX_FMT_YUYV422:
+
+ layer->format = DRM_FORMAT_YUYV;
+ layer->nb_planes = 1;
+
+ break;
+
+ case AV_PIX_FMT_NV12:
+ case AV_PIX_FMT_NV21:
+
+ layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ?
+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
+ height;
+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
+ break;
+
+ case AV_PIX_FMT_YUV420P:
+
+ layer->format = DRM_FORMAT_YUV420;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 3;
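+        // All three planes live in the single V4L2 buffer: U follows the
+        // full-size Y plane and V follows the quarter-size U plane, both at
+        // half the luma pitch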
+
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
+ height;
+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
+
+ layer->planes[2].object_index = 0;
+ layer->planes[2].offset = layer->planes[1].offset +
+ ((avbuf->plane_info[0].bytesperline *
+ height) >> 2);
+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
+ break;
+
+ default:
+ drm_desc->nb_layers = 0;
+ break;
+ }
+
+ return (uint8_t *) drm_desc;
+}
+
+// timeout in ms
+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
+{
+ DeintV4L2M2MContextShared *ctx = queue->ctx;
+ V4L2Buffer* avbuf;
+
+ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+
+ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
+ if (!avbuf) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
+ return AVERROR(EAGAIN);
+ }
+
+    // Fill in PTS and ancillary info from the src frame.
+    // We will want to overwrite some fields as only the pts/dts
+    // fields are updated with new timing in this fn
+ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
+
+ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
+ sizeof(avbuf->drm_frame), v4l2_free_buffer,
+ avbuf, AV_BUFFER_FLAG_READONLY);
+ if (!frame->buf[0]) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
+ return AVERROR(ENOMEM);
+ }
+
+ atomic_fetch_add(&ctx->refcount, 1);
+
+ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
+ frame->format = AV_PIX_FMT_DRM_PRIME;
+ if (ctx->hw_frames_ctx)
+ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
+ frame->height = ctx->height;
+ frame->width = ctx->width;
+
+ // Not interlaced now
+ frame->interlaced_frame = 0;
+ frame->top_field_first = 0;
+ // Pkt duration halved
+ frame->pkt_duration /= 2;
+
+ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
+ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
+ }
+
+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
+ return 0;
+}
+
+static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
+{
+ AVFilterLink *inlink = outlink->src->inputs[0];
+ AVFilterContext *avctx = outlink->src;
+ DeintV4L2M2MContext *priv = avctx->priv;
+ DeintV4L2M2MContextShared *ctx = priv->shared;
+ int ret;
+
+ ctx->height = avctx->inputs[0]->h;
+ ctx->width = avctx->inputs[0]->w;
+
+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
+
+ outlink->time_base = inlink->time_base;
+ outlink->w = inlink->w;
+ outlink->h = inlink->h;
+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+ outlink->format = inlink->format;
+ outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate
+
+ ret = deint_v4l2m2m_find_device(ctx);
+ if (ret)
+ return ret;
+
+ if (inlink->hw_frames_ctx) {
+ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
+ if (!ctx->hw_frames_ctx)
+ return AVERROR(ENOMEM);
+ }
+ return 0;
+}
+
+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+ AVFilterContext *avctx = link->dst;
+ DeintV4L2M2MContext *priv = avctx->priv;
+ DeintV4L2M2MContextShared *ctx = priv->shared;
+ V4L2Queue *capture = &ctx->capture;
+ V4L2Queue *output = &ctx->output;
+ int ret;
+
+ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
+ __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
+ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
+ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
+
+ if (ctx->field_order == V4L2_FIELD_ANY) {
+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0];
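+        // Recover the allocated buffer geometry from the DRM descriptor:
+        // the luma pitch gives the allocated width and the U plane offset
+        // divided by that pitch gives the allocated height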
+ ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
+ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
+
+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
+ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
+
+ if (in->top_field_first)
+ ctx->field_order = V4L2_FIELD_INTERLACED_TB;
+ else
+ ctx->field_order = V4L2_FIELD_INTERLACED_BT;
+
+ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
+ if (ret)
+ return ret;
+
+ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
+ if (ret)
+ return ret;
+
+ ret = deint_v4l2m2m_allocate_buffers(capture);
+ if (ret)
+ return ret;
+
+ ret = deint_v4l2m2m_streamon(capture);
+ if (ret)
+ return ret;
+
+ ret = deint_v4l2m2m_allocate_buffers(output);
+ if (ret)
+ return ret;
+
+ ret = deint_v4l2m2m_streamon(output);
+ if (ret)
+ return ret;
+ }
+
+ ret = deint_v4l2m2m_enqueue_frame(output, in);
+
+ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
+ return ret;
+}
+
+static int deint_v4l2m2m_activate(AVFilterContext *avctx)
+{
+ DeintV4L2M2MContext * const priv = avctx->priv;
+ DeintV4L2M2MContextShared *const s = priv->shared;
+ AVFilterLink * const outlink = avctx->outputs[0];
+ AVFilterLink * const inlink = avctx->inputs[0];
+ int n = 0;
+ int cn = 99;
+ int instatus = 0;
+ int64_t inpts = 0;
+ int did_something = 0;
+
+ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
+
+ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
+
+ ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
+
+ if (!ff_outlink_frame_wanted(outlink)) {
+ av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
+ }
+ else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup!
+ {
+ AVFrame * frame = av_frame_alloc();
+ int rv;
+
+again:
+ recycle_q(&s->output);
+ n = count_enqueued(&s->output);
+
+ if (frame == NULL) {
+ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
+ return AVERROR(ENOMEM);
+ }
+
+ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
+ if (rv != 0) {
+ av_frame_free(&frame);
+ if (rv != AVERROR(EAGAIN)) {
+ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
+ return rv;
+ }
+ }
+ else {
+ frame->interlaced_frame = 0;
+ // frame is always consumed by filter_frame - even on error despite
+ // a somewhat confusing comment in the header
+ rv = ff_filter_frame(outlink, frame);
+
+ if (instatus != 0) {
+ av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
+ goto again;
+ }
+
+ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
+ did_something = 1;
+ }
+
+ cn = count_enqueued(&s->capture);
+ }
+
+ if (instatus != 0) {
+ ff_outlink_set_status(outlink, instatus, inpts);
+ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
+ return 0;
+ }
+
+ {
+ AVFrame * frame;
+ int rv;
+
+ recycle_q(&s->output);
+ n = count_enqueued(&s->output);
+
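+        // Try to keep around 6 source frames queued to the deinterlacer -
+        // it holds a few fields of latency before producing output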
+ while (n < 6) {
+ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
+ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
+ return rv;
+ }
+
+ if (frame == NULL) {
+ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
+ break;
+ }
+
+ deint_v4l2m2m_filter_frame(inlink, frame);
+ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
+ ++n;
+ }
+ }
+
+ if (n < 6) {
+ ff_inlink_request_frame(inlink);
+ did_something = 1;
+ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
+ }
+
+ if (n > 4 && ff_outlink_frame_wanted(outlink)) {
+ ff_filter_set_ready(avctx, 1);
+ did_something = 1;
+ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
+ }
+
+ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
+ return did_something ? 0 : FFERROR_NOT_READY;
+}
+
+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
+{
+ DeintV4L2M2MContext * const priv = avctx->priv;
+ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
+
+ if (!ctx) {
+ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
+ return AVERROR(ENOMEM);
+ }
+ priv->shared = ctx;
+ ctx->logctx = priv;
+ ctx->fd = -1;
+ ctx->output.ctx = ctx;
+ ctx->output.num_buffers = 8;
+ ctx->capture.ctx = ctx;
+ ctx->capture.num_buffers = 12;
+ ctx->done = 0;
+ ctx->field_order = V4L2_FIELD_ANY;
+
+ pts_track_init(&ctx->track, priv);
+
+ atomic_init(&ctx->refcount, 1);
+
+ return 0;
+}
+
+static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
+{
+ DeintV4L2M2MContext *priv = avctx->priv;
+ DeintV4L2M2MContextShared *ctx = priv->shared;
+
+ ctx->done = 1;
+ ctx->logctx = NULL; // Log to NULL works, log to missing crashes
+ pts_track_uninit(&ctx->track);
+ deint_v4l2m2m_destroy_context(ctx);
+}
+
+static const AVOption deinterlace_v4l2m2m_options[] = {
+ { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
+
+static const AVFilterPad deint_v4l2m2m_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ },
+};
+
+static const AVFilterPad deint_v4l2m2m_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .config_props = deint_v4l2m2m_config_props,
+ },
+};
+
+AVFilter ff_vf_deinterlace_v4l2m2m = {
+ .name = "deinterlace_v4l2m2m",
+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
+ .priv_size = sizeof(DeintV4L2M2MContext),
+ .init = &deint_v4l2m2m_init,
+ .uninit = &deint_v4l2m2m_uninit,
+ FILTER_INPUTS(deint_v4l2m2m_inputs),
+ FILTER_OUTPUTS(deint_v4l2m2m_outputs),
+ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME),
+ .priv_class = &deinterlace_v4l2m2m_class,
+ .activate = deint_v4l2m2m_activate,
+};
From 0fb00e51d1ca40eed22bfc66b7f309fdc56229bc Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 2 Dec 2021 17:49:55 +0000
Subject: [PATCH 031/161] Put no_pts_rescale in context which makes more sense
than an arg
---
libavcodec/v4l2_buffers.c | 28 ++++++++++++++--------------
libavcodec/v4l2_buffers.h | 5 ++---
libavcodec/v4l2_context.c | 8 ++++----
libavcodec/v4l2_context.h | 13 +++++++++----
libavcodec/v4l2_m2m_dec.c | 9 +++++----
5 files changed, 34 insertions(+), 29 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 126d2a17f4..22da6bd722 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -39,7 +39,7 @@
#define USEC_PER_SEC 1000000
static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
{
return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
container_of(buf->context, V4L2m2mContext, output) :
@@ -51,34 +51,34 @@ static inline AVCodecContext *logger(V4L2Buffer *buf)
return buf_to_m2mctx(buf)->avctx;
}
-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
{
- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+ const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
const AVRational tb = s->avctx->pkt_timebase.num ?
s->avctx->pkt_timebase :
s->avctx->time_base;
return tb.num && tb.den ? tb : v4l2_timebase;
}
-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale)
+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
{
/* convert pts to v4l2 timebase */
const int64_t v4l2_pts =
- no_rescale ? pts :
+ out->context->no_pts_rescale ? pts :
pts == AV_NOPTS_VALUE ? 0 :
av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
}
-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale)
+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
{
/* convert pts back to encoder timebase */
const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
avbuf->buf.timestamp.tv_usec;
return
- no_rescale ? v4l2_pts :
+ avbuf->context->no_pts_rescale ? v4l2_pts :
v4l2_pts == 0 ? AV_NOPTS_VALUE :
av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
}
@@ -686,13 +686,13 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
v4l2_set_color_range(out, frame->color_range);
// PTS & interlace are buffer vars
- v4l2_set_pts(out, frame->pts, 0);
+ v4l2_set_pts(out, frame->pts);
v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
return v4l2_buffer_swframe_to_buf(frame, out);
}
-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts)
+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
{
int ret;
V4L2Context * const ctx = avbuf->context;
@@ -710,7 +710,7 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc
frame->colorspace = v4l2_get_color_space(avbuf);
frame->color_range = v4l2_get_color_range(avbuf);
frame->color_trc = v4l2_get_color_trc(avbuf);
- frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
+ frame->pts = v4l2_get_pts(avbuf);
frame->pkt_dts = AV_NOPTS_VALUE;
frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
frame->top_field_first = v4l2_buf_is_top_first(avbuf);
@@ -757,13 +757,13 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
pkt->flags |= AV_PKT_FLAG_CORRUPT;
}
- pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0);
+ pkt->dts = pkt->pts = v4l2_get_pts(avbuf);
return 0;
}
int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- const void *extdata, size_t extlen, int no_rescale_pts)
+ const void *extdata, size_t extlen)
{
int ret;
@@ -777,7 +777,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
if (ret && ret != AVERROR(ENOMEM))
return ret;
- v4l2_set_pts(out, pkt->pts, no_rescale_pts);
+ v4l2_set_pts(out, pkt->pts);
if (pkt->flags & AV_PKT_FLAG_KEY)
out->flags = V4L2_BUF_FLAG_KEYFRAME;
@@ -787,7 +787,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
{
- return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0);
}
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index 111526aee3..641e0e147b 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -83,12 +83,11 @@ typedef struct V4L2Buffer {
*
* @param[in] frame The AVFRame to push the information to
* @param[in] buf The V4L2Buffer to get the information from
- * @param[in] no_rescale_pts If non-zero do not rescale PTS
*
* @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
* AVERROR(ENOMEM) if the AVBufferRef can't be created.
*/
-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts);
+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf);
/**
* Extracts the data from a V4L2Buffer to an AVPacket
@@ -113,7 +112,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- const void *extdata, size_t extlen, int no_rescale_pts);
+ const void *extdata, size_t extlen);
/**
* Extracts the data from an AVFrame to a V4L2Buffer
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 440dfaaba5..64540a37b3 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -808,7 +808,7 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
}
int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
- const void * extdata, size_t extlen, int no_rescale_pts)
+ const void * extdata, size_t extlen)
{
V4L2m2mContext *s = ctx_to_m2mctx(ctx);
V4L2Buffer* avbuf;
@@ -827,7 +827,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
if (!avbuf)
return AVERROR(EAGAIN);
- ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen);
if (ret == AVERROR(ENOMEM))
av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
__func__, pkt->size, avbuf->planes[0].length);
@@ -837,7 +837,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
return ff_v4l2_buffer_enqueue(avbuf);
}
-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts)
+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
{
V4L2Buffer *avbuf;
@@ -854,7 +854,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout,
return AVERROR(EAGAIN);
}
- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts);
+ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
}
int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 37b0431400..4cc164886c 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -102,6 +102,13 @@ typedef struct V4L2Context {
*/
int done;
+ /**
+ * PTS rescale not wanted
+ * If the PTS is just a dummy frame count then rescale is
+ * actively harmful
+ */
+ int no_pts_rescale;
+
AVBufferRef *frames_ref;
int q_count;
int dq_count;
@@ -172,12 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
* @param[in] ctx The V4L2Context to dequeue from.
* @param[inout] f The AVFrame to dequeue to.
* @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
- * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as
- * timestamp directly)
*
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
*/
-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts);
+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
/**
* Enqueues a buffer to a V4L2Context from an AVPacket
@@ -189,7 +194,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int
* @param[in] pkt A pointer to an AVPacket.
* @return 0 in case of success, a negative error otherwise.
*/
-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts);
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
/**
* Enqueues a buffer to a V4L2Context from an AVFrame
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index b49f470c0a..36754b314a 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -360,7 +360,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
if (!s->draining) {
// Calling enqueue with an empty pkt starts drain
av_assert0(s->buf_pkt.size == 0);
- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1);
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
if (ret) {
av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
return ret;
@@ -381,8 +381,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
return ret;
ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
- avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size,
- 1);
+ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size);
if (ret == AVERROR(EAGAIN)) {
// Out of input buffers - keep packet
@@ -442,7 +441,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
// when discarding
// This returns AVERROR(EAGAIN) if there isn't a frame ready yet
// but there is room in the input Q
- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1);
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1);
if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
@@ -569,10 +568,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
output->av_codec_id = avctx->codec_id;
output->av_pix_fmt = AV_PIX_FMT_NONE;
output->min_buf_size = max_coded_size(avctx);
+ output->no_pts_rescale = 1;
capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
capture->av_pix_fmt = avctx->pix_fmt;
capture->min_buf_size = 0;
+ capture->no_pts_rescale = 1;
/* the client requests the codec to generate DRM frames:
* - data[0] will therefore point to the returned AVDRMFrameDescriptor
From 5e36908e6f2f06b68e85873cbcd421c0973f6409 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 8 Dec 2021 15:00:37 +0000
Subject: [PATCH 032/161] Use bitbuf min size for all streams
---
libavcodec/v4l2_m2m_dec.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 36754b314a..48a6810d18 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -507,15 +507,12 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
#endif
+// This heuristic is for H264 but is used for everything
static uint32_t max_coded_size(const AVCodecContext * const avctx)
{
uint32_t wxh = avctx->coded_width * avctx->coded_height;
uint32_t size;
- // Currently the only thing we try to set our own limits for is H264
- if (avctx->codec_id != AV_CODEC_ID_H264)
- return 0;
-
size = wxh * 3 / 2;
// H.264 Annex A table A-1 gives minCR which is either 2 or 4
// unfortunately that doesn't yield an actually useful limit
From 5fcbcd31761eea31dc0157793f558eaaadfe2ac3 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 3 Dec 2021 12:54:18 +0000
Subject: [PATCH 033/161] Track pending frames in v4l2 stateful
Track which frames are pending decode in the v4l2 stateful decoder.
This relies on DTS & PTS having some relationship to reality, so
any use of this code must cope with the results being wrong.
Also moves the xlat state vars out of the main context and into their
own structure.
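In outline the tracking works like this (deliberately simplified,
standalone sketch with invented names - the real xlat_pending() in the
diff below only counts entries that look overdue by PTS/DTS ordering):

    #include <stdint.h>
    #include <stdio.h>

    #define TRACK_SIZE 128

    typedef struct TrackEl {
        int     pending;    /* queued for decode, frame not yet seen */
        int64_t pts, dts;
    } TrackEl;

    typedef struct Track {
        unsigned int track_no;
        TrackEl els[TRACK_SIZE];
    } Track;

    static unsigned int track_add(Track * const t, const int64_t pts, const int64_t dts)
    {
        if (++t->track_no == 0)   /* avoid 0 so it never aliases "unset" */
            t->track_no = 1;
        t->els[t->track_no % TRACK_SIZE] =
            (TrackEl){ .pending = 1, .pts = pts, .dts = dts };
        return t->track_no;
    }

    static void track_complete(Track * const t, const unsigned int no)
    {
        t->els[no % TRACK_SIZE].pending = 0;
    }

    static int track_pending(const Track * const t)
    {
        int n = 0;
        for (unsigned int i = 0; i != TRACK_SIZE; ++i)
            n += t->els[i].pending;
        return n;
    }

    int main(void)
    {
        Track t = { 0 };
        const unsigned int a = track_add(&t, 0, 0);
        track_add(&t, 3600, 3600);
        track_complete(&t, a);
        printf("pending=%d\n", track_pending(&t));  /* prints pending=1 */
        return 0;
    }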
---
libavcodec/v4l2_m2m.h | 15 ++++--
libavcodec/v4l2_m2m_dec.c | 100 +++++++++++++++++++++++++++++---------
2 files changed, 89 insertions(+), 26 deletions(-)
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 82feb0afdb..3f86809623 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -44,8 +44,10 @@
#define FF_V4L2_M2M_TRACK_SIZE 128
typedef struct V4L2m2mTrackEl {
int discard; // If we see this buffer its been flushed, so discard
+ int pending;
int pkt_size;
int64_t pts;
+ int64_t dts;
int64_t reordered_opaque;
int64_t pkt_pos;
int64_t pkt_duration;
@@ -62,6 +64,14 @@ typedef struct pts_stats_s
int64_t guess;
} pts_stats_t;
+typedef struct xlat_track_s {
+ unsigned int track_no;
+ int64_t last_pts;
+ int64_t last_pkt_dts;
+ int64_t last_opaque;
+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
+} xlat_track_t;
+
typedef struct V4L2m2mContext {
char devname[PATH_MAX];
int fd;
@@ -96,10 +106,7 @@ typedef struct V4L2m2mContext {
int output_drm;
/* Frame tracking */
- int64_t last_pkt_dts;
- int64_t last_opaque;
- unsigned int track_no;
- V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
+ xlat_track_t xlat;
pts_stats_t pts_stat;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 48a6810d18..d8ebb466cd 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -242,22 +242,24 @@ static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts
// buffer of all the things we want preserved (including the original PTS)
// indexed by the tracking no.
static void
-xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt)
+xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt)
{
int64_t track_pts;
// Avoid 0
- if (++s->track_no == 0)
- s->track_no = 1;
+ if (++x->track_no == 0)
+ x->track_no = 1;
- track_pts = track_to_pts(avctx, s->track_no);
+ track_pts = track_to_pts(avctx, x->track_no);
- av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no);
- s->last_pkt_dts = avpkt->dts;
- s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
+ x->last_pkt_dts = avpkt->dts;
+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
.discard = 0,
+ .pending = 1,
.pkt_size = avpkt->size,
.pts = avpkt->pts,
+ .dts = avpkt->dts,
.reordered_opaque = avctx->reordered_opaque,
.pkt_pos = avpkt->pos,
.pkt_duration = avpkt->duration,
@@ -268,31 +270,36 @@ xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *cons
// Returns -1 if we should discard the frame
static int
-xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame)
+xlat_pts_out(AVCodecContext *const avctx,
+ xlat_track_t * const x,
+ pts_stats_t * const ps,
+ AVFrame *const frame)
{
unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
- const V4L2m2mTrackEl *const t = s->track_els + n;
+ V4L2m2mTrackEl *const t = x->track_els + n;
if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
{
av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
frame->pts = AV_NOPTS_VALUE;
- frame->pkt_dts = s->last_pkt_dts;
- frame->reordered_opaque = s->last_opaque;
+ frame->pkt_dts = x->last_pkt_dts;
+ frame->reordered_opaque = x->last_opaque;
frame->pkt_pos = -1;
frame->pkt_duration = 0;
frame->pkt_size = -1;
}
else if (!t->discard)
{
- frame->pts = t->pts;
- frame->pkt_dts = s->last_pkt_dts;
+ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
+ frame->pkt_dts = x->last_pkt_dts;
frame->reordered_opaque = t->reordered_opaque;
frame->pkt_pos = t->pkt_pos;
frame->pkt_duration = t->pkt_duration;
frame->pkt_size = t->pkt_size;
- s->last_opaque = s->track_els[n].reordered_opaque;
- s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS
+ x->last_opaque = x->track_els[n].reordered_opaque;
+ if (frame->pts != AV_NOPTS_VALUE)
+ x->last_pts = frame->pts;
+ t->pending = 0;
}
else
{
@@ -300,14 +307,62 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons
return -1;
}
- pts_stats_add(&s->pts_stat, frame->pts);
+ pts_stats_add(ps, frame->pts);
- frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat);
+ frame->best_effort_timestamp = pts_stats_guess(ps);
frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner?
av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
return 0;
}
+static void
+xlat_flush(xlat_track_t * const x)
+{
+ unsigned int i;
+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
+ x->track_els[i].pending = 0;
+ x->track_els[i].discard = 1;
+ }
+ x->last_pts = AV_NOPTS_VALUE;
+}
+
+static void
+xlat_init(xlat_track_t * const x)
+{
+ memset(x, 0, sizeof(*x));
+ x->last_pts = AV_NOPTS_VALUE;
+}
+
+static int
+xlat_pending(const xlat_track_t * const x)
+{
+ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
+ unsigned int i;
+ int r = 0;
+ int64_t now = AV_NOPTS_VALUE;
+
+ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
+ const V4L2m2mTrackEl * const t = x->track_els + n;
+
+ if (!t->pending)
+ continue;
+
+ if (now == AV_NOPTS_VALUE)
+ now = t->dts;
+
+ if (t->pts == AV_NOPTS_VALUE ||
+ ((now == AV_NOPTS_VALUE || t->pts <= now) &&
+ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
+ ++r;
+ }
+
+ // If we never get any ideas about PTS vs DTS allow a lot more buffer
+ if (now == AV_NOPTS_VALUE)
+ r -= 16;
+
+ return r;
+}
+
static inline int stream_started(const V4L2m2mContext * const s) {
return s->capture.streamon && s->output.streamon;
}
@@ -374,7 +429,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
return ret;
}
- xlat_pts_in(avctx, s, &s->buf_pkt);
+ xlat_pts_in(avctx, &s->xlat, &s->buf_pkt);
}
if ((ret = check_output_streamon(avctx, s)) != 0)
@@ -417,6 +472,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
int dst_rv = 1; // Non-zero (done), non-negative (error) number
do {
+ av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat));
src_rv = try_enqueue_src(avctx, s);
// If we got a frame last time and we have nothing to enqueue then
@@ -451,7 +507,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
s->draining, s->capture.done, dst_rv);
// Go again if we got a frame that we need to discard
- } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
+ } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame));
}
// Continue trying to enqueue packets if either
@@ -550,6 +606,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
if (ret < 0)
return ret;
+ xlat_init(&s->xlat);
pts_stats_init(&s->pts_stat, avctx, "decoder");
capture = &s->capture;
@@ -632,7 +689,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
V4L2m2mContext * const s = priv->context;
V4L2Context * const output = &s->output;
V4L2Context * const capture = &s->capture;
- int ret, i;
+ int ret;
av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
@@ -646,8 +703,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
// V4L2 makes no guarantees about whether decoded frames are flushed or not
// so mark all frames we are tracking to be discarded if they appear
- for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i)
- s->track_els[i].discard = 1;
+ xlat_flush(&s->xlat);
// resend extradata
s->extdata_sent = 0;
From 6fae7b3f42c8e9e431a59323c0faa6c88fe951d9 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 15 Dec 2021 17:58:21 +0000
Subject: [PATCH 034/161] Use pending tracking to reduce v4l2 latency
If there are more than 5 pending decodes outstanding then add a small
timeout to the capture poll and prefer dequeueing output over queueing
more input, to reduce the rate at which frames are added to the queue.
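A toy model of the back-pressure (standalone sketch, invented names,
nothing from the ffmpeg API; it models only the queueing policy, not
the poll timeouts):

    #include <stdio.h>

    #define PENDING_LIMIT 5
    #define DECODE_DELAY  8   /* a frame appears 8 "ticks" after its packet */

    static int queued_at[256];
    static int n_queued, n_done, now;

    static void queue_packet(void) { queued_at[n_queued++] = now; }

    static int frame_ready(void)
    {
        return n_done < n_queued && now - queued_at[n_done] >= DECODE_DELAY;
    }

    int main(void)
    {
        for (now = 0; now < 40; ++now) {
            const int pending   = n_queued - n_done;
            const int prefer_dq = pending > PENDING_LIMIT;

            /* Feed input only while the pipe is not already well stocked */
            if (!prefer_dq)
                queue_packet();

            if (frame_ready()) {
                ++n_done;
                printf("t=%2d frame out, pending now %d\n", now, n_queued - n_done);
            }
        }
        return 0;
    }

Pending never climbs much past the limit in the output, which is the
intent of the change.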
---
libavcodec/v4l2_m2m_dec.c | 58 ++++++++++++++++++++++++---------------
1 file changed, 36 insertions(+), 22 deletions(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index d8ebb466cd..7e7e4729d0 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -370,16 +370,19 @@ static inline int stream_started(const V4L2m2mContext * const s) {
#define NQ_OK 0
#define NQ_Q_FULL 1
#define NQ_SRC_EMPTY 2
-#define NQ_DRAINING 3
-#define NQ_DEAD 4
+#define NQ_NONE 3
+#define NQ_DRAINING 4
+#define NQ_DEAD 5
#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
// AVERROR_EOF Flushing an already flushed stream
// -ve Error (all errors except EOF are unexpected)
// NQ_OK (0) OK
// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now)
// NQ_SRC_EMPTY Src empty (do not retry)
+// NQ_NONE Enqueue not attempted
// NQ_DRAINING At EOS, dQ dest until EOS there too
// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
@@ -468,23 +471,28 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- int src_rv;
+ int src_rv = NQ_NONE;
int dst_rv = 1; // Non-zero (done), non-negative (error) number
+ unsigned int i = 0;
do {
- av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat));
- src_rv = try_enqueue_src(avctx, s);
-
- // If we got a frame last time and we have nothing to enqueue then
- // return now. rv will be AVERROR(EAGAIN) indicating that we want more input
- // This should mean that once decode starts we enter a stable state where
- // we alternately ask for input and produce output
- if (s->req_pkt && src_rv == NQ_SRC_EMPTY)
- break;
-
- if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) {
- av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail");
- src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue
+ const int pending = xlat_pending(&s->xlat);
+ const int prefer_dq = (pending > 5);
+
+ // Enqueue another pkt for decode if
+ // (a) We don't have a lot of stuff in the buffer already OR
+ // (b) ... we (think we) do but we've failed to get a frame already OR
+ // (c) We've dequeued a lot of frames without asking for input
+ if (!prefer_dq || i != 0 || s->req_pkt > 2) {
+ src_rv = try_enqueue_src(avctx, s);
+
+ // If we got a frame last time or we've already tried to get a frame and
+ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
+ // indicating that we want more input.
+ // This should mean that once decode starts we enter a stable state where
+ // we alternately ask for input and produce output
+ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
+ break;
}
// Try to get a new frame if
@@ -495,9 +503,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
// Dequeue frame will unref any previous contents of frame
// if it returns success so we don't need an explicit unref
// when discarding
- // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
- // but there is room in the input Q
- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1);
+ // This returns AVERROR(EAGAIN) on timeout or if
+ // there is room in the input Q and timeout == -1
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1);
if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
@@ -510,10 +518,16 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
} while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame));
}
+ ++i;
+ if (i >= 256) {
+ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
+ src_rv = AVERROR(EIO);
+ }
+
// Continue trying to enqueue packets if either
// (a) we succeeded last time OR
- // (b) enqueue failed due to input Q full AND there is now room
- } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) );
+    // (b) we didn't get a frame and we can retry the input
+ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
// Ensure that the frame contains nothing if we aren't returning a frame
// (might happen when discarding)
@@ -521,7 +535,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
av_frame_unref(frame);
// If we got a frame this time ask for a pkt next time
- s->req_pkt = (dst_rv == 0);
+ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
#if 0
if (dst_rv == 0)
From 175abd2eb961a3718a660e1f9eda08b37b01b309 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 15 Dec 2021 12:23:54 +0000
Subject: [PATCH 035/161] Allow logger() to take const ctx
---
libavcodec/v4l2_buffers.c | 2 +-
libavcodec/v4l2_context.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 22da6bd722..39c0094aec 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -46,7 +46,7 @@ static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
container_of(buf->context, V4L2m2mContext, capture);
}
-static inline AVCodecContext *logger(V4L2Buffer *buf)
+static inline AVCodecContext *logger(const V4L2Buffer * const buf)
{
return buf_to_m2mctx(buf)->avctx;
}
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 64540a37b3..d3df48aed4 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -43,14 +43,14 @@ struct v4l2_format_update {
int update_avfmt;
};
-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
{
return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
container_of(ctx, V4L2m2mContext, output) :
container_of(ctx, V4L2m2mContext, capture);
}
-static inline AVCodecContext *logger(V4L2Context *ctx)
+static inline AVCodecContext *logger(const V4L2Context *ctx)
{
return ctx_to_m2mctx(ctx)->avctx;
}
From 21d4f3f644c45084c621cb5aa577169bf5c15017 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 15 Dec 2021 13:00:27 +0000
Subject: [PATCH 036/161] Track number of bufs qed with an atomic
Safer and faster than counting status
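The pattern used for q_count, in isolation (standalone C11 sketch, not
the patch code itself):

    #include <stdatomic.h>
    #include <stdio.h>

    typedef struct Ctx { atomic_int q_count; } Ctx;

    static void on_qbuf(Ctx * const c)
    {
        /* fetch_add returns the old value, so +1 is the count including us */
        const int qc = atomic_fetch_add(&c->q_count, 1) + 1;
        printf("queued, count=%d\n", qc);
    }

    static void on_dqbuf(Ctx * const c)
    {
        atomic_fetch_sub(&c->q_count, 1);
    }

    int main(void)
    {
        Ctx c;
        atomic_init(&c.q_count, 0);
        on_qbuf(&c);
        on_qbuf(&c);
        on_dqbuf(&c);
        printf("in driver now: %d\n", atomic_load(&c.q_count));  /* 1 */
        return 0;
    }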
---
libavcodec/v4l2_buffers.c | 6 +++---
libavcodec/v4l2_context.c | 3 ++-
libavcodec/v4l2_context.h | 3 +--
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 39c0094aec..2cf7be6632 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -922,6 +922,7 @@ fail:
int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
{
int ret;
+ int qc;
avbuf->buf.flags = avbuf->flags;
@@ -941,11 +942,10 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
return AVERROR(err);
}
- ++avbuf->context->q_count;
+ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
avbuf->context->name, avbuf->buf.index,
- avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
- avbuf->context->q_count);
+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
avbuf->status = V4L2BUF_IN_DRIVER;
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index d3df48aed4..268a057e53 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -599,7 +599,7 @@ static int v4l2_release_buffers(V4L2Context* ctx)
" 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
}
}
- ctx->q_count = 0;
+ atomic_store(&ctx->q_count, 0);
return ret;
}
@@ -1019,6 +1019,7 @@ int ff_v4l2_context_init(V4L2Context* ctx)
}
ff_mutex_init(&ctx->lock, NULL);
+ atomic_init(&ctx->q_count, 0);
if (s->output_drm) {
AVHWFramesContext *hwframes;
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 4cc164886c..a4176448d5 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -110,8 +110,7 @@ typedef struct V4L2Context {
int no_pts_rescale;
AVBufferRef *frames_ref;
- int q_count;
- int dq_count;
+ atomic_int q_count;
struct ff_weak_link_master *wl_master;
AVMutex lock;
From b2fa4ab3d63924597b8c3659123b145a786a2c13 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 9 Dec 2021 12:01:25 +0000
Subject: [PATCH 037/161] Clear buf_pkt on flush
---
libavcodec/v4l2_m2m_dec.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 7e7e4729d0..09ec496351 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -715,6 +715,9 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
if (ret < 0)
av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
+ // Clear any buffered input packet
+ av_packet_unref(&s->buf_pkt);
+
// V4L2 makes no guarantees about whether decoded frames are flushed or not
// so mark all frames we are tracking to be discarded if they appear
xlat_flush(&s->xlat);
From 16cf94cb5e1d11f4c3a6b8a43557383ce78112e0 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 15 Dec 2021 12:52:56 +0000
Subject: [PATCH 038/161] Rework v4l2 buffer dequeue
---
libavcodec/v4l2_context.c | 543 ++++++++++++++++++--------------------
libavcodec/v4l2_context.h | 2 +
libavcodec/v4l2_m2m.c | 1 -
libavcodec/v4l2_m2m.h | 16 +-
libavcodec/v4l2_m2m_dec.c | 138 ++++------
5 files changed, 327 insertions(+), 373 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 268a057e53..d765181645 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -73,19 +73,27 @@ static AVRational v4l2_get_sar(V4L2Context *ctx)
return sar;
}
-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
+static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
{
- struct v4l2_format *fmt1 = &ctx->format;
- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
- :
- fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
- fmt1->fmt.pix.height != fmt2->fmt.pix.height;
+ return ctx->bufrefs != NULL;
+}
+
+// Width/Height changed or we don't have an alloc in the first place?
+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+{
+ const struct v4l2_format *fmt1 = &ctx->format;
+ int ret = !ctx_buffers_alloced(ctx) ||
+ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+ :
+ fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+ fmt1->fmt.pix.height != fmt2->fmt.pix.height);
if (ret)
- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
+ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
ctx->name,
+ ctx_buffers_alloced(ctx),
ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
@@ -167,10 +175,8 @@ static int do_source_change(V4L2m2mContext * const s)
int ret;
int reinit;
- int full_reinit;
struct v4l2_format cap_fmt = s->capture.format;
- s->resize_pending = 0;
s->capture.done = 0;
ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
@@ -179,15 +185,21 @@ static int do_source_change(V4L2m2mContext * const s)
return 0;
}
- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
-
get_default_selection(&s->capture, &s->capture.selection);
- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
+ reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
+ s->capture.format = cap_fmt;
if (reinit) {
s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
}
+
+ // If we don't support selection (or it is bust) and we obviously have HD then kludge
+ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
+ (s->capture.height == 1088 && s->capture.width == 1920)) {
+ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
+ }
+
s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
@@ -195,11 +207,11 @@ static int do_source_change(V4L2m2mContext * const s)
s->capture.selection.width, s->capture.selection.height,
s->capture.selection.left, s->capture.selection.top);
- s->reinit = 1;
-
if (reinit) {
if (avctx)
- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
+ ret = ff_set_dimensions(s->avctx,
+ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
+ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
if (ret < 0)
av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
@@ -208,11 +220,22 @@ static int do_source_change(V4L2m2mContext * const s)
av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
return AVERROR(EINVAL);
}
+
+ // Update pixel format - should only actually do something on initial change
+ s->capture.av_pix_fmt =
+ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
+ if (s->output_drm) {
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ avctx->sw_pix_fmt = s->capture.av_pix_fmt;
+ }
+ else
+ avctx->pix_fmt = s->capture.av_pix_fmt;
+
goto reinit_run;
}
/* Buffers are OK so just stream off to ack */
- av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__);
+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
if (ret)
@@ -225,54 +248,6 @@ reinit_run:
return 1;
}
-static int ctx_done(V4L2Context * const ctx)
-{
- int rv = 0;
- V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-
- ctx->done = 1;
-
- if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type))
- rv = do_source_change(s);
-
- return rv;
-}
-
-/**
- * handle resolution change event and end of stream event
- * returns 1 if reinit was successful, negative if it failed
- * returns 0 if reinit was not executed
- */
-static int v4l2_handle_event(V4L2Context *ctx)
-{
- V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
- struct v4l2_event evt = { 0 };
- int ret;
-
- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
- if (ret < 0) {
- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
- return 0;
- }
-
- av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type);
-
- if (evt.type == V4L2_EVENT_EOS) {
-// ctx->done = 1;
- av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name);
- return 0;
- }
-
- if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
- return 0;
-
- s->resize_pending = 1;
- if (!ctx->done)
- return 0;
-
- return do_source_change(s);
-}
-
static int v4l2_stop_decode(V4L2Context *ctx)
{
struct v4l2_decoder_cmd cmd = {
@@ -313,243 +288,252 @@ static int v4l2_stop_encode(V4L2Context *ctx)
return 0;
}
-static int count_in_driver(const V4L2Context * const ctx)
+// DQ a buffer
+// Amalgamates all the various ways there are of signalling EOS/Event to
+// generate a consistant EPIPE.
+//
+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
+//
+// Returns:
+// 0 Success
+// AVERROR(EPIPE) Nothing more to read
+// * AVERROR(..)
+
+static int
+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
{
- int i;
- int n = 0;
+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
+ AVCodecContext * const avctx = m->avctx;
+ V4L2Buffer * avbuf;
+ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
- if (!ctx->bufrefs)
- return -1;
-
- for (i = 0; i < ctx->num_buffers; ++i) {
- V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
- if (avbuf->status == V4L2BUF_IN_DRIVER)
- ++n;
- }
- return n;
-}
+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
-{
- V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
- const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type);
- struct v4l2_plane planes[VIDEO_MAX_PLANES];
- struct v4l2_buffer buf = { 0 };
- V4L2Buffer *avbuf;
- struct pollfd pfd = {
- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
- .fd = ctx_to_m2mctx(ctx)->fd,
+ struct v4l2_buffer buf = {
+ .type = ctx->type,
+ .memory = V4L2_MEMORY_MMAP,
};
- int i, ret;
- int no_rx_means_done = 0;
-
- if (is_capture && ctx->bufrefs) {
- for (i = 0; i < ctx->num_buffers; i++) {
- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
- if (avbuf->status == V4L2BUF_IN_DRIVER)
- break;
- }
- if (i == ctx->num_buffers)
- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to "
- "userspace. Increase num_capture_buffers "
- "to prevent device deadlock or dropped "
- "packets/frames.\n", i);
+
+ *ppavbuf = NULL;
+
+ if (ctx->flag_last)
+ return AVERROR(EPIPE);
+
+ if (is_mp) {
+ buf.length = VIDEO_MAX_PLANES;
+ buf.m.planes = planes;
}
-#if 0
- // I think this is true but pointless
- // we will get some other form of EOF signal
-
- /* if we are draining and there are no more capture buffers queued in the driver we are done */
- if (is_capture && ctx_to_m2mctx(ctx)->draining) {
- for (i = 0; i < ctx->num_buffers; i++) {
- /* capture buffer initialization happens during decode hence
- * detection happens at runtime
- */
- if (!ctx->bufrefs)
- break;
-
- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
- if (avbuf->status == V4L2BUF_IN_DRIVER)
- goto start;
+ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
+ const int err = errno;
+ av_assert0(AVERROR(err) < 0);
+ if (err != EINTR) {
+ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+ ctx->name, av_err2str(AVERROR(err)));
+
+ if (err == EPIPE)
+ ctx->flag_last = 1;
+
+ return AVERROR(err);
}
- ctx->done = 1;
- return NULL;
}
-#endif
-
-start:
- if (is_capture) {
- /* no need to listen to requests for more input while draining */
- if (ctx_to_m2mctx(ctx)->draining || timeout > 0)
- pfd.events = POLLIN | POLLRDNORM | POLLPRI;
- } else {
- pfd.events = POLLOUT | POLLWRNORM;
+ atomic_fetch_sub(&ctx->q_count, 1);
+
+ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
+ avbuf->status = V4L2BUF_AVAILABLE;
+ avbuf->buf = buf;
+ if (is_mp) {
+ memcpy(avbuf->planes, planes, sizeof(planes));
+ avbuf->buf.m.planes = avbuf->planes;
}
- no_rx_means_done = s->resize_pending && is_capture;
- for (;;) {
- // If we have a resize pending then all buffers should be Qed
- // With a resize pending we should be in drain but evidence suggests
- // that not all decoders do this so poll to clear
- int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout;
- const int e = pfd.events;
-
- ret = poll(&pfd, 1, t2);
+ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
+ // Zero length cap buffer return == EOS
+ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
- if (ret > 0)
- break;
+ // Must reQ so we don't leak
+ // May not matter if the next thing we do is release all the
+ // buffers but better to be tidy.
+ ff_v4l2_buffer_enqueue(avbuf);
- if (ret < 0) {
- int err = errno;
- if (err == EINTR)
- continue;
- av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n",
- err, strerror(err),
- e, count_in_driver(ctx));
- return NULL;
+ ctx->flag_last = 1;
+ return AVERROR(EPIPE);
}
- // ret == 0 (timeout)
- if (no_rx_means_done) {
- av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n");
- ret = ctx_done(ctx);
- if (ret > 0)
- goto start;
- }
- if (timeout == -1)
- av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));;
- return NULL;
+#ifdef V4L2_BUF_FLAG_LAST
+ // If flag_last set then this contains data but is the last frame
+ // so remember that but return OK
+ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
+ ctx->flag_last = 1;
+#endif
}
- /* 0. handle errors */
- if (pfd.revents & POLLERR) {
- /* if we are trying to get free buffers but none have been queued yet
- no need to raise a warning */
- if (timeout == 0) {
- for (i = 0; i < ctx->num_buffers; i++) {
- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
- if (avbuf->status != V4L2BUF_AVAILABLE)
- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
- }
- }
- else
- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+ *ppavbuf = avbuf;
+ return 0;
+}
- return NULL;
- }
+/**
+ * handle resolution change event and end of stream event
+ * Expects to be called after the stream has stopped
+ *
+ * returns 1 if reinit was successful, negative if it failed
+ * returns 0 if reinit was not executed
+ */
+static int
+get_event(V4L2m2mContext * const m)
+{
+ AVCodecContext * const avctx = m->avctx;
+ struct v4l2_event evt = { 0 };
- /* 1. handle resolution changes */
- if (pfd.revents & POLLPRI) {
- ret = v4l2_handle_event(ctx);
- if (ret < 0) {
- /* if re-init failed, abort */
- ctx->done = 1;
- return NULL;
+ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
+ const int rv = AVERROR(errno);
+ if (rv == AVERROR(EINTR))
+ continue;
+ if (rv == AVERROR(EAGAIN)) {
+ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
+ return AVERROR_EOF;
}
- if (ret > 0)
- goto start;
+ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
+ return rv;
+ }
+
+ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
+
+ if (evt.type == V4L2_EVENT_EOS) {
+ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
+ return AVERROR_EOF;
}
- /* 2. dequeue the buffer */
- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
+ if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
+ return do_source_change(m);
- if (is_capture) {
- /* there is a capture buffer ready */
- if (pfd.revents & (POLLIN | POLLRDNORM))
- goto dequeue;
+ return 0;
+}
- // CAPTURE Q drained
- if (no_rx_means_done) {
- if (ctx_done(ctx) > 0)
- goto start;
- return NULL;
- }
- /* the driver is ready to accept more input; instead of waiting for the capture
- * buffer to complete we return NULL so input can proceed (we are single threaded)
- */
- if (pfd.revents & (POLLOUT | POLLWRNORM))
- return NULL;
+// Get a buffer
+// If output then just gets the buffer in the expected way
+// If capture then runs the capture state m/c to deal with res change etc.
+// If return value == 0 then *ppavbuf != NULL
+
+static int
+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
+{
+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
+ AVCodecContext * const avctx = m->avctx;
+ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
+
+ const unsigned int poll_cap = (POLLIN | POLLRDNORM);
+ const unsigned int poll_out = (POLLOUT | POLLWRNORM);
+ const unsigned int poll_event = POLLPRI;
+
+ *ppavbuf = NULL;
+
+ for (;;) {
+ struct pollfd pfd = {
+ .fd = m->fd,
+ // If capture && stream not started then assume we are waiting for the initial event
+ .events = !is_cap ? poll_out :
+ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
+ poll_event,
+ };
+ int ret;
+
+ if (ctx->done) {
+ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
+ return AVERROR_EOF;
}
-dequeue:
- memset(&buf, 0, sizeof(buf));
- buf.memory = V4L2_MEMORY_MMAP;
- buf.type = ctx->type;
- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
- memset(planes, 0, sizeof(planes));
- buf.length = VIDEO_MAX_PLANES;
- buf.m.planes = planes;
+ // If capture && timeout == -1 then also wait for rx buffer free
+ if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
+ pfd.events |= poll_out;
+
+ // If nothing Qed all we will get is POLLERR - avoid that
+ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
+ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
+ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
+ return AVERROR(EAGAIN);
}
- while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) {
- const int err = errno;
- if (err == EINTR)
+ // Timeout kludged s.t. "forever" eventually gives up & produces logging
+ // If waiting for an event when we have seen a last_frame then we expect
+ // it to be ready already so force a short timeout
+ ret = poll(&pfd, 1,
+ ff_v4l2_ctx_eos(ctx) ? 10 :
+ timeout == -1 ? 3000 : timeout);
+ if (ret < 0) {
+ ret = AVERROR(errno); // Remember errno before logging etc.
+ av_assert0(ret < 0);
+ }
+
+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
+ ctx->name, ret, timeout, pfd.events, pfd.revents);
+
+ if (ret < 0) {
+ if (ret == AVERROR(EINTR))
continue;
- if (err != EAGAIN) {
- // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST
- if (err != EPIPE || !is_capture)
- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
- ctx->name, av_err2str(AVERROR(err)));
- if (ctx_done(ctx) > 0)
- goto start;
+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
+ return ret;
+ }
+
+ if (ret == 0) {
+ if (timeout == -1)
+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
+ if (ff_v4l2_ctx_eos(ctx)) {
+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
+ ret = get_event(m);
+ if (ret < 0) {
+ ctx->done = 1;
+ return ret;
+ }
}
- return NULL;
+ return AVERROR(EAGAIN);
}
- --ctx->q_count;
- av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n",
- ctx->name, buf.index,
- buf.timestamp.tv_sec, buf.timestamp.tv_usec,
- ctx->q_count, ++ctx->dq_count, buf.field);
-
- avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
- avbuf->status = V4L2BUF_AVAILABLE;
- avbuf->buf = buf;
- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
- memcpy(avbuf->planes, planes, sizeof(planes));
- avbuf->buf.m.planes = avbuf->planes;
+
+ if ((pfd.revents & POLLERR) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
+ return AVERROR_UNKNOWN;
}
- if (ctx_to_m2mctx(ctx)->draining && is_capture) {
- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
- buf.m.planes[0].bytesused : buf.bytesused;
- if (bytesused == 0) {
- av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n");
+ if ((pfd.revents & poll_event) != 0) {
+ ret = get_event(m);
+ if (ret < 0) {
+ ctx->done = 1;
+ return ret;
+ }
+ continue;
+ }
- // Must reQ so we don't leak
- // May not matter if the next thing we do is release all the
- // buffers but better to be tidy.
- ff_v4l2_buffer_enqueue(avbuf);
+ if ((pfd.revents & poll_cap) != 0) {
+ ret = dq_buf(ctx, ppavbuf);
+ if (ret == AVERROR(EPIPE))
+ continue;
+ return ret;
+ }
- if (ctx_done(ctx) > 0)
- goto start;
- return NULL;
- }
-#ifdef V4L2_BUF_FLAG_LAST
- if (buf.flags & V4L2_BUF_FLAG_LAST) {
- av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n");
- avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer
- ctx_done(ctx);
- }
-#endif
+ if ((pfd.revents & poll_out) != 0) {
+ if (is_cap)
+ return AVERROR(EAGAIN);
+ return dq_buf(ctx, ppavbuf);
}
- return avbuf;
+ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
+ return AVERROR_UNKNOWN;
}
-
- return NULL;
}
static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
{
- int timeout = 0; /* return when no more buffers to dequeue */
int i;
/* get back as many output buffers as possible */
if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
- do {
- } while (v4l2_dequeue_v4l2buf(ctx, timeout));
+ V4L2Buffer * avbuf;
+ do {
+ get_qbuf(ctx, &avbuf, 0);
+ } while (avbuf);
}
for (i = 0; i < ctx->num_buffers; i++) {
@@ -722,7 +706,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx)
if (buf->status == V4L2BUF_IN_DRIVER)
buf->status = V4L2BUF_AVAILABLE;
}
- ctx->q_count = 0;
+ atomic_store(&ctx->q_count, 0);
}
static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
@@ -755,6 +739,10 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
int ret;
AVCodecContext * const avctx = logger(ctx);
+ // Avoid doing anything if there is nothing we can do
+ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
+ return 0;
+
ff_mutex_lock(&ctx->lock);
if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
@@ -777,6 +765,9 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
}
+ // Both stream off & on effectively clear flag_last
+ ctx->flag_last = 0;
+
ff_mutex_unlock(&ctx->lock);
return ret;
@@ -840,19 +831,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
{
V4L2Buffer *avbuf;
+ int rv;
- /*
- * timeout=-1 blocks until:
- * 1. decoded frame available
- * 2. an input buffer is ready to be dequeued
- */
- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
- if (!avbuf) {
- if (ctx->done)
- return AVERROR_EOF;
-
- return AVERROR(EAGAIN);
- }
+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
+ return rv;
return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
}
@@ -860,19 +842,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
{
V4L2Buffer *avbuf;
+ int rv;
- /*
- * blocks until:
- * 1. encoded packet available
- * 2. an input buffer ready to be dequeued
- */
- avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
- if (!avbuf) {
- if (ctx->done)
- return AVERROR_EOF;
-
- return AVERROR(EAGAIN);
- }
+ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
+ return rv;
return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
}
@@ -956,6 +929,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
int ret;
int i;
+ av_assert0(ctx->bufrefs == NULL);
+
memset(&req, 0, sizeof(req));
req.count = req_buffers;
req.memory = V4L2_MEMORY_MMAP;
@@ -1033,8 +1008,8 @@ int ff_v4l2_context_init(V4L2Context* ctx)
hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
hwframes->format = AV_PIX_FMT_DRM_PRIME;
hwframes->sw_format = ctx->av_pix_fmt;
- hwframes->width = ctx->width;
- hwframes->height = ctx->height;
+ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
+ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
ret = av_hwframe_ctx_init(ctx->frames_ref);
if (ret < 0)
goto fail_unref_hwframes;
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index a4176448d5..565858a1ed 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -102,6 +102,8 @@ typedef struct V4L2Context {
*/
int done;
+ int flag_last;
+
/**
* PTS rescale not wanted
* If the PTS is just a dummy frame count then rescale is
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index 516e6d9858..e26bd74c3e 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -235,7 +235,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
/* 5. complete reinit */
s->draining = 0;
- s->reinit = 0;
return 0;
}
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 3f86809623..d71f6b721c 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -84,8 +84,6 @@ typedef struct V4L2m2mContext {
AVCodecContext *avctx;
sem_t refsync;
atomic_uint refcount;
- int reinit;
- int resize_pending;
/* null frame/packet received */
int draining;
@@ -180,15 +178,25 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
-static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt)
+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
{
return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
}
-static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt)
+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
{
return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
}
+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
+}
+
+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
+{
+ return ctx->flag_last;
+}
+
#endif /* AVCODEC_V4L2_M2M_H */
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 09ec496351..e4b6569ba5 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -113,9 +113,6 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co
if (ret < 0)
av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
- if (!s->capture.streamon || ret < 0)
- return ret;
-
ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
if (ret < 0)
av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
@@ -127,69 +124,12 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co
static int v4l2_try_start(AVCodecContext *avctx)
{
- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- V4L2Context *const capture = &s->capture;
- struct v4l2_selection selection = { 0 };
+ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
int ret;
/* 1. start the output process */
if ((ret = check_output_streamon(avctx, s)) != 0)
return ret;
-
- if (capture->streamon)
- return 0;
-
- /* 2. get the capture format */
- capture->format.type = capture->type;
- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
- if (ret) {
- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
- return ret;
- }
-
- /* 2.1 update the AVCodecContext */
- capture->av_pix_fmt =
- ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
- if (s->output_drm) {
- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
- avctx->sw_pix_fmt = capture->av_pix_fmt;
- }
- else
- avctx->pix_fmt = capture->av_pix_fmt;
-
- /* 3. set the crop parameters */
-#if 1
- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
- selection.target = V4L2_SEL_TGT_CROP_DEFAULT;
- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
- av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
-#else
- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
- selection.r.height = avctx->coded_height;
- selection.r.width = avctx->coded_width;
- av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height);
- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
- av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
- if (1) {
- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
- if (ret) {
- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
- } else {
- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
- /* update the size of the resulting frame */
- capture->height = selection.r.height;
- capture->width = selection.r.width;
- }
- }
-#endif
-
- /* 5. start the capture process */
- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
- if (ret) {
- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
- return ret;
- }
-
return 0;
}
@@ -364,7 +304,7 @@ xlat_pending(const xlat_track_t * const x)
}
static inline int stream_started(const V4L2m2mContext * const s) {
- return s->capture.streamon && s->output.streamon;
+ return s->output.streamon;
}
#define NQ_OK 0
@@ -377,6 +317,9 @@ static inline int stream_started(const V4L2m2mContext * const s) {
#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
+// do_not_get If true then no new packet will be got but status will
+// be set appropriately
+
// AVERROR_EOF Flushing an already flushed stream
// -ve Error (all errors except EOF are unexpected)
// NQ_OK (0) OK
@@ -386,14 +329,14 @@ static inline int stream_started(const V4L2m2mContext * const s) {
// NQ_DRAINING At EOS, dQ dest until EOS there too
// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
-static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s)
+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
{
int ret;
// If we don't already have a coded packet - get a new one
// We will already have a coded pkt if the output Q was full last time we
// tried to Q it
- if (!s->buf_pkt.size) {
+ if (!s->buf_pkt.size && !do_not_get) {
ret = ff_decode_get_packet(avctx, &s->buf_pkt);
if (ret == AVERROR(EAGAIN)) {
@@ -435,6 +378,17 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
xlat_pts_in(avctx, &s->xlat, &s->buf_pkt);
}
+ if (s->draining) {
+ if (s->buf_pkt.size) {
+ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
+ av_packet_unref(&s->buf_pkt);
+ }
+ return NQ_DRAINING;
+ }
+
+ if (!s->buf_pkt.size)
+ return NQ_NONE;
+
if ((ret = check_output_streamon(avctx, s)) != 0)
return ret;
@@ -471,7 +425,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- int src_rv = NQ_NONE;
+ int src_rv;
int dst_rv = 1; // Non-zero (done), non-negative (error) number
unsigned int i = 0;
@@ -483,31 +437,40 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
// (a) We don't have a lot of stuff in the buffer already OR
// (b) ... we (think we) do but we've failed to get a frame already OR
// (c) We've dequeued a lot of frames without asking for input
- if (!prefer_dq || i != 0 || s->req_pkt > 2) {
- src_rv = try_enqueue_src(avctx, s);
-
- // If we got a frame last time or we've already tried to get a frame and
- // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
- // indicating that we want more input.
- // This should mean that once decode starts we enter a stable state where
- // we alternately ask for input and produce output
- if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
- break;
- }
+ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
+
+ // If we got a frame last time or we've already tried to get a frame and
+ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
+ // indicating that we want more input.
+ // This should mean that once decode starts we enter a stable state where
+ // we alternately ask for input and produce output
+ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
+ break;
// Try to get a new frame if
// (a) we haven't already got one AND
// (b) enqueue returned a status indicating that decode should be attempted
if (dst_rv != 0 && TRY_DQ(src_rv)) {
+ // Pick a timeout depending on state
+ const int t =
+ src_rv == NQ_DRAINING ? 300 :
+ prefer_dq ? 5 :
+ src_rv == NQ_Q_FULL ? -1 : 0;
+
do {
// Dequeue frame will unref any previous contents of frame
// if it returns success so we don't need an explicit unref
// when discarding
// This returns AVERROR(EAGAIN) on timeout or if
// there is room in the input Q and timeout == -1
- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1);
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
- if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
+ dst_rv = AVERROR_EOF;
+ s->capture.done = 1;
+ }
+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
s->draining, s->capture.done);
else if (dst_rv && dst_rv != AVERROR(EAGAIN))
@@ -630,8 +593,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
* by the v4l2 driver; this event will trigger a full pipeline reconfig and
* the proper values will be retrieved from the kernel driver.
*/
- output->height = capture->height = avctx->coded_height;
- output->width = capture->width = avctx->coded_width;
+// output->height = capture->height = avctx->coded_height;
+// output->width = capture->width = avctx->coded_width;
+ output->height = capture->height = 0;
+ output->width = capture->width = 0;
output->av_codec_id = avctx->codec_id;
output->av_pix_fmt = AV_PIX_FMT_NONE;
@@ -703,7 +668,6 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
V4L2m2mContext * const s = priv->context;
V4L2Context * const output = &s->output;
V4L2Context * const capture = &s->capture;
- int ret;
av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
@@ -711,13 +675,19 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
// states like EOS processing so don't try to optimize out (having got it
// wrong once)
- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
- if (ret < 0)
- av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
+ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
// Clear any buffered input packet
av_packet_unref(&s->buf_pkt);
+ // Clear a pending EOS
+ if (ff_v4l2_ctx_eos(capture)) {
+ // Arguably we could delay this but this is easy and doesn't require
+ // thought or extra vars
+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+ }
+
// V4L2 makes no guarantees about whether decoded frames are flushed or not
// so mark all frames we are tracking to be discarded if they appear
xlat_flush(&s->xlat);
From a2519f7a512edde7433aced70de4464e21805693 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 9 Dec 2021 18:51:00 +0000
Subject: [PATCH 039/161] Honor result of ff_get_format if possible
---
libavcodec/v4l2_m2m_dec.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index e4b6569ba5..c9655bcc3b 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -615,15 +615,19 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
* check the v4l2_get_drm_frame function.
*/
+ avctx->sw_pix_fmt = avctx->pix_fmt;
gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
- s->output_drm = 0;
if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
s->output_drm = 1;
}
+ else {
+ capture->av_pix_fmt = gf_pix_fmt;
+ s->output_drm = 0;
+ }
s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
if (!s->device_ref) {
From a1cd1cb98e48c631392b385ccac5ab7b09bb5ee9 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 14 Dec 2021 16:11:10 +0000
Subject: [PATCH 040/161] Add an always-reinit quirk
---
libavcodec/v4l2_context.c | 7 +++++--
libavcodec/v4l2_m2m.h | 5 +++++
libavcodec/v4l2_m2m_dec.c | 33 ++++++++++++++++++++++++++++++++-
3 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index d765181645..c11b5e6863 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -188,6 +188,9 @@ static int do_source_change(V4L2m2mContext * const s)
get_default_selection(&s->capture, &s->capture.selection);
reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
+ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
+ reinit = 1;
+
s->capture.format = cap_fmt;
if (reinit) {
s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
@@ -202,10 +205,10 @@ static int do_source_change(V4L2m2mContext * const s)
s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n",
s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
s->capture.selection.width, s->capture.selection.height,
- s->capture.selection.left, s->capture.selection.top);
+ s->capture.selection.left, s->capture.selection.top, reinit);
if (reinit) {
if (avctx)
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index d71f6b721c..f1923bb26d 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -113,6 +113,11 @@ typedef struct V4L2m2mContext {
/* Ext data sent */
int extdata_sent;
+
+#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
+ /* Quirks */
+ unsigned int quirks;
+
} V4L2m2mContext;
typedef struct V4L2m2mPriv {
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index c9655bcc3b..e2b10f5e3a 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -540,6 +540,34 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
#endif
+static int
+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
+{
+ struct v4l2_capability cap;
+
+ memset(&cap, 0, sizeof(cap));
+ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
+ int err = errno;
+ if (err == EINTR)
+ continue;
+ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
+ return AVERROR(err);
+ }
+
+ // Could be made table driven if we have a few more but right now there
+ // seems no point
+
+ // Meson (amlogic) always gives a resolution changed event after output
+ // streamon and userspace must (re)allocate capture buffers and streamon
+ // capture to clear the event even if the capture buffers were the right
+ // size in the first place.
+ if (strcmp(cap.driver, "meson-vdec") == 0)
+ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS;
+
+ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
+ return 0;
+}
+
// This heuristic is for H264 but use for everything
static uint32_t max_coded_size(const AVCodecContext * const avctx)
{
@@ -646,7 +674,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
return ret;
}
- return v4l2_prepare_decoder(s);
+ if ((ret = v4l2_prepare_decoder(s)) < 0)
+ return ret;
+
+ return get_quirks(avctx, s);
}
static av_cold int v4l2_decode_close(AVCodecContext *avctx)
From 2470968adf0d28bbaf310e782720dd00d57d7bf6 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jan 2022 16:58:31 +0000
Subject: [PATCH 041/161] v4l2_buffers: rework flags for keyframe
Previously flags could become confused and keyframe info could be lost.
This fixes that and removes the duplicate flags field in V4L2Buffer.
---
libavcodec/v4l2_buffers.c | 15 ++++++++++-----
libavcodec/v4l2_buffers.h | 1 -
libavcodec/v4l2_context.c | 18 +++++++++++++++++-
3 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 2cf7be6632..62d1c26053 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -680,7 +680,9 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
{
- out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME);
+ out->buf.flags = frame->key_frame ?
+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
// Beware that colour info is held in format rather than the actual
// v4l2 buffer struct so this may not be as useful as you might hope
v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
@@ -706,6 +708,10 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
/* 2. get frame information */
frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
+ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
+ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
+ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
+ AV_PICTURE_TYPE_NONE;
frame->color_primaries = v4l2_get_color_primaries(avbuf);
frame->colorspace = v4l2_get_color_space(avbuf);
frame->color_range = v4l2_get_color_range(avbuf);
@@ -779,8 +785,9 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
v4l2_set_pts(out, pkt->pts);
- if (pkt->flags & AV_PKT_FLAG_KEY)
- out->flags = V4L2_BUF_FLAG_KEYFRAME;
+ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
return ret;
}
@@ -924,8 +931,6 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
int ret;
int qc;
- avbuf->buf.flags = avbuf->flags;
-
if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
avbuf->context->name, avbuf->buf.index,
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index 641e0e147b..3b7ca4d99e 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -73,7 +73,6 @@ typedef struct V4L2Buffer {
struct v4l2_buffer buf;
struct v4l2_plane planes[VIDEO_MAX_PLANES];
- int flags;
enum V4L2Buffer_status status;
} V4L2Buffer;
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index c11b5e6863..53b522d43e 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -527,6 +527,22 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout
}
}
+// Clear out flags and timestamps that should be set by the user
+// Returns the passed avbuf
+static V4L2Buffer *
+clean_v4l2_buffer(V4L2Buffer * const avbuf)
+{
+ struct v4l2_buffer *const buf = &avbuf->buf;
+
+ buf->flags = 0;
+ buf->field = V4L2_FIELD_ANY;
+ buf->timestamp = (struct timeval){0};
+ buf->timecode = (struct v4l2_timecode){0};
+ buf->sequence = 0;
+
+ return avbuf;
+}
+
static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
{
int i;
@@ -542,7 +558,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
for (i = 0; i < ctx->num_buffers; i++) {
V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
if (avbuf->status == V4L2BUF_AVAILABLE)
- return avbuf;
+ return clean_v4l2_buffer(avbuf);
}
return NULL;
From 5dc38f5d088beea4da57e82969643cc831c40cf0 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 22 Mar 2022 11:44:30 +0000
Subject: [PATCH 042/161] v4l2m2m: Rework decode to wait for missing buffer,
add dynamic pending
Previously receive_frame exited with EAGAIN if no capture buffer
available in the Q. Now it waits in the hope that another thread will
post one.
The prefer dQ logic is now dynamic to help with cases where PTS/DTS
lies. If it looks like we are never getting a frame then the
threshold is increased. It then slowly decays over time to cope with
false alarms.
---
libavcodec/v4l2_buffers.c | 6 +++--
libavcodec/v4l2_context.c | 7 +++--
libavcodec/v4l2_context.h | 3 +++
libavcodec/v4l2_m2m.h | 2 ++
libavcodec/v4l2_m2m_dec.c | 57 +++++++++++++++++++++++++++++++++++++--
5 files changed, 69 insertions(+), 6 deletions(-)
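A note on the threshold logic in the hunk below: prefer_dq compares the
number of pending frames against pending_hw/16; pending_hw decays by one
on every successful dequeue (floored at PENDING_HW_MIN) and jumps to
pending*16 + PENDING_HW_OFFSET after PENDING_N_THRESHOLD consecutive
misses. A minimal standalone C sketch of that behaviour follows; the
struct and main() are invented for illustration only and are not part of
the diff.

#include <stdio.h>

#define PENDING_HW_MIN      (3 * 16)   /* floor: prefer dQ above 3 pending frames */
#define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
#define PENDING_N_THRESHOLD 6          /* consecutive misses before raising */

struct model { int pending_hw, pending_n; };

/* got_frame: 1 if a capture frame was dequeued this pass, 0 on EAGAIN */
static void update(struct model *m, int pending, int got_frame)
{
    const int prefer_dq = pending > m->pending_hw / 16;

    if (got_frame) {
        if (--m->pending_hw < PENDING_HW_MIN)            /* slow decay on success */
            m->pending_hw = PENDING_HW_MIN;
        m->pending_n = 0;
    } else if (prefer_dq && ++m->pending_n > PENDING_N_THRESHOLD) {
        m->pending_hw = pending * 16 + PENDING_HW_OFFSET; /* raise threshold */
        m->pending_n = 0;
    }
}

int main(void)
{
    struct model m = { PENDING_HW_MIN, 0 };
    for (int i = 0; i != 10; ++i)
        update(&m, 8, 0);              /* 8 frames pending, nothing dequeued */
    printf("prefer-dq threshold now %d frames\n", m.pending_hw / 16);
    return 0;
}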
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 62d1c26053..8c4f18dbed 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -947,12 +947,14 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
return AVERROR(err);
}
+ // Lock not wanted - if called from buffer free then lock already obtained
qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
+ avbuf->status = V4L2BUF_IN_DRIVER;
+ pthread_cond_broadcast(&avbuf->context->cond);
+
av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
avbuf->context->name, avbuf->buf.index,
avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
- avbuf->status = V4L2BUF_IN_DRIVER;
-
return 0;
}
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 53b522d43e..7ddb759810 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -300,6 +300,7 @@ static int v4l2_stop_encode(V4L2Context *ctx)
// Returns:
// 0 Success
// AVERROR(EPIPE) Nothing more to read
+// AVERROR(ENOSPC) No buffers in Q to put result in
// * AVERROR(..)
static int
@@ -457,7 +458,7 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout
(pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
(pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
- return AVERROR(EAGAIN);
+ return AVERROR(ENOSPC);
}
// Timeout kludged s.t. "forever" eventually gives up & produces logging
@@ -864,7 +865,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
int rv;
if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
- return rv;
+ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
}
@@ -938,6 +939,7 @@ void ff_v4l2_context_release(V4L2Context* ctx)
av_buffer_unref(&ctx->frames_ref);
ff_mutex_destroy(&ctx->lock);
+ pthread_cond_destroy(&ctx->cond);
}
@@ -1013,6 +1015,7 @@ int ff_v4l2_context_init(V4L2Context* ctx)
}
ff_mutex_init(&ctx->lock, NULL);
+ pthread_cond_init(&ctx->cond, NULL);
atomic_init(&ctx->q_count, 0);
if (s->output_drm) {
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 565858a1ed..0efff58f18 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -116,6 +116,7 @@ typedef struct V4L2Context {
struct ff_weak_link_master *wl_master;
AVMutex lock;
+ pthread_cond_t cond;
} V4L2Context;
/**
@@ -182,6 +183,8 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
* @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
*
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+ * AVERROR(ENOSPC) if no buffer available to put
+ * the frame in
*/
int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index f1923bb26d..9a20447030 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -105,6 +105,8 @@ typedef struct V4L2m2mContext {
/* Frame tracking */
xlat_track_t xlat;
+ int pending_hw;
+ int pending_n;
pts_stats_t pts_stat;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index e2b10f5e3a..2e30449dfc 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -251,7 +251,8 @@ xlat_pts_out(AVCodecContext *const avctx,
frame->best_effort_timestamp = pts_stats_guess(ps);
frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner?
- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
return 0;
}
@@ -422,6 +423,36 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
return ret;
}
+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
+{
+ int rv = 0;
+
+ ff_mutex_lock(&ctx->lock);
+
+ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
+ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) {
+ rv = AVERROR(errno);
+ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
+ break;
+ }
+ }
+
+ ff_mutex_unlock(&ctx->lock);
+ return rv;
+}
+
+// Number of frames over what xlat_pending returns that we keep *16
+// This is a min value - if it appears to be too small the threshold should
+// adjust dynamically.
+#define PENDING_HW_MIN (3 * 16)
+// Offset to use when setting dynamically
+// Set to %16 == 15 to avoid the threshold changing immediately as we relax
+#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1)
+// Number of consecutive times we've failed to get a frame when we prefer it
+// before we increase the prefer threshold (5ms * N = max expected decode
+// time)
+#define PENDING_N_THRESHOLD 6
+
static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
@@ -431,7 +462,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
do {
const int pending = xlat_pending(&s->xlat);
- const int prefer_dq = (pending > 5);
+ const int prefer_dq = (pending > s->pending_hw / 16);
// Enqueue another pkt for decode if
// (a) We don't have a lot of stuff in the buffer already OR
@@ -465,6 +496,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
// there is room in the input Q and timeout == -1
dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
+ // Failure due to no buffer in Q?
+ if (dst_rv == AVERROR(ENOSPC)) {
+ // Wait & retry
+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
+ }
+ }
+
+ // Adjust dynamic pending threshold
+ if (dst_rv == 0) {
+ if (--s->pending_hw < PENDING_HW_MIN)
+ s->pending_hw = PENDING_HW_MIN;
+ s->pending_n = 0;
+ }
+ else if (dst_rv == AVERROR(EAGAIN)) {
+ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
+ s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
+ s->pending_n = 0;
+ }
+ }
+
if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
dst_rv = AVERROR_EOF;
@@ -613,6 +665,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
xlat_init(&s->xlat);
pts_stats_init(&s->pts_stat, avctx, "decoder");
+ s->pending_hw = PENDING_HW_MIN;
capture = &s->capture;
output = &s->output;
From 33765b769b4301e03f31b65e225fcdb0eff4c0e4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 25 Mar 2022 15:37:58 +0000
Subject: [PATCH 043/161] v4l2_m2m_dec: Avoid loop if unable to resize buffers
If a source change signals a buffer size that cannot be honored, give up
rather than looping indefinitely. This happens on Pi if (say) a
2560x1440 h264 stream is presented to the decoder.
---
libavcodec/v4l2_context.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 7ddb759810..007a58c8f1 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -205,8 +205,9 @@ static int do_source_change(V4L2m2mContext * const s)
s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n",
+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
+ s->capture.width, s->capture.height,
s->capture.selection.width, s->capture.selection.height,
s->capture.selection.left, s->capture.selection.top, reinit);
@@ -224,9 +225,17 @@ static int do_source_change(V4L2m2mContext * const s)
return AVERROR(EINVAL);
}
+ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
+ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
+ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
+ s->capture.width, s->capture.height,
+ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
+ return AVERROR(EINVAL);
+ }
+
// Update pixel format - should only actually do something on initial change
s->capture.av_pix_fmt =
- ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
+ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
if (s->output_drm) {
avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
avctx->sw_pix_fmt = s->capture.av_pix_fmt;
From bb7ad2392ce83149a1ba40ecacb36e051b6bf785 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 25 Mar 2022 18:14:40 +0000
Subject: [PATCH 044/161] v4l2dec: Improve size/format validation on init
---
libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++--
libavcodec/v4l2_request_hevc.c | 11 +++++
2 files changed, 92 insertions(+), 3 deletions(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 2e30449dfc..8dcadf461b 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -592,6 +592,76 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
#endif
+static int
+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
+{
+ unsigned int i;
+ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
+ const uint32_t w = avctx->coded_width;
+ const uint32_t h = avctx->coded_height;
+
+ if (w == 0 || h == 0 || fcc == 0) {
+ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
+ return 0;
+ }
+
+ for (i = 0;; ++i) {
+ struct v4l2_frmsizeenum fs = {
+ .index = i,
+ .pixel_format = fcc,
+ };
+
+ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
+ const int err = AVERROR(errno);
+ if (err == AVERROR(EINTR))
+ continue;
+ if (i == 0 && err == AVERROR(ENOTTY)) {
+ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
+ return 0;
+ }
+ if (err != AVERROR(EINVAL)) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
+ return err;
+ }
+ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n",
+ w, h, av_fourcc2str(fcc));
+ return err;
+ }
+
+ switch (fs.type) {
+ case V4L2_FRMSIZE_TYPE_DISCRETE:
+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
+ fs.discrete.width,fs.discrete.height);
+ if (w == fs.discrete.width && h == fs.discrete.height)
+ return 0;
+ break;
+ case V4L2_FRMSIZE_TYPE_STEPWISE:
+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
+ fs.stepwise.min_width, fs.stepwise.min_height,
+ fs.stepwise.max_width, fs.stepwise.max_height,
+ fs.stepwise.step_width,fs.stepwise.step_height);
+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
+ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 &&
+ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0)
+ return 0;
+ break;
+ case V4L2_FRMSIZE_TYPE_CONTINUOUS:
+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
+ fs.stepwise.min_width, fs.stepwise.min_height,
+ fs.stepwise.max_width, fs.stepwise.max_height,
+ fs.stepwise.step_width,fs.stepwise.step_height);
+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
+ return 0;
+ break;
+ default:
+ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type);
+ return AVERROR(EINVAL);
+ }
+ }
+}
+
static int
get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
{
@@ -698,8 +768,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
avctx->sw_pix_fmt = avctx->pix_fmt;
gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
- av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
- avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
+ avctx->coded_width, avctx->coded_height,
+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
@@ -730,7 +802,13 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
if ((ret = v4l2_prepare_decoder(s)) < 0)
return ret;
- return get_quirks(avctx, s);
+ if ((ret = get_quirks(avctx, s)) != 0)
+ return ret;
+
+ if ((ret = check_size(avctx, s)) != 0)
+ return ret;
+
+ return 0;
}
static av_cold int v4l2_decode_close(AVCodecContext *avctx)
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index b0a5930844..76ab0916cd 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -147,6 +147,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
+ // Give up immediately if this is something that we have no code to deal with
+ if (h->ps.sps->chroma_format_idc != 1) {
+ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
+ return AVERROR_PATCHWELCOME;
+ }
+ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
+ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
+ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
+ return AVERROR_PATCHWELCOME;
+ }
+
if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
return (AVERROR(-ret));
From 4646b558c0e45f506578a5a452820f55983abc82 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 13 Apr 2022 16:05:56 +0000
Subject: [PATCH 045/161] v4l2 stateless hevc: Add another API variation for
linux 5.18
This is probably going to be a short lived variation and may end up
being reverted if no release using it ever ends up in the wild.
---
libavcodec/Makefile | 2 +-
libavcodec/hevc-ctrls-v3.h | 255 +++++++++++++++++++++++++++++++++
libavcodec/v4l2_req_hevc_v3.c | 3 +
libavcodec/v4l2_req_hevc_vx.c | 17 +++
libavcodec/v4l2_req_media.c | 15 +-
libavcodec/v4l2_req_media.h | 3 +
libavcodec/v4l2_request_hevc.c | 6 +-
libavcodec/v4l2_request_hevc.h | 1 +
8 files changed, 295 insertions(+), 7 deletions(-)
create mode 100644 libavcodec/hevc-ctrls-v3.h
create mode 100644 libavcodec/v4l2_req_hevc_v3.c
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index e1aa0ba014..2b3c16185d 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o
OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\
- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o
+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o
OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h
new file mode 100644
index 0000000000..4e35bd583d
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v3.h
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are the HEVC state controls for use with stateless HEVC
+ * codec drivers.
+ *
+ * It turns out that these structs are not stable yet and will undergo
+ * more changes. So keep them private until they are stable and ready to
+ * become part of the official public API.
+ */
+
+#ifndef _HEVC_CTRLS_H_
+#define _HEVC_CTRLS_H_
+
+#include <linux/videodev2.h>
+
+/* The pixel format isn't stable at the moment and will likely be renamed. */
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
+
+/* enum v4l2_ctrl_type type values */
+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
+
+enum v4l2_mpeg_video_hevc_decode_mode {
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_mpeg_video_hevc_start_code {
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/* The controls are not stable at the moment and will likely be reworked. */
+struct v4l2_ctrl_hevc_sps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
+
+struct v4l2_ctrl_hevc_pps {
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
+ __u8 num_extra_slice_header_bits;
+ __u8 num_ref_idx_l0_default_active_minus1;
+ __u8 num_ref_idx_l1_default_active_minus1;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+
+ __u8 padding[4];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 flags;
+ __u8 field_pic;
+ __u16 pic_order_cnt[2];
+ __u8 padding[2];
+};
+
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 padding[6];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_bit_offset;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __u16 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+
+ __u8 padding[5];
+
+ __u32 entry_point_offset_minus1[256];
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
+
+struct v4l2_ctrl_hevc_decode_params {
+ __s32 pic_order_cnt_val;
+ __u8 num_active_dpb_entries;
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 num_poc_st_curr_before;
+ __u8 num_poc_st_curr_after;
+ __u8 num_poc_lt_curr;
+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u64 flags;
+};
+
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
+/*
+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
+ * the number of data (in bits) to skip in the
+ * slice segment header.
+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
+ * to before syntax element "slice_temporal_mvp_enabled_flag".
+ * If IDR, the skipped bits are just "pic_output_flag"
+ * (separate_colour_plane_flag is not supported).
+ */
+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
+
+#endif
diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c
new file mode 100644
index 0000000000..dcc8d95632
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v3.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 3
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 0ae03b10c4..611fa21cc3 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -16,6 +16,8 @@
#elif HEVC_CTRLS_VERSION == 2
#include "hevc-ctrls-v2.h"
+#elif HEVC_CTRLS_VERSION == 3
+#include "hevc-ctrls-v3.h"
#else
#error Unknown HEVC_CTRLS_VERSION
#endif
@@ -147,6 +149,7 @@ static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_t
}
}
+#if HEVC_CTRLS_VERSION <= 2
static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
{
const HEVCFrame *frame;
@@ -172,6 +175,7 @@ static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
return 0;
}
+#endif
static unsigned int
get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
@@ -247,7 +251,12 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const
struct v4l2_hevc_dpb_entry * const entry = entries + n++;
entry->timestamp = frame_capture_dpb(frame->frame);
+#if HEVC_CTRLS_VERSION <= 2
entry->rps = find_frame_rps_type(h, entry->timestamp);
+#else
+ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
+ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
+#endif
entry->field_pic = frame->frame->interlaced_frame;
/* TODO: Interleaved: Get the POC for each field. */
@@ -1011,6 +1020,14 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
};
const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
+#if HEVC_CTRLS_VERSION == 2
+ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
+ return AVERROR(EINVAL);
+#elif HEVC_CTRLS_VERSION == 3
+ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
+ return AVERROR(EINVAL);
+#endif
+
if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
return AVERROR(EINVAL);
diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
index eb00ecb406..980b306b8a 100644
--- a/libavcodec/v4l2_req_media.c
+++ b/libavcodec/v4l2_req_media.c
@@ -604,6 +604,7 @@ struct mediabufs_ctl {
struct v4l2_format src_fmt;
struct v4l2_format dst_fmt;
+ struct v4l2_capability capability;
};
static int qe_v4l2_queue(struct qent_base *const be,
@@ -1498,20 +1499,24 @@ void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
mediabufs_ctl_delete(mbc);
}
+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
+{
+ return mbc->capability.version;
+}
+
static int set_capabilities(struct mediabufs_ctl *const mbc)
{
- struct v4l2_capability capability = { 0 };
uint32_t caps;
- if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) {
+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
int err = errno;
request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
return -err;
}
- caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
- capability.device_caps :
- capability.capabilities;
+ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
+ mbc->capability.device_caps :
+ mbc->capability.capabilities;
if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
index 2f826cfb14..0307a831de 100644
--- a/libavcodec/v4l2_req_media.h
+++ b/libavcodec/v4l2_req_media.h
@@ -142,6 +142,9 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
struct dmabufs_ctl * const dbsc,
unsigned int n);
+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
+
struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
const char *vpath, struct pollqueue *const pq);
void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index 76ab0916cd..20e4e0ab15 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
goto fail4;
}
- if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
+ if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 3);
+ }
+ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
ctx->fns = &V2(ff_v4l2_req_hevc, 2);
}
diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
index f14f594564..ed48d62e2d 100644
--- a/libavcodec/v4l2_request_hevc.h
+++ b/libavcodec/v4l2_request_hevc.h
@@ -98,5 +98,6 @@ typedef struct v4l2_req_decode_fns {
extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
#endif
From 92160173e701aa7e2f1011e63596e48d15e691a9 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 3 May 2022 12:44:42 +0000
Subject: [PATCH 046/161] Remove V4l2 frame size check for meson-vdec
---
libavcodec/v4l2_m2m.h | 3 ++-
libavcodec/v4l2_m2m_dec.c | 10 +++++++---
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 9a20447030..6bd5e8eda7 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -116,7 +116,8 @@ typedef struct V4L2m2mContext {
/* Ext data sent */
int extdata_sent;
-#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
+#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2
/* Quirks */
unsigned int quirks;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 8dcadf461b..888ba67fea 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -604,6 +604,10 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
return 0;
}
+ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
+ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
+ return 0;
+ }
for (i = 0;; ++i) {
struct v4l2_frmsizeenum fs = {
@@ -623,8 +627,8 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
return err;
}
- av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n",
- w, h, av_fourcc2str(fcc));
+ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
+ w, h, av_fourcc2str(fcc), i);
return err;
}
@@ -684,7 +688,7 @@ get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
// capture to clear the event even if the capture buffers were the right
// size in the first place.
if (strcmp(cap.driver, "meson-vdec") == 0)
- s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS;
+ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
return 0;
From 8ba5576e7fcd24c2f450f0295cc3b6d8e82e8649 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 23 May 2022 18:05:20 +0100
Subject: [PATCH 047/161] v4l2m2m_dec: Make some error returns a bit more robust
---
libavcodec/v4l2_context.c | 5 ++---
libavcodec/v4l2_m2m_dec.c | 23 ++++++++++++++---------
2 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 007a58c8f1..b3662aedaa 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -765,7 +765,7 @@ static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
{
int type = ctx->type;
- int ret;
+ int ret = 0;
AVCodecContext * const avctx = logger(ctx);
// Avoid doing anything if there is nothing we can do
@@ -777,8 +777,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
stuff_all_buffers(avctx, ctx);
- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
- if (ret < 0) {
+ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
const int err = errno;
av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 888ba67fea..88a341aae2 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -110,16 +110,21 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co
return 0;
ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
- if (ret < 0)
- av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
-
- ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
- if (ret < 0)
- av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
- else
- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n");
+ if (ret != 0) {
+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
+ return ret;
+ }
- return ret;
+ // STREAMON should do implicit START so this is just for those that don't.
+ // It is optional so don't worry if it fails
+ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
+ ret = AVERROR(errno);
+ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
+ }
+ else {
+ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
+ }
+ return 0;
}
static int v4l2_try_start(AVCodecContext *avctx)
From aafa5968f8713319be35cf26069c98566d5bf59b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 24 May 2022 17:02:58 +0000
Subject: [PATCH 048/161] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA
Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA. Should
also detect and complain about unexpected streams of empty packets.
This functionality is untested as I haven't yet found anything that creates
NEW_EXTRADATA side data.
---
libavcodec/v4l2_m2m.c | 1 +
libavcodec/v4l2_m2m.h | 3 +++
libavcodec/v4l2_m2m_dec.c | 49 ++++++++++++++++++++++++++++++++++++---
3 files changed, 50 insertions(+), 3 deletions(-)
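Since nothing readily generates this side data yet, a caller wanting to
exercise the path can attach it to a coded packet with the public packet
API before sending the packet to the decoder. A minimal sketch, where the
helper name and the extradata buffer are placeholders for the example:

#include <string.h>
#include <libavcodec/avcodec.h>

/* Attach replacement extradata to a packet so the decoder sees it as
 * AV_PKT_DATA_NEW_EXTRADATA on the next enqueue. */
static int attach_new_extradata(AVPacket *pkt, const uint8_t *extradata, size_t size)
{
    uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, size);
    if (!sd)
        return AVERROR(ENOMEM);
    memcpy(sd, extradata, size);
    return 0;
}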
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index e26bd74c3e..6dd01e2e00 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -251,6 +251,7 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
av_frame_unref(s->frame);
av_frame_free(&s->frame);
av_packet_unref(&s->buf_pkt);
+ av_freep(&s->extdata_data);
av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 6bd5e8eda7..19d618698d 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -115,6 +115,9 @@ typedef struct V4L2m2mContext {
/* Ext data sent */
int extdata_sent;
+ /* Ext data sent in packet - overrides ctx */
+ uint8_t * extdata_data;
+ size_t extdata_size;
#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 88a341aae2..392a68f0c7 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -343,7 +343,46 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
// We will already have a coded pkt if the output Q was full last time we
// tried to Q it
if (!s->buf_pkt.size && !do_not_get) {
- ret = ff_decode_get_packet(avctx, &s->buf_pkt);
+ unsigned int i;
+
+ for (i = 0; i < 256; ++i) {
+ uint8_t * side_data;
+ size_t side_size;
+
+ ret = ff_decode_get_packet(avctx, &s->buf_pkt);
+ if (ret != 0)
+ break;
+
+ // New extradata is the only side-data we understand
+ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+ if (side_data) {
+ av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
+ av_freep(&s->extdata_data);
+ if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size);
+ return AVERROR(ENOMEM);
+ }
+ memcpy(s->extdata_data, side_data, side_size);
+ s->extdata_size = side_size;
+ s->extdata_sent = 0;
+ }
+
+ if (s->buf_pkt.size != 0)
+ break;
+
+ if (s->buf_pkt.side_data_elems == 0) {
+ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
+ ret = AVERROR_EOF;
+ break;
+ }
+
+ // Retry a side-data only pkt
+ }
+ // If i >= 256 something has gone wrong
+ if (i >= 256) {
+ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
+ return AVERROR(EIO);
+ }
if (ret == AVERROR(EAGAIN)) {
if (!stream_started(s)) {
@@ -398,8 +437,12 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
if ((ret = check_output_streamon(avctx, s)) != 0)
return ret;
- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
- avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size);
+ if (s->extdata_sent)
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
+ else if (s->extdata_data)
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
+ else
+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
if (ret == AVERROR(EAGAIN)) {
// Out of input buffers - keep packet
From e9bced67bdb40096d31067d41956276e9e1af11a Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 24 May 2022 20:02:48 +0000
Subject: [PATCH 049/161] v4l2m2m_dec: Catch repeated Q fulls
---
libavcodec/v4l2_m2m_dec.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 392a68f0c7..7e17044706 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -504,13 +504,14 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- int src_rv;
+ int src_rv = NQ_OK;
int dst_rv = 1; // Non-zero (done), non-negative (error) number
unsigned int i = 0;
do {
const int pending = xlat_pending(&s->xlat);
const int prefer_dq = (pending > s->pending_hw / 16);
+ const int last_src_rv = src_rv;
// Enqueue another pkt for decode if
// (a) We don't have a lot of stuff in the buffer already OR
@@ -526,6 +527,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
break;
+ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
+ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
+ break;
+ }
+
// Try to get a new frame if
// (a) we haven't already got one AND
// (b) enqueue returned a status indicating that decode should be attempted
From 0c974e4da2c0311836145f2fd42081d40eb15998 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 25 May 2022 15:22:12 +0000
Subject: [PATCH 050/161] Remove requirement for epoxy & libudev config options
---
configure | 26 +++++++++++++++++---------
pi-util/conf_native.sh | 2 --
2 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/configure b/configure
index b41663c794..fdc95146bf 100755
--- a/configure
+++ b/configure
@@ -205,6 +205,7 @@ External library support:
--disable-bzlib disable bzlib [autodetect]
--disable-coreimage disable Apple CoreImage framework [autodetect]
--enable-chromaprint enable audio fingerprinting with chromaprint [no]
+ --disable-epoxy disable epoxy [autodetect]
--enable-frei0r enable frei0r video filtering [no]
--enable-gcrypt enable gcrypt, needed for rtmp(t)e support
if openssl, librtmp or gmp is not used [no]
@@ -281,7 +282,7 @@ External library support:
if openssl, gnutls or mbedtls is not used [no]
--enable-libtwolame enable MP2 encoding via libtwolame [no]
--enable-libuavs3d enable AVS3 decoding via libuavs3d [no]
- --enable-libudev enable libudev [no]
+ --disable-libudev disable libudev [autodetect]
--enable-libv4l2 enable libv4l2/v4l-utils [no]
--enable-libvidstab enable video stabilization using vid.stab [no]
--enable-libvmaf enable vmaf filter via libvmaf [no]
@@ -1747,7 +1748,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
avfoundation
bzlib
coreimage
+ epoxy
iconv
+ libudev
libxcb
libxcb_shm
libxcb_shape
@@ -1819,7 +1822,6 @@ EXTERNAL_LIBRARY_LIST="
libdav1d
libdc1394
libdrm
- epoxy
libflite
libfontconfig
libfreetype
@@ -1863,7 +1865,6 @@ EXTERNAL_LIBRARY_LIST="
libtheora
libtwolame
libuavs3d
- libudev
libv4l2
libvmaf
libvorbis
@@ -3567,9 +3568,8 @@ v4l2_indev_suggest="libv4l2"
v4l2_outdev_deps="libdrm"
v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
v4l2_outdev_suggest="libv4l2"
-vout_drm_outdev_deps="libdrm vout_drm"
-vout_egl_outdev_deps="xlib"
-vout_egl_outdev_select="epoxy"
+vout_drm_outdev_deps="libdrm"
+vout_egl_outdev_deps="xlib epoxy"
vfwcap_indev_deps="vfw32 vfwcap_defines"
xcbgrab_indev_deps="libxcb"
xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
@@ -6355,6 +6355,12 @@ if enabled xlib; then
disable xlib
fi
+enabled libudev &&
+ check_pkg_config libudev libudev libudev.h udev_new
+
+enabled epoxy &&
+ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
+
check_headers direct.h
check_headers dirent.h
check_headers dxgidebug.h
@@ -6601,7 +6607,6 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d
enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open
enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new
enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion
-enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen ||
{ require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac &&
warn "using libfdk without pkg-config"; } }
@@ -6713,7 +6718,6 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame
{ check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode
-enabled libudev && require_pkg_config libudev libudev libudev.h udev_new
enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl
enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit
enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init
@@ -6819,9 +6823,13 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r
enabled v4l2_request && { enabled libdrm ||
die "ERROR: v4l2-request requires --enable-libdrm"; } &&
{ enabled libudev ||
- die "ERROR: v4l2-request requires --enable-libudev"; }
+ die "ERROR: v4l2-request requires libudev"; }
enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
+
+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
+ { enabled xlib || die "ERROR: vout_egl requires xlib"; }
if enabled gcrypt; then
GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
index 65576846e8..37cea71756 100755
--- a/pi-util/conf_native.sh
+++ b/pi-util/conf_native.sh
@@ -91,8 +91,6 @@ $FFSRC/configure \
--disable-thumb\
--enable-v4l2-request\
--enable-libdrm\
- --enable-epoxy\
- --enable-libudev\
--enable-vout-egl\
--enable-vout-drm\
$SHARED_LIBS\
From 9f234d8cbde2829e6a70fd3cb6324998df8a31f3 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 27 May 2022 09:36:51 +0000
Subject: [PATCH 051/161] hevc: If hwaccel avoid creation of s/w only vars
---
libavcodec/hevc_refs.c | 35 +++++++++++++++++++++--------------
libavcodec/hevcdec.c | 42 +++++++++++++++++++++++++++++-------------
2 files changed, 50 insertions(+), 27 deletions(-)
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index 811e8feff8..f7cf14eabc 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -98,18 +98,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
if (!frame->rpl_buf)
goto fail;
- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
- if (!frame->tab_mvf_buf)
- goto fail;
- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
+ if (s->tab_mvf_pool) {
+ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+ if (!frame->tab_mvf_buf)
+ goto fail;
+ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
+ }
- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
- if (!frame->rpl_tab_buf)
- goto fail;
- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
- for (j = 0; j < frame->ctb_count; j++)
- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
+ if (s->rpl_tab_pool) {
+ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+ if (!frame->rpl_tab_buf)
+ goto fail;
+ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
+ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+ for (j = 0; j < frame->ctb_count; j++)
+ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
+ }
frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
@@ -297,14 +301,17 @@ static int init_slice_rpl(HEVCContext *s)
int ctb_count = frame->ctb_count;
int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
int i;
+ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
return AVERROR_INVALIDDATA;
- for (i = ctb_addr_ts; i < ctb_count; i++)
- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+ if (frame->rpl_tab) {
+ for (i = ctb_addr_ts; i < ctb_count; i++)
+ frame->rpl_tab[i] = tab;
+ }
- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
+ frame->refPicList = tab->refPicList;
return 0;
}
diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
index 2867cb2e16..17f53322fb 100644
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@@ -536,6 +536,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps,
if (!sps)
return 0;
+ // If hwaccel then we don't need all the s/w decode helper arrays
+ if (s->avctx->hwaccel) {
+ export_stream_params(s, sps);
+
+ s->avctx->pix_fmt = pix_fmt;
+ s->ps.sps = sps;
+ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
+ return 0;
+ }
+
ret = pic_arrays_init(s, sps);
if (ret < 0)
goto fail;
@@ -2890,11 +2900,13 @@ static int hevc_frame_start(HEVCContext *s)
((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
int ret;
- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
- memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
+ if (s->horizontal_bs) {
+ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
+ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
+ }
s->is_decoded = 0;
s->first_nal_type = s->nal_unit_type;
@@ -3438,15 +3450,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
dst->needs_fg = 1;
}
- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
- if (!dst->tab_mvf_buf)
- goto fail;
- dst->tab_mvf = src->tab_mvf;
+ if (src->tab_mvf_buf) {
+ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+ if (!dst->tab_mvf_buf)
+ goto fail;
+ dst->tab_mvf = src->tab_mvf;
+ }
- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
- if (!dst->rpl_tab_buf)
- goto fail;
- dst->rpl_tab = src->rpl_tab;
+ if (src->rpl_tab_buf) {
+ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+ if (!dst->rpl_tab_buf)
+ goto fail;
+ dst->rpl_tab = src->rpl_tab;
+ }
dst->rpl_buf = av_buffer_ref(src->rpl_buf);
if (!dst->rpl_buf)
From bb2ddc480634141bed9afd3f66e7f63f5091bb2f Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 30 May 2022 17:51:44 +0100
Subject: [PATCH 052/161] rpi_sand: Add SAND30->NV12 conversion
C code only. Reworks the hwcontext_drm conversion to use the
rpi_sand_fns generic frame convert fn rather than calling the
individual conversion functions directly. This keeps all the stride and
size logic in a single place.
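
A minimal sketch of the resulting call pattern, assuming a pre-allocated
YUV420P or NV12 dst frame and a SAND-format src frame whose crop fields have
been zeroed as the hwcontext_drm code now does (the wrapper name is
illustrative, not part of this patch):

    #include "libavutil/error.h"
    #include "libavutil/frame.h"
    #include "libavutil/rpi_sand_fns.h"

    // All stride1/stride2 and per-format plane logic now lives inside
    // av_rpi_sand_to_planar_frame(); it returns non-zero when the dst pixel
    // format is not a supported conversion target.
    static int copy_sand_to_planar(AVFrame * const dst, const AVFrame * const src)
    {
        if (av_rpi_sand_to_planar_frame(dst, src) != 0)
            return AVERROR(EINVAL);  // incompatible output pixfmt for sand
        return 0;
    }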
---
libavutil/hwcontext_drm.c | 46 ++++++++------------
libavutil/rpi_sand_fns.c | 89 +++++++++++++++++++++++++++++++++++++++
libavutil/rpi_sand_fns.h | 5 +++
3 files changed, 111 insertions(+), 29 deletions(-)
diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
index baf18920fa..137a952d2c 100644
--- a/libavutil/hwcontext_drm.c
+++ b/libavutil/hwcontext_drm.c
@@ -234,14 +234,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
enum AVHWFrameTransferDirection dir,
enum AVPixelFormat **formats)
{
- enum AVPixelFormat *pix_fmts;
+ enum AVPixelFormat *p;
- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
- if (!pix_fmts)
+ p = *formats = av_malloc_array(3, sizeof(*p));
+ if (!p)
return AVERROR(ENOMEM);
// **** Offer native sand too ????
- pix_fmts[0] =
+ *p++ =
#if CONFIG_SAND
ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
AV_PIX_FMT_YUV420P :
@@ -249,9 +249,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
AV_PIX_FMT_YUV420P10LE :
#endif
ctx->sw_format;
- pix_fmts[1] = AV_PIX_FMT_NONE;
- *formats = pix_fmts;
+#if CONFIG_SAND
+ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
+ *p++ = AV_PIX_FMT_NV12;
+#endif
+
+ *p = AV_PIX_FMT_NONE;
return 0;
}
@@ -294,29 +299,12 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
const unsigned int w = FFMIN(dst->width, map->width);
const unsigned int h = FFMIN(dst->height, map->height);
- if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) {
- av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
- map->data[0],
- 128, stride2,
- 0, 0, w, h);
- av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
- dst->data[2], dst->linesize[2],
- map->data[1],
- 128, stride2,
- 0, 0, w / 2, h / 2);
- }
- else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) {
- av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
- map->data[0],
- 128, stride2,
- 0, 0, w, h);
- av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
- dst->data[2], dst->linesize[2],
- map->data[1],
- 128, stride2,
- 0, 0, w / 2, h / 2);
- }
- else
+ map->crop_top = 0;
+ map->crop_bottom = 0;
+ map->crop_left = 0;
+ map->crop_right = 0;
+
+ if (av_rpi_sand_to_planar_frame(dst, map) != 0)
{
av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
err = AVERROR(EINVAL);
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
index 1f543e9357..256c3d532f 100644
--- a/libavutil/rpi_sand_fns.c
+++ b/libavutil/rpi_sand_fns.c
@@ -229,6 +229,75 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_
}
}
+// Fetches a single patch - offscreen fixup not done here
+// w <= stride1
+// simple truncation - loses bottom 2 bits
+// _x & _w in pixels, strides in bytes
+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h)
+{
+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
+ const unsigned int xskip0 = _x - (x0 >> 2) * 3;
+ const unsigned int x1 = ((_x + _w) / 3) * 4;
+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
+ const unsigned int mask = stride1 - 1;
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
+
+#if HAVE_SAND_ASM && 0
+ if (_x == 0) {
+ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
+ return;
+ }
+#endif
+
+ if (x0 == x1) {
+ // *******************
+ // Partial single word xfer
+ return;
+ }
+
+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
+ {
+ unsigned int x = x0;
+ const uint32_t * p = (const uint32_t *)p0;
+ uint8_t * d = dst;
+
+ if (xskip0 != 0) {
+ const uint32_t p3 = *p++;
+
+ if (xskip0 == 1)
+ *d++ = (p3 >> 12) & 0xff;
+ *d++ = (p3 >> 22) & 0xff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ while (x != x1) {
+ const uint32_t p3 = *p++;
+ *d++ = (p3 >> 2) & 0xff;
+ *d++ = (p3 >> 12) & 0xff;
+ *d++ = (p3 >> 22) & 0xff;
+
+ if (((x += 4) & mask) == 0)
+ p += slice_inc;
+ }
+
+ if (xrem1 != 0) {
+ const uint32_t p3 = *p;
+
+ *d++ = (p3 >> 2) & 0xff;
+ if (xrem1 == 2)
+ *d++ = (p3 >> 12) & 0xff;
+ }
+ }
+}
+
+
// w/h in pixels
void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
@@ -310,6 +379,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
x/2, y/2, w/2, h/2);
break;
+ case AV_PIX_FMT_NV12:
+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w, h/2);
+ break;
default:
return -1;
}
@@ -344,6 +423,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
x/2, y/2, w/2, h/2);
break;
+ case AV_PIX_FMT_NV12:
+ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
+ src->data[0],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x, y, w, h);
+ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
+ src->data[1],
+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
+ x/2, y/2, w, h/2);
+ break;
default:
return -1;
}
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
index 634b55e800..462ccb8abd 100644
--- a/libavutil/rpi_sand_fns.h
+++ b/libavutil/rpi_sand_fns.h
@@ -85,6 +85,11 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_
unsigned int _x, unsigned int y,
unsigned int _w, unsigned int h);
+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
// w/h in pixels
void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
From b55c351e6954c800229d97dc6c982ca8f998c848 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 1 Jun 2022 17:49:26 +0000
Subject: [PATCH 053/161] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8
Also reworks the previous Armv8 SAND30->Y16 function in a slightly more
efficient way that makes it look more like the Armv7 version.
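
As a reviewing aid, a scalar sketch of the per-word arithmetic the NEON code
implements, matching the C fallback added in the previous commit (the helper
name is illustrative): each 32-bit SAND30 word packs three 10-bit samples and
the Y8 path keeps only the top 8 bits of each.

    #include <stdint.h>

    // One SAND30 word -> three Y8 bytes: mirrors the (p3 >> 2/12/22) & 0xff
    // expressions in av_rpi_sand30_to_planar_y8(); the bottom 2 bits of each
    // 10-bit sample are discarded.
    static inline void sand30_word_to_y8(uint8_t d[3], const uint32_t p3)
    {
        d[0] = (p3 >> 2)  & 0xff;  // sample 0: bits 0..9
        d[1] = (p3 >> 12) & 0xff;  // sample 1: bits 10..19
        d[2] = (p3 >> 22) & 0xff;  // sample 2: bits 20..29
    }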
---
libavutil/aarch64/rpi_sand_neon.S | 549 ++++++++++++++++++------------
libavutil/aarch64/rpi_sand_neon.h | 4 +
libavutil/arm/rpi_sand_neon.S | 239 ++++++++++---
libavutil/arm/rpi_sand_neon.h | 11 +
libavutil/rpi_sand_fns.c | 2 +-
5 files changed, 541 insertions(+), 264 deletions(-)
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
index cdcf71ee67..2f07d9674c 100644
--- a/libavutil/aarch64/rpi_sand_neon.S
+++ b/libavutil/aarch64/rpi_sand_neon.S
@@ -248,228 +248,6 @@ incomplete_block_loop_end_c8:
ret
endfunc
-//void ff_rpi_sand30_lines_to_planar_y16(
-// uint8_t * dest, // [x0]
-// unsigned int dst_stride, // [w1] -> assumed to be equal to _w
-// const uint8_t * src, // [x2]
-// unsigned int src_stride1, // [w3] -> 128
-// unsigned int src_stride2, // [w4]
-// unsigned int _x, // [w5]
-// unsigned int y, // [w6]
-// unsigned int _w, // [w7]
-// unsigned int h); // [sp, #0]
-
-function ff_rpi_sand30_lines_to_planar_y16, export=1
- stp x19, x20, [sp, #-48]!
- stp x21, x22, [sp, #16]
- stp x23, x24, [sp, #32]
-
- // w6 = argument h
- ldr w6, [sp, #48]
-
- // slice_inc = ((stride2 - 1) * stride1)
- mov w5, w4
- sub w5, w5, #1
- lsl w5, w5, #7
-
- // total number of bytes per row = (width / 3) * 4
- mov w8, w7
- mov w9, #3
- udiv w8, w8, w9
- lsl w8, w8, #2
-
- // number of full 128 byte blocks to be processed
- mov w9, #96
- udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
-
- // w10 = number of full integers to process (4 bytes)
- // w11 = remaning zero to two 10bit values still to copy over
- mov w12, #96
- mul w12, w9, w12
- sub w12, w7, w12 // width - blocks*96 = remaining points per row
- mov w11, #3
- udiv w10, w12, w11 // full integers to process = w12 / 3
- mul w11, w10, w11 // #integers *3
- sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3
-
- // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
- // this is to efficiently copy incomplete blocks at the end of the rows
- // the last row is handled explicitly to avoid writing out of bounds
- add w22, w10, w11
- cmp w22, #0
- cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
- add w9, w9, w22
- sub w6, w6, #1
-
- // store the number of bytes in w20 which we copy too much for every row
- // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values)
- mov w20, #96*2
- mul w20, w20, w9
- sub w20, w1, w20
-
- mov w23, #0 // flag to check whether the last line had already been processed
-
- // bitmask to clear the uppper 6bits of the result values
- mov x19, #0x03ff03ff03ff03ff
- dup v22.2d, x19
-
- // row counter = 0
- eor w12, w12, w12
-row_loop_y16:
- cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows
- bge row_loop_y16_fin
-
- mov x13, x2 // row src
- eor w14, w14, w14 // full block counter
-block_loop_y16:
- cmp w14, w9
- bge block_loop_y16_fin
-
- // load 64 bytes
- ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
-
- // process v0 and v1
- xtn v16.4h, v0.4s
- ushr v0.4s, v0.4s, #10
- xtn v17.4h, v0.4s
- ushr v0.4s, v0.4s, #10
- xtn v18.4h, v0.4s
-
- xtn2 v16.8h, v1.4s
- and v16.16b, v16.16b, v22.16b
- ushr v1.4s, v1.4s, #10
- xtn2 v17.8h, v1.4s
- and v17.16b, v17.16b, v22.16b
- ushr v1.4s, v1.4s, #10
- xtn2 v18.8h, v1.4s
- and v18.16b, v18.16b, v22.16b
-
- st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
-
- // process v2 and v3
- xtn v23.4h, v2.4s
- ushr v2.4s, v2.4s, #10
- xtn v24.4h, v2.4s
- ushr v2.4s, v2.4s, #10
- xtn v25.4h, v2.4s
-
- xtn2 v23.8h, v3.4s
- and v23.16b, v23.16b, v22.16b
- ushr v3.4s, v3.4s, #10
- xtn2 v24.8h, v3.4s
- and v24.16b, v24.16b, v22.16b
- ushr v3.4s, v3.4s, #10
- xtn2 v25.8h, v3.4s
- and v25.16b, v25.16b, v22.16b
-
- st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-
- // load the second half of the block -> 64 bytes into registers v4-v7
- ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64
-
- // process v4 and v5
- xtn v16.4h, v4.4s
- ushr v4.4s, v4.4s, #10
- xtn v17.4h, v4.4s
- ushr v4.4s, v4.4s, #10
- xtn v18.4h, v4.4s
-
- xtn2 v16.8h, v5.4s
- and v16.16b, v16.16b, v22.16b
- ushr v5.4s, v5.4s, #10
- xtn2 v17.8h, v5.4s
- and v17.16b, v17.16b, v22.16b
- ushr v5.4s, v5.4s, #10
- xtn2 v18.8h, v5.4s
- and v18.16b, v18.16b, v22.16b
-
- st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
-
- // v6 and v7
- xtn v23.4h, v6.4s
- ushr v6.4s, v6.4s, #10
- xtn v24.4h, v6.4s
- ushr v6.4s, v6.4s, #10
- xtn v25.4h, v6.4s
-
- xtn2 v23.8h, v7.4s
- and v23.16b, v23.16b, v22.16b
- ushr v7.4s, v7.4s, #10
- xtn2 v24.8h, v7.4s
- and v24.16b, v24.16b, v22.16b
- ushr v7.4s, v7.4s, #10
- xtn2 v25.8h, v7.4s
- and v25.16b, v25.16b, v22.16b
-
- st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-
- add x13, x13, x5 // row src += slice_inc
- add w14, w14, #1
- b block_loop_y16
-block_loop_y16_fin:
-
-
-
-
- add x2, x2, #128 // src += stride1 (start of the next row)
- add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst
- add w12, w12, #1
- b row_loop_y16
-row_loop_y16_fin:
-
- // check whether we have incomplete blocks at the end of every row
- // in that case decrease row block count by one
- // change height back to it's original value (meaning increase it by 1)
- // and jump back to another iteration of row_loop_y16
-
- cmp w23, #1
- beq row_loop_y16_fin2 // don't continue here if we already processed the last row
- add w6, w6, #1 // increase height to the original value
- sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count
- mov w23, #1
- b row_loop_y16
-row_loop_y16_fin2:
-
- sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference
-
- // now we've got to handle the last block in the last row
- eor w12, w12, w12 // w12 = 0 = counter
-integer_loop_y16:
- cmp w12, w10
- bge integer_loop_y16_fin
- ldr w14, [x13], #4
- and w15, w14, #0x3ff
- strh w15, [x0], #2
- lsr w14, w14, #10
- and w15, w14, #0x3ff
- strh w15, [x0], #2
- lsr w14, w14, #10
- and w15, w14, #0x3ff
- strh w15, [x0], #2
- add w12, w12, #1
- b integer_loop_y16
-integer_loop_y16_fin:
-
-final_values_y16:
- // remaining point count = w11
- ldr w14, [x13], #4
- cmp w11, #0
- beq final_values_y16_fin
- and w15, w14, #0x3ff
- strh w15, [x0], #2
- cmp w11, #1
- beq final_values_y16_fin
- lsr w14, w14, #10
- and w15, w14, #0x3ff
- strh w15, [x0], #2
-final_values_y16_fin:
-
- ldp x23, x24, [sp, #32]
- ldp x21, x22, [sp, #16]
- ldp x19, x20, [sp], #48
- ret
-endfunc
-
//void ff_rpi_sand30_lines_to_planar_c16(
// uint8_t * dst_u, // [x0]
// unsigned int dst_stride_u, // [w1] == _w*2
@@ -674,3 +452,330 @@ endfunc
// unsigned int _w,
// unsigned int h);
+// void ff_rpi_sand30_lines_to_planar_y16(
+// uint8_t * dest, : x0
+// unsigned int dst_stride, : w1
+// const uint8_t * src, : x2
+// unsigned int src_stride1, : w3, always 128
+// unsigned int src_stride2, : w4
+// unsigned int _x, : w5
+// unsigned int y, : w6
+// unsigned int _w, : w7
+// unsigned int h); : [sp, #0]
+//
+// Assumes that we are starting on a stripe boundary and that overreading
+// within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y16, export=1
+ lsl w4, w4, #7
+ sub w4, w4, #64
+ sub w1, w1, w7, lsl #1
+ uxtw x6, w6
+ add x8, x2, x6, lsl #7
+ ldr w6, [sp, #0]
+
+10:
+ mov x2, x8
+ mov w5, w7
+1:
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
+
+ subs w5, w5, #96
+
+ // v0, v1
+
+ shrn v18.4h, v0.4s, #14
+ xtn v16.4h, v0.4s
+ shrn v17.4h, v0.4s, #10
+
+ shrn2 v18.8h, v1.4s, #14
+ xtn2 v16.8h, v1.4s
+ shrn2 v17.8h, v1.4s, #10
+
+ ushr v18.8h, v18.8h, #6
+ bic v16.8h, #0xfc, lsl #8
+ bic v17.8h, #0xfc, lsl #8
+
+ // v2, v3
+
+ shrn v21.4h, v2.4s, #14
+ xtn v19.4h, v2.4s
+ shrn v20.4h, v2.4s, #10
+
+ shrn2 v21.8h, v3.4s, #14
+ xtn2 v19.8h, v3.4s
+ shrn2 v20.8h, v3.4s, #10
+
+ ushr v21.8h, v21.8h, #6
+ bic v19.8h, #0xfc, lsl #8
+ bic v20.8h, #0xfc, lsl #8
+
+ // v4, v5
+
+ shrn v24.4h, v4.4s, #14
+ xtn v22.4h, v4.4s
+ shrn v23.4h, v4.4s, #10
+
+ shrn2 v24.8h, v5.4s, #14
+ xtn2 v22.8h, v5.4s
+ shrn2 v23.8h, v5.4s, #10
+
+ ushr v24.8h, v24.8h, #6
+ bic v22.8h, #0xfc, lsl #8
+ bic v23.8h, #0xfc, lsl #8
+
+ // v6, v7
+
+ shrn v27.4h, v6.4s, #14
+ xtn v25.4h, v6.4s
+ shrn v26.4h, v6.4s, #10
+
+ shrn2 v27.8h, v7.4s, #14
+ xtn2 v25.8h, v7.4s
+ shrn2 v26.8h, v7.4s, #10
+
+ ushr v27.8h, v27.8h, #6
+ bic v25.8h, #0xfc, lsl #8
+ bic v26.8h, #0xfc, lsl #8
+
+ blt 2f
+
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
+ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48
+ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48
+
+ bne 1b
+
+11:
+ subs w6, w6, #1
+ add x0, x0, w1, uxtw
+ add x8, x8, #128
+ bne 10b
+
+ ret
+
+// Partial final write
+2:
+ cmp w5, #48-96
+ blt 1f
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
+ beq 11b
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ sub w5, w5, #48
+ mov v18.16b, v24.16b
+ mov v19.16b, v25.16b
+ mov v20.16b, v26.16b
+ mov v21.16b, v27.16b
+1:
+ cmp w5, #24-96
+ blt 1f
+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
+ beq 11b
+ mov v16.16b, v19.16b
+ mov v17.16b, v20.16b
+ sub w5, w5, #24
+ mov v18.16b, v21.16b
+1:
+ cmp w5, #12-96
+ blt 1f
+ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24
+ beq 11b
+ mov v16.2d[0], v16.2d[1]
+ sub w5, w5, #12
+ mov v17.2d[0], v17.2d[1]
+ mov v18.2d[0], v18.2d[1]
+1:
+ cmp w5, #6-96
+ blt 1f
+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
+ st3 {v16.h, v17.h, v18.h}[1], [x0], #6
+ beq 11b
+ mov v16.2s[0], v16.2s[1]
+ sub w5, w5, #6
+ mov v17.2s[0], v17.2s[1]
+ mov v18.2s[0], v18.2s[1]
+1:
+ cmp w5, #3-96
+ blt 1f
+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
+ beq 11b
+ mov v16.4h[0], v16.4h[1]
+ sub w5, w5, #3
+ mov v17.4h[0], v17.4h[1]
+1:
+ cmp w5, #2-96
+ blt 1f
+ st2 {v16.h, v17.h}[0], [x0], #4
+ b 11b
+1:
+ st1 {v16.h}[0], [x0], #2
+ b 11b
+
+endfunc
+
+// void ff_rpi_sand30_lines_to_planar_y8(
+// uint8_t * dest, : x0
+// unsigned int dst_stride, : w1
+// const uint8_t * src, : x2
+// unsigned int src_stride1, : w3, always 128
+// unsigned int src_stride2, : w4
+// unsigned int _x, : w5
+// unsigned int y, : w6
+// unsigned int _w, : w7
+// unsigned int h); : [sp, #0]
+//
+// Assumes that we are starting on a stripe boundary and that overreading
+// within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y8, export=1
+ lsl w4, w4, #7
+ sub w4, w4, #64
+ sub w1, w1, w7
+ uxtw x6, w6
+ add x8, x2, x6, lsl #7
+ ldr w6, [sp, #0]
+
+10:
+ mov x2, x8
+ mov w5, w7
+1:
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
+
+ subs w5, w5, #96
+
+ // v0, v1
+
+ shrn v18.4h, v0.4s, #16
+ xtn v16.4h, v0.4s
+ shrn v17.4h, v0.4s, #12
+
+ shrn2 v18.8h, v1.4s, #16
+ xtn2 v16.8h, v1.4s
+ shrn2 v17.8h, v1.4s, #12
+
+ shrn v18.8b, v18.8h, #6
+ shrn v16.8b, v16.8h, #2
+ xtn v17.8b, v17.8h
+
+ // v2, v3
+
+ shrn v21.4h, v2.4s, #16
+ xtn v19.4h, v2.4s
+ shrn v20.4h, v2.4s, #12
+
+ shrn2 v21.8h, v3.4s, #16
+ xtn2 v19.8h, v3.4s
+ shrn2 v20.8h, v3.4s, #12
+
+ shrn2 v18.16b, v21.8h, #6
+ shrn2 v16.16b, v19.8h, #2
+ xtn2 v17.16b, v20.8h
+
+ // v4, v5
+
+ shrn v24.4h, v4.4s, #16
+ xtn v22.4h, v4.4s
+ shrn v23.4h, v4.4s, #12
+
+ shrn2 v24.8h, v5.4s, #16
+ xtn2 v22.8h, v5.4s
+ shrn2 v23.8h, v5.4s, #12
+
+ shrn v21.8b, v24.8h, #6
+ shrn v19.8b, v22.8h, #2
+ xtn v20.8b, v23.8h
+
+ // v6, v7
+
+ shrn v27.4h, v6.4s, #16
+ xtn v25.4h, v6.4s
+ shrn v26.4h, v6.4s, #12
+
+ shrn2 v27.8h, v7.4s, #16
+ xtn2 v25.8h, v7.4s
+ shrn2 v26.8h, v7.4s, #12
+
+ shrn2 v21.16b, v27.8h, #6
+ shrn2 v19.16b, v25.8h, #2
+ xtn2 v20.16b, v26.8h
+
+ blt 2f
+
+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
+ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48
+
+ bne 1b
+
+11:
+ subs w6, w6, #1
+ add x0, x0, w1, uxtw
+ add x8, x8, #128
+ bne 10b
+
+ ret
+
+// Partial final write
+2:
+ cmp w5, #48-96
+ blt 1f
+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
+ beq 11b
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ sub w5, w5, #48
+ mov v18.16b, v24.16b
+1:
+ cmp w5, #24-96
+ blt 1f
+ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24
+ beq 11b
+ mov v16.2d[0], v16.2d[1]
+ sub w5, w5, #24
+ mov v17.2d[0], v17.2d[1]
+ mov v18.2d[0], v18.2d[1]
+1:
+ cmp w5, #12-96
+ blt 1f
+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[2], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[3], [x0], #3
+ beq 11b
+ mov v16.2s[0], v16.2s[1]
+ sub w5, w5, #12
+ mov v17.2s[0], v17.2s[1]
+ mov v18.2s[0], v18.2s[1]
+1:
+ cmp w5, #6-96
+ blt 1f
+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
+ beq 11b
+ mov v16.4h[0], v16.4h[1]
+ sub w5, w5, #6
+ mov v17.4h[0], v17.4h[1]
+ mov v18.4h[0], v18.4h[1]
+1:
+ cmp w5, #3-96
+ blt 1f
+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
+ beq 11b
+ mov v16.8b[0], v16.8b[1]
+ sub w5, w5, #3
+ mov v17.8b[0], v17.8b[1]
+1:
+ cmp w5, #2-96
+ blt 1f
+ st2 {v16.b, v17.b}[0], [x0], #2
+ b 11b
+1:
+ st1 {v16.b}[0], [x0], #1
+ b 11b
+
+endfunc
+
diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
index b3aa481ea4..2a56135bc3 100644
--- a/libavutil/aarch64/rpi_sand_neon.h
+++ b/libavutil/aarch64/rpi_sand_neon.h
@@ -49,6 +49,10 @@ void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_
uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
+
#ifdef __cplusplus
}
#endif
diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
index 80890fe985..60e697f681 100644
--- a/libavutil/arm/rpi_sand_neon.S
+++ b/libavutil/arm/rpi_sand_neon.S
@@ -360,7 +360,6 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1
ldr r6, [sp, #36]
ldr r7, [sp, #32] @ y
mov r12, #48
- vmov.u16 q15, #0x3ff
sub r3, #1
lsl r3, #7
sub r1, r1, r6, lsl #1
@@ -376,37 +375,33 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1
vldm r2!, {q10-q13}
add lr, #64
- vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20!
+ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20!
ands lr, #127
vshrn.u32 d2, q10, #10
vmovn.u32 d0, q10
- vmovn.u32 d4, q14
- vshr.u32 q14, q11, #20
+ vshrn.u32 d5, q11, #14
it eq
addeq r2, r3
vshrn.u32 d3, q11, #10
vmovn.u32 d1, q11
- vmovn.u32 d5, q14
subs r5, #48
- vand q0, q15
- vand q1, q15
- vand q2, q15
+ vshr.u16 q2, #6
+ vbic.u16 q0, #0xfc00
+ vbic.u16 q1, #0xfc00
- vshr.u32 q14, q12, #20
+ vshrn.u32 d20, q12, #14
vshrn.u32 d18, q12, #10
vmovn.u32 d16, q12
- vmovn.u32 d20, q14
- vshr.u32 q14, q13, #20
+ vshrn.u32 d21, q13, #14
vshrn.u32 d19, q13, #10
vmovn.u32 d17, q13
- vmovn.u32 d21, q14
- vand q8, q15
- vand q9, q15
- vand q10, q15
+ vshr.u16 q10, #6
+ vbic.u16 q8, #0xfc00
+ vbic.u16 q9 , #0xfc00
blt 2f
vst3.16 {d0, d2, d4}, [r0], r12
@@ -499,7 +494,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
ldr r7, [sp, #48]
ldr r9, [sp, #52]
mov r12, #48
- vmov.u16 q15, #0x3ff
sub r8, #1
lsl r8, #7
add r5, r5, r7, lsl #7
@@ -515,48 +509,44 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
add lr, #64
@ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
- vshr.u32 q14, q0, #20
- vshrn.u32 d16, q0, #10
+ vshrn.u32 d20, q0, #14
vmovn.u32 d18, q0
+ vshrn.u32 d0, q0, #10
ands lr, #127
- vmovn.u32 d20, q14
- vshr.u32 q14, q1, #20
- vshrn.u32 d17, q1, #10
+ vshrn.u32 d21, q1, #14
vmovn.u32 d19, q1
- vmovn.u32 d21, q14
+ vshrn.u32 d1, q1, #10
- vshr.u32 q14, q2, #20
vshrn.u32 d22, q2, #10
- vmovn.u32 d24, q2
- vmovn.u32 d26, q14
+ vmovn.u32 d2, q2
+ vshrn.u32 d4, q2, #14
- vshr.u32 q14, q3, #20
- vshrn.u32 d23, q3, #10
- vmovn.u32 d25, q3
add r10, r0, #24
- vmovn.u32 d27, q14
+ vshrn.u32 d23, q3, #10
+ vmovn.u32 d3, q3
+ vshrn.u32 d5, q3, #14
it eq
addeq r4, r8
- vuzp.16 q8, q11
- vuzp.16 q9, q12
- vuzp.16 q10, q13
+ vuzp.16 q0, q11
+ vuzp.16 q9, q1
+ vuzp.16 q10, q2
- @ q8 V0, V3,.. -> q0
+ @ q0 V0, V3,..
@ q9 U0, U3...
@ q10 U1, U4...
@ q11 U2, U5,..
- @ q12 V1, V4,.. -> q1
- @ q13 V2, V5,.. -> q2
+ @ q1 V1, V4,
+ @ q2 V2, V5,..
subs r6, #24
- vand q11, q15
- vand q9, q15
- vand q10, q15
- vand q0, q8, q15
- vand q1, q12, q15
- vand q2, q13, q15
+ vbic.u16 q11, #0xfc00
+ vbic.u16 q9, #0xfc00
+ vshr.u16 q10, #6
+ vshr.u16 q2, #6
+ vbic.u16 q0, #0xfc00
+ vbic.u16 q1, #0xfc00
blt 2f
@@ -765,4 +755,171 @@ function ff_rpi_sand30_lines_to_planar_p010, export=1
endfunc
+@ void ff_rpi_sand30_lines_to_planar_y8(
+@ uint8_t * dest, // [r0]
+@ unsigned int dst_stride, // [r1]
+@ const uint8_t * src, // [r2]
+@ unsigned int src_stride1, // [r3] Ignored - assumed 128
+@ unsigned int src_stride2, // [sp, #0] -> r3
+@ unsigned int _x, // [sp, #4] Ignored - 0
+@ unsigned int y, // [sp, #8] (r7 in prefix)
+@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+@ unsigned int h); // [sp, #16] -> r7
+@
+@ Assumes that we are starting on a stripe boundary and that overreading
+@ within the stripe is OK. However it does respect the dest size for writing
+
+function ff_rpi_sand30_lines_to_planar_y8, export=1
+ push {r4-r8, lr} @ +24
+ ldr r3, [sp, #24]
+ ldr r6, [sp, #36]
+ ldr r7, [sp, #32] @ y
+ mov r12, #48
+ lsl r3, #7
+ sub r1, r1, r6
+ add r8, r2, r7, lsl #7
+ ldr r7, [sp, #40]
+
+10:
+ mov r2, r8
+ add r4, r0, #24
+ mov r5, r6
+1:
+ vldm r2, {q8-q15}
+
+ subs r5, #96
+
+ vmovn.u32 d0, q8
+ vshrn.u32 d2, q8, #12
+ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20!
+
+ add r2, r3
+
+ vmovn.u32 d1, q9
+ vshrn.u32 d3, q9, #12
+ vshrn.u32 d5, q9, #16
+
+ pld [r2, #0]
+
+ vshrn.u16 d0, q0, #2
+ vmovn.u16 d1, q1
+ vshrn.u16 d2, q2, #6
+
+ vmovn.u32 d16, q10
+ vshrn.u32 d18, q10, #12
+ vshrn.u32 d20, q10, #16
+
+ vmovn.u32 d17, q11
+ vshrn.u32 d19, q11, #12
+ vshrn.u32 d21, q11, #16
+
+ pld [r2, #64]
+
+ vshrn.u16 d4, q8, #2
+ vmovn.u16 d5, q9
+ vshrn.u16 d6, q10, #6
+
+ vmovn.u32 d16, q12
+ vshrn.u32 d18, q12, #12
+ vshrn.u32 d20, q12, #16
+
+ vmovn.u32 d17, q13
+ vshrn.u32 d19, q13, #12
+ vshrn.u32 d21, q13, #16
+
+ vshrn.u16 d16, q8, #2
+ vmovn.u16 d17, q9
+ vshrn.u16 d18, q10, #6
+
+ vmovn.u32 d20, q14
+ vshrn.u32 d22, q14, #12
+ vshrn.u32 d24, q14, #16
+
+ vmovn.u32 d21, q15
+ vshrn.u32 d23, q15, #12
+ vshrn.u32 d25, q15, #16
+
+ vshrn.u16 d20, q10, #2
+ vmovn.u16 d21, q11
+ vshrn.u16 d22, q12, #6
+
+ blt 2f
+
+ vst3.8 {d0, d1, d2}, [r0], r12
+ vst3.8 {d4, d5, d6}, [r4], r12
+ vst3.8 {d16, d17, d18}, [r0], r12
+ vst3.8 {d20, d21, d22}, [r4], r12
+
+ bne 1b
+
+11:
+ subs r7, #1
+ add r0, r1
+ add r8, #128
+ bne 10b
+
+ pop {r4-r8, pc}
+
+@ Partial final write
+2:
+ cmp r5, #48-96
+ blt 1f
+ vst3.8 {d0, d1, d2}, [r0], r12
+ vst3.8 {d4, d5, d6}, [r4], r12
+ beq 11b
+ vmov q0, q8
+ vmov q2, q10
+ sub r5, #48
+ vmov d2, d18
+ vmov d6, d22
+1:
+ cmp r5, #24-96
+ blt 1f
+ vst3.8 {d0, d1, d2}, [r0]!
+ beq 11b
+ vmov q0, q2
+ sub r5, #24
+ vmov d2, d6
+1:
+ cmp r5, #12-96
+ blt 1f
+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
+ vst3.8 {d0[2], d1[2], d2[2]}, [r0]!
+ vst3.8 {d0[3], d1[3], d2[3]}, [r0]!
+ beq 11b
+ vmov s0, s1
+ sub r5, #12
+ vmov s2, s3
+ vmov s4, s5
+1:
+ cmp r5, #6-96
+ blt 1f
+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
+ add r0, #12
+ beq 11b
+ vshr.u32 d0, #16
+ sub r5, #6
+ vshr.u32 d1, #16
+ vshr.u32 d2, #16
+1:
+ cmp r5, #3-96
+ blt 1f
+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
+ beq 11b
+ sub r5, #3
+ vshr.u32 d0, #8
+ vshr.u32 d1, #8
+1:
+ cmp r5, #2-96
+ blt 1f
+ vst2.8 {d0[0], d1[0]}, [r0]!
+ b 11b
+1:
+ vst1.8 {d0[0]}, [r0]!
+ b 11b
+
+endfunc
+
diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
index 447f367bea..d457c10870 100644
--- a/libavutil/arm/rpi_sand_neon.h
+++ b/libavutil/arm/rpi_sand_neon.h
@@ -95,5 +95,16 @@ void ff_rpi_sand30_lines_to_planar_p010(
unsigned int _w, // [sp, #12] -> r6 (cur r5)
unsigned int h); // [sp, #16] -> r7
+void ff_rpi_sand30_lines_to_planar_y8(
+ uint8_t * dest, // [r0]
+ unsigned int dst_stride, // [r1]
+ const uint8_t * src, // [r2]
+ unsigned int src_stride1, // [r3] Ignored - assumed 128
+ unsigned int src_stride2, // [sp, #0] -> r3
+ unsigned int _x, // [sp, #4] Ignored - 0
+ unsigned int y, // [sp, #8] (r7 in prefix)
+ unsigned int _w, // [sp, #12] -> r6 (cur r5)
+ unsigned int h); // [sp, #16] -> r7
+
#endif // AVUTIL_ARM_SAND_NEON_H
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
index 256c3d532f..b6071e2928 100644
--- a/libavutil/rpi_sand_fns.c
+++ b/libavutil/rpi_sand_fns.c
@@ -247,7 +247,7 @@ void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
-#if HAVE_SAND_ASM && 0
+#if HAVE_SAND_ASM
if (_x == 0) {
ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
return;
From 24c3eef4487a36d5189ecd934b65a7c6a0b53d03 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 7 Jun 2022 14:46:12 +0000
Subject: [PATCH 054/161] v4l2_m2m_enc: Add the ability to encode DRM_PRIME
frames
---
libavcodec/v4l2_buffers.c | 100 +++++++++++---
libavcodec/v4l2_buffers.h | 20 ++-
libavcodec/v4l2_context.c | 212 +++++++++++++++++++++++++---
libavcodec/v4l2_context.h | 15 +-
libavcodec/v4l2_m2m.c | 37 +++--
libavcodec/v4l2_m2m.h | 3 +
libavcodec/v4l2_m2m_dec.c | 171 ++++++-----------------
libavcodec/v4l2_m2m_enc.c | 283 +++++++++++++++++++++++++++++++++++++-
8 files changed, 643 insertions(+), 198 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 8c4f18dbed..9ef2f40e39 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -29,6 +29,8 @@
#include <fcntl.h>
#include <poll.h>
#include "libavcodec/avcodec.h"
+#include "libavcodec/internal.h"
+#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
#include "libavutil/hwcontext.h"
#include "v4l2_context.h"
@@ -60,27 +62,39 @@ static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
return tb.num && tb.den ? tb : v4l2_timebase;
}
+static inline struct timeval tv_from_int(const int64_t t)
+{
+ return (struct timeval){
+ .tv_usec = t % USEC_PER_SEC,
+ .tv_sec = t / USEC_PER_SEC
+ };
+}
+
+static inline int64_t int_from_tv(const struct timeval t)
+{
+ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
+}
+
static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
{
/* convert pts to v4l2 timebase */
const int64_t v4l2_pts =
- out->context->no_pts_rescale ? pts :
pts == AV_NOPTS_VALUE ? 0 :
av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
+ out->buf.timestamp = tv_from_int(v4l2_pts);
}
static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
{
+ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
+ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
+#if 0
/* convert pts back to encoder timebase */
- const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
- avbuf->buf.timestamp.tv_usec;
-
return
avbuf->context->no_pts_rescale ? v4l2_pts :
v4l2_pts == 0 ? AV_NOPTS_VALUE :
av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
+#endif
}
static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
@@ -435,7 +449,7 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data)
ff_mutex_lock(&ctx->lock);
- avbuf->status = V4L2BUF_AVAILABLE;
+ ff_v4l2_buffer_set_avail(avbuf);
if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
@@ -599,6 +613,38 @@ static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
}
+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+{
+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
+
+ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
+ return AVERROR(EINVAL);
+
+ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+ // Only currently cope with single buffer types
+ if (out->buf.length != 1)
+ return AVERROR_PATCHWELCOME;
+ if (src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ out->planes[0].m.fd = src->objects[0].fd;
+ }
+ else {
+ if (src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ out->buf.m.fd = src->objects[0].fd;
+ }
+
+ // No need to copy src AVDescriptor and if we did then we may confuse
+ // fd close on free
+ out->ref_buf = av_buffer_ref(frame->buf[0]);
+
+ return 0;
+}
+
static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
{
int i;
@@ -678,7 +724,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
*
******************************************************************************/
-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
{
out->buf.flags = frame->key_frame ?
(out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
@@ -688,10 +734,15 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
v4l2_set_color_range(out, frame->color_range);
// PTS & interlace are buffer vars
- v4l2_set_pts(out, frame->pts);
+ if (track_ts)
+ out->buf.timestamp = tv_from_int(track_ts);
+ else
+ v4l2_set_pts(out, frame->pts);
v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
- return v4l2_buffer_swframe_to_buf(frame, out);
+ return frame->format == AV_PIX_FMT_DRM_PRIME ?
+ v4l2_buffer_primeframe_to_buf(frame, out) :
+ v4l2_buffer_swframe_to_buf(frame, out);
}
int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
@@ -754,6 +805,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
+ pkt->flags = 0;
if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
pkt->flags |= AV_PKT_FLAG_KEY;
@@ -768,8 +820,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
return 0;
}
-int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- const void *extdata, size_t extlen)
+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
+ const void *extdata, size_t extlen,
+ const int64_t timestamp)
{
int ret;
@@ -783,7 +836,10 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
if (ret && ret != AVERROR(ENOMEM))
return ret;
- v4l2_set_pts(out, pkt->pts);
+ if (timestamp)
+ out->buf.timestamp = tv_from_int(timestamp);
+ else
+ v4l2_set_pts(out, pkt->pts);
out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
(out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
@@ -794,7 +850,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
{
- return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0);
+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
}
@@ -814,13 +870,15 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
close(avbuf->drm_frame.objects[i].fd);
}
+ av_buffer_unref(&avbuf->ref_buf);
+
ff_weak_link_unref(&avbuf->context_wl);
av_free(avbuf);
}
-int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx)
+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
{
int ret, i;
V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
@@ -837,7 +895,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
}
avbuf->context = ctx;
- avbuf->buf.memory = V4L2_MEMORY_MMAP;
+ avbuf->buf.memory = mem;
avbuf->buf.type = ctx->type;
avbuf->buf.index = index;
@@ -867,6 +925,8 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
avbuf->num_planes = 1;
for (i = 0; i < avbuf->num_planes; i++) {
+ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
+ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
@@ -875,21 +935,17 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
- if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
- !buf_to_m2mctx(avbuf)->output_drm) {
+ if (want_mmap)
avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
PROT_READ | PROT_WRITE, MAP_SHARED,
buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
- }
} else {
avbuf->plane_info[i].length = avbuf->buf.length;
- if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
- !buf_to_m2mctx(avbuf)->output_drm) {
+ if (want_mmap)
avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
PROT_READ | PROT_WRITE, MAP_SHARED,
buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
- }
}
if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index 3b7ca4d99e..1ac32c5989 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -59,6 +59,10 @@ typedef struct V4L2Buffer {
/* DRM descriptor */
AVDRMFrameDescriptor drm_frame;
+ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
+ * are done
+ */
+ AVBufferRef * ref_buf;
/* keep track of the mmap address and mmap length */
struct V4L2Plane_info {
@@ -110,8 +114,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
*/
int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
-int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- const void *extdata, size_t extlen);
+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
+ const void *extdata, size_t extlen,
+ const int64_t timestamp);
/**
* Extracts the data from an AVFrame to a V4L2Buffer
@@ -121,7 +126,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
*
* @returns 0 in case of success, a negative AVERROR code otherwise
*/
-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
/**
* Initializes a V4L2Buffer
@@ -131,7 +136,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
*
* @returns 0 in case of success, a negative AVERROR code otherwise
*/
-int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx);
+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
/**
* Enqueues a V4L2Buffer
@@ -142,5 +147,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context
*/
int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+static inline void
+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
+{
+ avbuf->status = V4L2BUF_AVAILABLE;
+ av_buffer_unref(&avbuf->ref_buf);
+}
+
#endif // AVCODEC_V4L2_BUFFERS_H
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index b3662aedaa..7a707d21fc 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -43,6 +43,160 @@ struct v4l2_format_update {
int update_avfmt;
};
+
+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+{
+ return (int64_t)n;
+}
+
+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+{
+ return (unsigned int)pts;
+}
+
+// FFmpeg requires us to propagate a number of vars from the coded pkt into
+// the decoded frame. The only thing that tracks like that in V4L2 stateful
+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
+// guarantees about PTS being unique or specified for every frame so replace
+// the supplied PTS with a simple incrementing number and keep a circular
+// buffer of all the things we want preserved (including the original PTS)
+// indexed by the tracking no.
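+// The table (track_els) holds FF_V4L2_M2M_TRACK_SIZE entries and is indexed
+// modulo that size, so a slot can be overwritten if more than that many
+// packets/frames are in flight at once; the *_out functions detect this by
+// comparing the stored track_pts and fall back to best-effort values.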
+static int64_t
+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
+{
+ int64_t track_pts;
+
+ // Avoid 0
+ if (++x->track_no == 0)
+ x->track_no = 1;
+
+ track_pts = track_to_pts(avctx, x->track_no);
+
+ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
+ x->last_pkt_dts = avpkt->dts;
+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
+ .discard = 0,
+ .pending = 1,
+ .pkt_size = avpkt->size,
+ .pts = avpkt->pts,
+ .dts = avpkt->dts,
+ .reordered_opaque = avctx->reordered_opaque,
+ .pkt_pos = avpkt->pos,
+ .pkt_duration = avpkt->duration,
+ .track_pts = track_pts
+ };
+ return track_pts;
+}
+
+static int64_t
+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
+{
+ int64_t track_pts;
+
+ // Avoid 0
+ if (++x->track_no == 0)
+ x->track_no = 1;
+
+ track_pts = track_to_pts(avctx, x->track_no);
+
+ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
+ x->last_pkt_dts = frame->pkt_dts;
+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
+ .discard = 0,
+ .pending = 1,
+ .pkt_size = 0,
+ .pts = frame->pts,
+ .dts = AV_NOPTS_VALUE,
+ .reordered_opaque = frame->reordered_opaque,
+ .pkt_pos = frame->pkt_pos,
+ .pkt_duration = frame->pkt_duration,
+ .track_pts = track_pts
+ };
+ return track_pts;
+}
+
+
+// Returns -1 if we should discard the frame
+static int
+xlat_pts_frame_out(AVCodecContext *const avctx,
+ xlat_track_t * const x,
+ AVFrame *const frame)
+{
+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
+ V4L2m2mTrackEl *const t = x->track_els + n;
+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
+ {
+ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
+ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
+ frame->pts = AV_NOPTS_VALUE;
+ frame->pkt_dts = x->last_pkt_dts;
+ frame->reordered_opaque = x->last_opaque;
+ frame->pkt_pos = -1;
+ frame->pkt_duration = 0;
+ frame->pkt_size = -1;
+ }
+ else if (!t->discard)
+ {
+ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
+ frame->pkt_dts = x->last_pkt_dts;
+ frame->reordered_opaque = t->reordered_opaque;
+ frame->pkt_pos = t->pkt_pos;
+ frame->pkt_duration = t->pkt_duration;
+ frame->pkt_size = t->pkt_size;
+
+ x->last_opaque = x->track_els[n].reordered_opaque;
+ if (frame->pts != AV_NOPTS_VALUE)
+ x->last_pts = frame->pts;
+ t->pending = 0;
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
+ return -1;
+ }
+
+ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
+ return 0;
+}
+
+// Returns -1 if we should discard the frame
+static int
+xlat_pts_pkt_out(AVCodecContext *const avctx,
+ xlat_track_t * const x,
+ AVPacket *const pkt)
+{
+ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
+ V4L2m2mTrackEl *const t = x->track_els + n;
+ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
+ {
+ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
+ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
+ pkt->pts = AV_NOPTS_VALUE;
+ }
+ else if (!t->discard)
+ {
+ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
+
+ x->last_opaque = x->track_els[n].reordered_opaque;
+ if (pkt->pts != AV_NOPTS_VALUE)
+ x->last_pts = pkt->pts;
+ t->pending = 0;
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
+ return -1;
+ }
+
+ // * Would like something much better than this...xlat(offset + out_count)?
+ pkt->dts = pkt->pts;
+ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
+ pkt->pts, t->track_pts, n);
+ return 0;
+}
+
+
static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
{
return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
@@ -353,12 +507,14 @@ dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
atomic_fetch_sub(&ctx->q_count, 1);
avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
- avbuf->status = V4L2BUF_AVAILABLE;
+ ff_v4l2_buffer_set_avail(avbuf);
avbuf->buf = buf;
if (is_mp) {
memcpy(avbuf->planes, planes, sizeof(planes));
avbuf->buf.m.planes = avbuf->planes;
}
+ // Done with any attached buffer
+ av_buffer_unref(&avbuf->ref_buf);
if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
// Zero length cap buffer return == EOS
@@ -733,7 +889,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx)
for (i = 0; i < ctx->num_buffers; ++i) {
struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
if (buf->status == V4L2BUF_IN_DRIVER)
- buf->status = V4L2BUF_AVAILABLE;
+ ff_v4l2_buffer_set_avail(buf);
}
atomic_store(&ctx->q_count, 0);
}
@@ -787,6 +943,8 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
{
if (cmd == VIDIOC_STREAMOFF)
flush_all_buffers_status(ctx);
+ else
+ ctx->first_buf = 1;
ctx->streamon = (cmd == VIDIOC_STREAMON);
av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
@@ -803,14 +961,16 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
{
- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
+ int64_t track_ts;
V4L2Buffer* avbuf;
int ret;
if (!frame) {
ret = v4l2_stop_encode(ctx);
if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
s->draining= 1;
return 0;
}
@@ -819,7 +979,9 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
if (!avbuf)
return AVERROR(EAGAIN);
- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
+ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
+
+ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
if (ret)
return ret;
@@ -830,14 +992,16 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
const void * extdata, size_t extlen)
{
V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
V4L2Buffer* avbuf;
int ret;
+ int64_t track_ts;
if (!pkt->size) {
ret = v4l2_stop_decode(ctx);
// Log but otherwise ignore stop failure
if (ret)
- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
+ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
s->draining = 1;
return 0;
}
@@ -846,7 +1010,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
if (!avbuf)
return AVERROR(EAGAIN);
- ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen);
+ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
+
+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
if (ret == AVERROR(ENOMEM))
av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
__func__, pkt->size, avbuf->planes[0].length);
@@ -858,24 +1024,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
{
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
V4L2Buffer *avbuf;
int rv;
- if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
- return rv;
+ do {
+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
+ return rv;
+ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
+ return rv;
+ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
+ return 0;
}
int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
{
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+ AVCodecContext *const avctx = s->avctx;
V4L2Buffer *avbuf;
int rv;
- if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
- return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
+ do {
+ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
+ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
+ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
+ return rv;
+ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
+ return 0;
}
int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
@@ -951,7 +1129,7 @@ void ff_v4l2_context_release(V4L2Context* ctx)
}
-static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers)
+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
{
V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
struct v4l2_requestbuffers req;
@@ -962,7 +1140,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
memset(&req, 0, sizeof(req));
req.count = req_buffers;
- req.memory = V4L2_MEMORY_MMAP;
+ req.memory = mem;
req.type = ctx->type;
while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
if (errno != EINTR) {
@@ -986,7 +1164,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
}
for (i = 0; i < ctx->num_buffers; i++) {
- ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx);
+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
if (ret) {
av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
goto fail_release;
@@ -1052,7 +1230,7 @@ int ff_v4l2_context_init(V4L2Context* ctx)
goto fail_unref_hwframes;
}
- ret = create_buffers(ctx, ctx->num_buffers);
+ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
if (ret < 0)
goto fail_unref_hwframes;
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 0efff58f18..21265f1bd7 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -91,11 +91,19 @@ typedef struct V4L2Context {
*/
int num_buffers;
+ /**
+ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
+ */
+ enum v4l2_memory buf_mem;
+
/**
* Whether the stream has been started (VIDIOC_STREAMON has been sent).
*/
int streamon;
+ /* 1st buffer after stream on */
+ int first_buf;
+
/**
* Either no more buffers available or an unrecoverable error was notified
* by the V4L2 kernel driver: once set the context has to be exited.
@@ -105,11 +113,10 @@ typedef struct V4L2Context {
int flag_last;
/**
- * PTS rescale not wanted
- * If the PTS is just a dummy frame count then rescale is
- * actively harmful
+ * If NZ then when Qing frame/pkt use this rather than the
+ * "real" PTS
*/
- int no_pts_rescale;
+ uint64_t track_ts;
AVBufferRef *frames_ref;
atomic_int q_count;
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index 6dd01e2e00..1e30d15fd8 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -35,6 +35,14 @@
#include "v4l2_fmt.h"
#include "v4l2_m2m.h"
+static void
+xlat_init(xlat_track_t * const x)
+{
+ memset(x, 0, sizeof(*x));
+ x->last_pts = AV_NOPTS_VALUE;
+}
+
+
static inline int v4l2_splane_video(struct v4l2_capability *cap)
{
if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
@@ -67,7 +75,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
s->capture.done = s->output.done = 0;
s->capture.name = "capture";
+ s->capture.buf_mem = V4L2_MEMORY_MMAP;
s->output.name = "output";
+ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
atomic_init(&s->refcount, 0);
sem_init(&s->refsync, 0, 0);
@@ -334,35 +344,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv)
return v4l2_configure_contexts(s);
}
-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
{
- *s = av_mallocz(sizeof(V4L2m2mContext));
- if (!*s)
+ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
+
+ *pps = NULL;
+ if (!s)
return AVERROR(ENOMEM);
- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
+ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
&v4l2_m2m_destroy_context, NULL, 0);
if (!priv->context_ref) {
- av_freep(s);
+ av_free(s);
return AVERROR(ENOMEM);
}
/* assign the context */
- priv->context = *s;
- (*s)->priv = priv;
+ priv->context = s;
+ s->priv = priv;
/* populate it */
- priv->context->capture.num_buffers = priv->num_capture_buffers;
- priv->context->output.num_buffers = priv->num_output_buffers;
- priv->context->self_ref = priv->context_ref;
- priv->context->fd = -1;
+ s->capture.num_buffers = priv->num_capture_buffers;
+ s->output.num_buffers = priv->num_output_buffers;
+ s->self_ref = priv->context_ref;
+ s->fd = -1;
+ xlat_init(&s->xlat);
priv->context->frame = av_frame_alloc();
if (!priv->context->frame) {
av_buffer_unref(&priv->context_ref);
- *s = NULL; /* freed when unreferencing context_ref */
return AVERROR(ENOMEM);
}
+ *pps = s;
return 0;
}
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 19d618698d..d6cdaf65e1 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -103,6 +103,9 @@ typedef struct V4L2m2mContext {
/* generate DRM frames */
int output_drm;
+ /* input frames are drmprime */
+ int input_drm;
+
/* Frame tracking */
xlat_track_t xlat;
int pending_hw;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 7e17044706..fbbfc81342 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -169,96 +169,17 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
return 0;
}
-static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
-{
- return (int64_t)n;
-}
-
-static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
-{
- return (unsigned int)pts;
-}
-
-// FFmpeg requires us to propagate a number of vars from the coded pkt into
-// the decoded frame. The only thing that tracks like that in V4L2 stateful
-// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
-// guarantees about PTS being unique or specified for every frame so replace
-// the supplied PTS with a simple incrementing number and keep a circular
-// buffer of all the things we want preserved (including the original PTS)
-// indexed by the tracking no.
static void
-xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt)
-{
- int64_t track_pts;
-
- // Avoid 0
- if (++x->track_no == 0)
- x->track_no = 1;
-
- track_pts = track_to_pts(avctx, x->track_no);
-
- av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
- x->last_pkt_dts = avpkt->dts;
- x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
- .discard = 0,
- .pending = 1,
- .pkt_size = avpkt->size,
- .pts = avpkt->pts,
- .dts = avpkt->dts,
- .reordered_opaque = avctx->reordered_opaque,
- .pkt_pos = avpkt->pos,
- .pkt_duration = avpkt->duration,
- .track_pts = track_pts
- };
- avpkt->pts = track_pts;
-}
-
-// Returns -1 if we should discard the frame
-static int
-xlat_pts_out(AVCodecContext *const avctx,
- xlat_track_t * const x,
+set_best_effort_pts(AVCodecContext *const avctx,
pts_stats_t * const ps,
AVFrame *const frame)
{
- unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
- V4L2m2mTrackEl *const t = x->track_els + n;
- if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
- {
- av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
- frame->pts = AV_NOPTS_VALUE;
- frame->pkt_dts = x->last_pkt_dts;
- frame->reordered_opaque = x->last_opaque;
- frame->pkt_pos = -1;
- frame->pkt_duration = 0;
- frame->pkt_size = -1;
- }
- else if (!t->discard)
- {
- frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
- frame->pkt_dts = x->last_pkt_dts;
- frame->reordered_opaque = t->reordered_opaque;
- frame->pkt_pos = t->pkt_pos;
- frame->pkt_duration = t->pkt_duration;
- frame->pkt_size = t->pkt_size;
-
- x->last_opaque = x->track_els[n].reordered_opaque;
- if (frame->pts != AV_NOPTS_VALUE)
- x->last_pts = frame->pts;
- t->pending = 0;
- }
- else
- {
- av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
- return -1;
- }
-
pts_stats_add(ps, frame->pts);
frame->best_effort_timestamp = pts_stats_guess(ps);
frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner?
- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
- frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
- return 0;
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
}
static void
@@ -272,13 +193,6 @@ xlat_flush(xlat_track_t * const x)
x->last_pts = AV_NOPTS_VALUE;
}
-static void
-xlat_init(xlat_track_t * const x)
-{
- memset(x, 0, sizeof(*x));
- x->last_pts = AV_NOPTS_VALUE;
-}
-
static int
xlat_pending(const xlat_track_t * const x)
{
@@ -419,8 +333,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
return ret;
}
-
- xlat_pts_in(avctx, &s->xlat, &s->buf_pkt);
}
if (s->draining) {
@@ -542,49 +454,47 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
prefer_dq ? 5 :
src_rv == NQ_Q_FULL ? -1 : 0;
- do {
- // Dequeue frame will unref any previous contents of frame
- // if it returns success so we don't need an explicit unref
- // when discarding
- // This returns AVERROR(EAGAIN) on timeout or if
- // there is room in the input Q and timeout == -1
- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
-
- // Failure due to no buffer in Q?
- if (dst_rv == AVERROR(ENOSPC)) {
- // Wait & retry
- if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
- }
+ // Dequeue frame will unref any previous contents of frame
+ // if it returns success so we don't need an explicit unref
+ // when discarding
+ // This returns AVERROR(EAGAIN) on timeout or if
+ // there is room in the input Q and timeout == -1
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
+
+ // Failure due to no buffer in Q?
+ if (dst_rv == AVERROR(ENOSPC)) {
+ // Wait & retry
+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
}
+ }
+
+ // Adjust dynamic pending threshold
+ if (dst_rv == 0) {
+ if (--s->pending_hw < PENDING_HW_MIN)
+ s->pending_hw = PENDING_HW_MIN;
+ s->pending_n = 0;
- // Adjust dynamic pending threshold
- if (dst_rv == 0) {
- if (--s->pending_hw < PENDING_HW_MIN)
- s->pending_hw = PENDING_HW_MIN;
+ set_best_effort_pts(avctx, &s->pts_stat, frame);
+ }
+ else if (dst_rv == AVERROR(EAGAIN)) {
+ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
+ s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
s->pending_n = 0;
}
- else if (dst_rv == AVERROR(EAGAIN)) {
- if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
- s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
- s->pending_n = 0;
- }
- }
+ }
- if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
- av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
- dst_rv = AVERROR_EOF;
- s->capture.done = 1;
- }
- else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
- av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
- s->draining, s->capture.done);
- else if (dst_rv && dst_rv != AVERROR(EAGAIN))
- av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
- s->draining, s->capture.done, dst_rv);
-
- // Go again if we got a frame that we need to discard
- } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame));
+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
+ dst_rv = AVERROR_EOF;
+ s->capture.done = 1;
+ }
+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
+ s->draining, s->capture.done);
+ else if (dst_rv && dst_rv != AVERROR(EAGAIN))
+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
+ s->draining, s->capture.done, dst_rv);
}
++i;
@@ -791,7 +701,6 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
if (ret < 0)
return ret;
- xlat_init(&s->xlat);
pts_stats_init(&s->pts_stat, avctx, "decoder");
s->pending_hw = PENDING_HW_MIN;
@@ -810,12 +719,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
output->av_codec_id = avctx->codec_id;
output->av_pix_fmt = AV_PIX_FMT_NONE;
output->min_buf_size = max_coded_size(avctx);
- output->no_pts_rescale = 1;
capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
capture->av_pix_fmt = avctx->pix_fmt;
capture->min_buf_size = 0;
- capture->no_pts_rescale = 1;
/* the client requests the codec to generate DRM frames:
* - data[0] will therefore point to the returned AVDRMFrameDescriptor
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index 9a0837ecf3..05ff6ba726 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -24,6 +24,8 @@
#include <linux/videodev2.h>
#include <sys/ioctl.h>
#include <search.h>
+#include <drm_fourcc.h>
+
#include "encode.h"
#include "libavcodec/avcodec.h"
#include "libavutil/pixdesc.h"
@@ -38,6 +40,34 @@
#define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
#define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
+// in the future but until then...
+#ifndef DRM_FORMAT_P030
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
+#endif
+
+#ifndef DRM_FORMAT_NV15
+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
+#endif
+
+#ifndef DRM_FORMAT_NV20
+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
+#endif
+
+#ifndef V4L2_CID_CODEC_BASE
+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
+#endif
+
+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
+// in videodev2.h and hopefully will be sometime in the future but until then...
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
+#endif
+
static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
{
struct v4l2_streamparm parm = { 0 };
@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p)
static int v4l2_check_b_frame_support(V4L2m2mContext *s)
{
if (s->avctx->max_b_frames)
- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
+ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
+ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
if (s->avctx->max_b_frames == 0)
return 0;
avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
-
return AVERROR_PATCHWELCOME;
}
@@ -271,13 +300,184 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s)
return 0;
}
+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
+{
+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
+
+ const uint32_t drm_fmt = src->layers[0].format;
+ // Treat INVALID as LINEAR
+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
+ uint32_t pix_fmt = 0;
+ uint32_t w = 0;
+ uint32_t h = 0;
+ uint32_t bpl = src->layers[0].planes[0].pitch;
+
+ // We really don't expect multiple layers
+ // All formats that we currently cope with are single object
+
+ if (src->nb_layers != 1 || src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ switch (drm_fmt) {
+ case DRM_FORMAT_YUV420:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 3)
+ break;
+ pix_fmt = V4L2_PIX_FMT_YUV420;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ break;
+
+ case DRM_FORMAT_NV12:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
+ w = bpl;
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+ break;
+
+ case DRM_FORMAT_P030:
+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
+ w = bpl / 2; // Matching lie to how we construct this
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (!pix_fmt)
+ return AVERROR(EINVAL);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->plane_fmt[0].bytesperline = bpl;
+ pix->num_planes = 1;
+ }
+ else {
+ struct v4l2_pix_format *const pix = &format->fmt.pix;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->bytesperline = bpl;
+ }
+
+ return 0;
+}
+
+// Do we have similar enough formats to be usable?
+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
+{
+ if (a->type != b->type)
+ return 0;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
+ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
+ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
+ unsigned int i;
+ if (pa->pixelformat != pb->pixelformat ||
+ pa->num_planes != pb->num_planes)
+ return 0;
+ for (i = 0; i != pa->num_planes; ++i) {
+ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
+ return 0;
+ }
+ }
+ else {
+ const struct v4l2_pix_format *const pa = &a->fmt.pix;
+ const struct v4l2_pix_format *const pb = &b->fmt.pix;
+ if (pa->pixelformat != pb->pixelformat ||
+ pa->bytesperline != pb->bytesperline)
+ return 0;
+ }
+ return 1;
+}
+
+
static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
{
V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
V4L2Context *const output = &s->output;
+ // Signal EOF if needed
+ if (!frame) {
+ return ff_v4l2_context_enqueue_frame(output, frame);
+ }
+
+ if (s->input_drm && !output->streamon) {
+ int rv;
+ struct v4l2_format req_format = {.type = output->format.type};
+
+ // Set format when we first get a buffer
+ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
+ return rv;
+ }
+
+ ff_v4l2_context_release(output);
+
+ output->format = req_format;
+
+ if ((rv = ff_v4l2_context_set_format(output)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
+ return rv;
+ }
+
+ if (!fmt_eq(&req_format, &output->format)) {
+ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
+ return AVERROR(EINVAL);
+ }
+
+ output->selection.top = frame->crop_top;
+ output->selection.left = frame->crop_left;
+ output->selection.width = av_frame_cropped_width(frame);
+ output->selection.height = av_frame_cropped_height(frame);
+
+ if ((rv = ff_v4l2_context_init(output)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
+ return rv;
+ }
+
+ {
+ struct v4l2_selection selection = {
+ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
+ .target = V4L2_SEL_TGT_CROP,
+ .r = output->selection
+ };
+ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
+ selection.r.width, selection.r.height, selection.r.left, selection.r.top,
+ av_err2str(AVERROR(errno)));
+ }
+ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
+ selection.r.width, selection.r.height, selection.r.left, selection.r.top);
+ }
+ }
+
#ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
- if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
+ if (frame->pict_type == AV_PICTURE_TYPE_I)
v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
#endif
@@ -328,7 +528,70 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
}
dequeue:
- return ff_v4l2_context_dequeue_packet(capture, avpkt);
+ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
+ return ret;
+
+ if (capture->first_buf == 1) {
+ uint8_t * data;
+ const int len = avpkt->size;
+
+ // 1st buffer after streamon should be SPS/PPS
+ capture->first_buf = 2;
+
+ // Clear both possible stores so there is no chance of confusion
+ av_freep(&s->extdata_data);
+ s->extdata_size = 0;
+ av_freep(&avctx->extradata);
+ avctx->extradata_size = 0;
+
+ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
+ memcpy(data, avpkt->data, len);
+
+ av_packet_unref(avpkt);
+
+ if (data == NULL)
+ return AVERROR(ENOMEM);
+
+ // We need to copy the header, but keep local if not global
+ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
+ avctx->extradata = data;
+ avctx->extradata_size = len;
+ }
+ else {
+ s->extdata_data = data;
+ s->extdata_size = len;
+ }
+
+ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
+ return ret;
+ }
+
+ // First frame must be key so mark as such even if encoder forgot
+ if (capture->first_buf == 2)
+ avpkt->flags |= AV_PKT_FLAG_KEY;
+
+ // Add SPS/PPS to the start of every key frame if non-global headers
+ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
+ const size_t newlen = s->extdata_size + avpkt->size;
+ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
+
+ if (buf == NULL) {
+ av_packet_unref(avpkt);
+ return AVERROR(ENOMEM);
+ }
+
+ memcpy(buf->data, s->extdata_data, s->extdata_size);
+ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
+
+ av_buffer_unref(&avpkt->buf);
+ avpkt->buf = buf;
+ avpkt->data = buf->data;
+ avpkt->size = newlen;
+ }
+
+// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
+ capture->first_buf = 0;
+ return 0;
}
static av_cold int v4l2_encode_init(AVCodecContext *avctx)
@@ -340,6 +603,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
uint32_t v4l2_fmt_output;
int ret;
+ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
+
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
@@ -347,13 +612,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
capture = &s->capture;
output = &s->output;
+ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
+
/* common settings output/capture */
output->height = capture->height = avctx->height;
output->width = capture->width = avctx->width;
/* output context */
output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
- output->av_pix_fmt = avctx->pix_fmt;
+ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
+ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
+ AV_PIX_FMT_YUV420P;
/* capture context */
capture->av_codec_id = avctx->codec_id;
@@ -372,7 +641,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
v4l2_fmt_output = output->format.fmt.pix.pixelformat;
pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
- if (pix_fmt_output != avctx->pix_fmt) {
+ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
return AVERROR(EINVAL);
From 6b437ce70582c67971aa81871a6694a08b709784 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 8 Jun 2022 16:13:31 +0000
Subject: [PATCH 055/161] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is
always NO_PTS
If we do have a DTS but no PTS then assume PTS=DTS.
Also get rid of last_pkt_dts from the tracking state, as its
information wasn't actually useful in any way.
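Purely as an illustration (not part of the patch itself), the fallback
this adds amounts to the sketch below; the helper name is hypothetical
and AV_NOPTS_VALUE comes from libavutil:

    #include <stdint.h>
    #include <libavutil/avutil.h>   /* AV_NOPTS_VALUE */

    /* Prefer the stats-based PTS guess; otherwise fall back to the DTS
     * carried through the tracking ring for this frame. */
    static int64_t best_effort_ts(int64_t guessed_pts, int64_t pkt_dts)
    {
        return guessed_pts != AV_NOPTS_VALUE ? guessed_pts : pkt_dts;
    }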
---
libavcodec/v4l2_context.c | 6 ++----
libavcodec/v4l2_m2m.h | 1 -
libavcodec/v4l2_m2m_dec.c | 8 +++++++-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 7a707d21fc..6b97eab41e 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -73,7 +73,6 @@ xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPack
track_pts = track_to_pts(avctx, x->track_no);
av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
- x->last_pkt_dts = avpkt->dts;
x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
.discard = 0,
.pending = 1,
@@ -100,7 +99,6 @@ xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFr
track_pts = track_to_pts(avctx, x->track_no);
av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
- x->last_pkt_dts = frame->pkt_dts;
x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
.discard = 0,
.pending = 1,
@@ -129,7 +127,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx,
av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
"Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
frame->pts = AV_NOPTS_VALUE;
- frame->pkt_dts = x->last_pkt_dts;
+ frame->pkt_dts = AV_NOPTS_VALUE;
frame->reordered_opaque = x->last_opaque;
frame->pkt_pos = -1;
frame->pkt_duration = 0;
@@ -138,7 +136,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx,
else if (!t->discard)
{
frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
- frame->pkt_dts = x->last_pkt_dts;
+ frame->pkt_dts = t->dts;
frame->reordered_opaque = t->reordered_opaque;
frame->pkt_pos = t->pkt_pos;
frame->pkt_duration = t->pkt_duration;
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index d6cdaf65e1..ee72beb052 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -67,7 +67,6 @@ typedef struct pts_stats_s
typedef struct xlat_track_s {
unsigned int track_no;
int64_t last_pts;
- int64_t last_pkt_dts;
int64_t last_opaque;
V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
} xlat_track_t;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index fbbfc81342..485a96f4b4 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -177,7 +177,13 @@ set_best_effort_pts(AVCodecContext *const avctx,
pts_stats_add(ps, frame->pts);
frame->best_effort_timestamp = pts_stats_guess(ps);
- frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner?
+ // If we can't guess from just PTS - try DTS
+ if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
+ frame->best_effort_timestamp = frame->pkt_dts;
+
+ // We can't emulate what s/w does in a useful manner and using the
+ // "correct" answer seems to just confuse things.
+ frame->pkt_dts = frame->pts;
av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
}
From ec8d1c2c0b6bd3544e5e30500a167fc31abde17a Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 30 Jun 2022 15:59:23 +0000
Subject: [PATCH 056/161] v4l2: Update H265 request for current API
This works with v9 of the H265 patch set, which hopefully will be the
last one. The HEVC controls are extracted from the patched
v4l2-controls header into hevc-ctrls-v4.h; if HEVC controls are found
in the system v4l2-controls header then those are used instead.
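For reference, the header-selection pattern used by the new v4 backend
can be sketched as follows (illustrative only; the real version is in
v4l2_req_hevc_vx.c below):

    /* Prefer the system v4l2-controls definitions when they exist,
     * otherwise fall back to the bundled copy of the controls header. */
    #include <linux/v4l2-controls.h>
    #if !defined(V4L2_CID_STATELESS_HEVC_SPS)
    #include "hevc-ctrls-v4.h"
    #endif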
---
libavcodec/Makefile | 2 +-
libavcodec/hevc-ctrls-v4.h | 515 +++++++++++++++++++++++++++++++++
libavcodec/v4l2_req_hevc_v4.c | 3 +
libavcodec/v4l2_req_hevc_vx.c | 81 ++++--
libavcodec/v4l2_request_hevc.c | 6 +-
libavcodec/v4l2_request_hevc.h | 1 +
6 files changed, 583 insertions(+), 25 deletions(-)
create mode 100644 libavcodec/hevc-ctrls-v4.h
create mode 100644 libavcodec/v4l2_req_hevc_v4.c
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 2b3c16185d..d433a71236 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o
OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\
- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o
OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o
OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
new file mode 100644
index 0000000000..7e05f6e7c3
--- /dev/null
+++ b/libavcodec/hevc-ctrls-v4.h
@@ -0,0 +1,515 @@
+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Video for Linux Two controls header file
+ *
+ * Copyright (C) 1999-2012 the contributors
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Alternatively you can redistribute this file under the terms of the
+ * BSD license as stated below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The contents of this header was split off from videodev2.h. All control
+ * definitions should be added to this header, which is included by
+ * videodev2.h.
+ */
+
+#ifndef AVCODEC_HEVC_CTRLS_V4_H
+#define AVCODEC_HEVC_CTRLS_V4_H
+
+#include <linux/const.h>
+#include <linux/types.h>
+
+#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
+#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401)
+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402)
+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403)
+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404)
+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405)
+#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406)
+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
+
+enum v4l2_stateless_hevc_decode_mode {
+ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
+ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
+};
+
+enum v4l2_stateless_hevc_start_code {
+ V4L2_STATELESS_HEVC_START_CODE_NONE,
+ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+};
+
+#define V4L2_HEVC_SLICE_TYPE_B 0
+#define V4L2_HEVC_SLICE_TYPE_P 1
+#define V4L2_HEVC_SLICE_TYPE_I 2
+
+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
+
+/**
+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
+ *
+ * @video_parameter_set_id: specifies the value of the
+ * vps_video_parameter_set_id of the active VPS
+ * @seq_parameter_set_id: provides an identifier for the SPS for
+ * reference by other syntax elements
+ * @pic_width_in_luma_samples: specifies the width of each decoded picture
+ * in units of luma samples
+ * @pic_height_in_luma_samples: specifies the height of each decoded picture
+ * in units of luma samples
+ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
+ * samples of the luma array
+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
+ * samples of the chroma arrays
+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
+ * the variable MaxPicOrderCntLsb
+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
+ * required size of the decoded picture
+ * buffer for the coded video sequence
+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
+ * value of SpsMaxLatencyPictures array
+ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
+ * luma coding block size
+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
+ * the maximum and minimum luma
+ * coding block size
+ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
+ * transform block size
+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
+ * the maximum and minimum luma
+ * transform block size
+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
+ * depth for transform units of
+ * coding units coded in inter
+ * prediction mode
+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
+ * depth for transform units of
+ * coding units coded in intra
+ * prediction mode
+ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
+ * bits used to represent each of PCM sample
+ * values of the luma component
+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
+ * of bits used to represent each of PCM
+ * sample values of the chroma components
+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
+ * minimum size of coding blocks
+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
+ * the maximum and minimum size of
+ * coding blocks
+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
+ * syntax structures included in the SPS
+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
+ * reference pictures that are specified in the SPS
+ * @chroma_format_idc: specifies the chroma sampling
+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
+ * of temporal sub-layers
+ * @reserved: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_SPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_sps {
+ __u8 video_parameter_set_id;
+ __u8 seq_parameter_set_id;
+ __u16 pic_width_in_luma_samples;
+ __u16 pic_height_in_luma_samples;
+ __u8 bit_depth_luma_minus8;
+ __u8 bit_depth_chroma_minus8;
+ __u8 log2_max_pic_order_cnt_lsb_minus4;
+ __u8 sps_max_dec_pic_buffering_minus1;
+ __u8 sps_max_num_reorder_pics;
+ __u8 sps_max_latency_increase_plus1;
+ __u8 log2_min_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_luma_coding_block_size;
+ __u8 log2_min_luma_transform_block_size_minus2;
+ __u8 log2_diff_max_min_luma_transform_block_size;
+ __u8 max_transform_hierarchy_depth_inter;
+ __u8 max_transform_hierarchy_depth_intra;
+ __u8 pcm_sample_bit_depth_luma_minus1;
+ __u8 pcm_sample_bit_depth_chroma_minus1;
+ __u8 log2_min_pcm_luma_coding_block_size_minus3;
+ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
+ __u8 num_short_term_ref_pic_sets;
+ __u8 num_long_term_ref_pics_sps;
+ __u8 chroma_format_idc;
+ __u8 sps_max_sub_layers_minus1;
+
+ __u8 reserved[6];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
+
+/**
+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
+ *
+ * @pic_parameter_set_id: identifies the PPS for reference by other
+ * syntax elements
+ * @num_extra_slice_header_bits: specifies the number of extra slice header
+ * bits that are present in the slice header RBSP
+ * for coded pictures referring to the PPS.
+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
+ * inferred value of num_ref_idx_l0_active_minus1
+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
+ * inferred value of num_ref_idx_l1_active_minus1
+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQpY for
+ * each slice referring to the PPS
+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
+ * tree block size and the minimum luma coding block
+ * size of coding units that convey cu_qp_delta_abs
+ * and cu_qp_delta_sign_flag
+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
+ * partitioning the picture
+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
+ * the picture
+ * @column_width_minus1: this value plus 1 specifies the width of each tile column in
+ * units of coding tree blocks
+ * @row_height_minus1: this value plus 1 specifies the height of each tile row in
+ * units of coding tree blocks
+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
+ * beta divided by 2
+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
+ * divided by 2
+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
+ * the variable Log2ParMrgLevel
+ * @reserved: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_PPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_pps {
+ __u8 pic_parameter_set_id;
+ __u8 num_extra_slice_header_bits;
+ __u8 num_ref_idx_l0_default_active_minus1;
+ __u8 num_ref_idx_l1_default_active_minus1;
+ __s8 init_qp_minus26;
+ __u8 diff_cu_qp_delta_depth;
+ __s8 pps_cb_qp_offset;
+ __s8 pps_cr_qp_offset;
+ __u8 num_tile_columns_minus1;
+ __u8 num_tile_rows_minus1;
+ __u8 column_width_minus1[20];
+ __u8 row_height_minus1[22];
+ __s8 pps_beta_offset_div2;
+ __s8 pps_tc_offset_div2;
+ __u8 log2_parallel_merge_level_minus2;
+ __u8 reserved;
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
+
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7
+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10
+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11
+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12
+
+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
+
+/**
+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
+ *
+ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
+ * @flags: long term flag for the reference frame
+ * @field_pic: whether the reference is a field picture or a frame.
+ * @reserved: padding field. Should be zeroed by applications.
+ * @pic_order_cnt_val: the picture order count of the current picture.
+ */
+struct v4l2_hevc_dpb_entry {
+ __u64 timestamp;
+ __u8 flags;
+ __u8 field_pic;
+ __u16 reserved;
+ __s32 pic_order_cnt_val;
+};
+
+/**
+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
+ *
+ * @delta_luma_weight_l0: the difference of the weighting factor applied
+ * to the luma prediction value for list 0
+ * @luma_offset_l0: the additive offset applied to the luma prediction value
+ * for list 0
+ * @delta_chroma_weight_l0: the difference of the weighting factor applied
+ * to the chroma prediction values for list 0
+ * @chroma_offset_l0: the difference of the additive offset applied to
+ * the chroma prediction values for list 0
+ * @delta_luma_weight_l1: the difference of the weighting factor applied
+ * to the luma prediction value for list 1
+ * @luma_offset_l1: the additive offset applied to the luma prediction value
+ * for list 1
+ * @delta_chroma_weight_l1: the difference of the weighting factor applied
+ * to the chroma prediction values for list 1
+ * @chroma_offset_l1: the difference of the additive offset applied to
+ * the chroma prediction values for list 1
+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
+ * all luma weighting factors
+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
+ * of the denominator for all chroma
+ * weighting factors
+ */
+struct v4l2_hevc_pred_weight_table {
+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
+
+ __u8 luma_log2_weight_denom;
+ __s8 delta_chroma_log2_weight_denom;
+};
+
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
+
+/**
+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
+ *
+ * This control is a dynamically sized 1-dimensional array,
+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
+ *
+ * @bit_size: size (in bits) of the current slice data
+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
+ * @num_entry_point_offsets: specifies the number of entry point offset syntax
+ * elements in the slice header.
+ * @nal_unit_type: specifies the coding type of the slice (B, P or I)
+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
+ * @colour_plane_id: specifies the colour plane associated with the current slice
+ * @slice_pic_order_cnt: specifies the picture order count
+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
+ * reference index for reference picture list 0
+ * that may be used to decode the slice
+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
+ * reference index for reference picture list 1
+ * that may be used to decode the slice
+ * @collocated_ref_idx: specifies the reference index of the collocated picture used
+ * for temporal motion vector prediction
+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
+ * motion vector prediction candidates supported in
+ * the slice subtracted from 5
+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
+ * blocks in the slice
+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
+ * @slice_act_y_qp_offset: screen content extension parameters
+ * @slice_act_cb_qp_offset: screen content extension parameters
+ * @slice_act_cr_qp_offset: screen content extension parameters
+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
+ * more fields
+ * @reserved0: padding field. Should be zeroed by applications.
+ * @slice_segment_addr: specifies the address of the first coding tree block in
+ * the slice segment
+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ * pictures set included in the SPS
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ * pictures set included in the SPS
+ * @pred_weight_table: the prediction weight coefficients for inter-picture
+ * prediction
+ * @reserved1: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_slice_params {
+ __u32 bit_size;
+ __u32 data_byte_offset;
+ __u32 num_entry_point_offsets;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
+ __u8 nal_unit_type;
+ __u8 nuh_temporal_id_plus1;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u8 slice_type;
+ __u8 colour_plane_id;
+ __s32 slice_pic_order_cnt;
+ __u8 num_ref_idx_l0_active_minus1;
+ __u8 num_ref_idx_l1_active_minus1;
+ __u8 collocated_ref_idx;
+ __u8 five_minus_max_num_merge_cand;
+ __s8 slice_qp_delta;
+ __s8 slice_cb_qp_offset;
+ __s8 slice_cr_qp_offset;
+ __s8 slice_act_y_qp_offset;
+ __s8 slice_act_cb_qp_offset;
+ __s8 slice_act_cr_qp_offset;
+ __s8 slice_beta_offset_div2;
+ __s8 slice_tc_offset_div2;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
+ __u8 pic_struct;
+
+ __u8 reserved0[3];
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
+ __u32 slice_segment_addr;
+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u16 short_term_ref_pic_set_size;
+ __u16 long_term_ref_pic_set_size;
+
+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
+ struct v4l2_hevc_pred_weight_table pred_weight_table;
+
+ __u8 reserved1[2];
+ __u64 flags;
+};
+
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
+
+/**
+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
+ *
+ * @pic_order_cnt_val: picture order count
+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
+ * pictures set included in the SPS of the first slice
+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
+ * pictures set included in the SPS of the first slice
+ * @num_active_dpb_entries: the number of entries in dpb
+ * @num_poc_st_curr_before: the number of reference pictures in the short-term
+ * set that come before the current frame
+ * @num_poc_st_curr_after: the number of reference pictures in the short-term
+ * set that come after the current frame
+ * @num_poc_lt_curr: the number of reference pictures in the long-term set
+ * @poc_st_curr_before: provides the index of the short term before references
+ * in DPB array
+ * @poc_st_curr_after: provides the index of the short term after references
+ * in DPB array
+ * @poc_lt_curr: provides the index of the long term references in DPB array
+ * @reserved: padding field. Should be zeroed by applications.
+ * @dpb: the decoded picture buffer, for meta-data about reference frames
+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_decode_params {
+ __s32 pic_order_cnt_val;
+ __u16 short_term_ref_pic_set_size;
+ __u16 long_term_ref_pic_set_size;
+ __u8 num_active_dpb_entries;
+ __u8 num_poc_st_curr_before;
+ __u8 num_poc_st_curr_after;
+ __u8 num_poc_lt_curr;
+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u8 reserved[4];
+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
+ __u64 flags;
+};
+
+/**
+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
+ *
+ * @scaling_list_4x4: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_8x8: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_16x16: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_32x32: scaling list is used for the scaling process for
+ * transform coefficients. The values on each scaling
+ * list are expected in raster scan order
+ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
+ * for transform coefficients. The values on each
+ * scaling list are expected in raster scan order.
+ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process
+ * for transform coefficients. The values on each
+ * scaling list are expected in raster scan order.
+ */
+struct v4l2_ctrl_hevc_scaling_matrix {
+ __u8 scaling_list_4x4[6][16];
+ __u8 scaling_list_8x8[6][64];
+ __u8 scaling_list_16x16[6][64];
+ __u8 scaling_list_32x32[2][64];
+ __u8 scaling_list_dc_coef_16x16[6];
+ __u8 scaling_list_dc_coef_32x32[2];
+};
+
+#endif
diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c
new file mode 100644
index 0000000000..c35579d8e0
--- /dev/null
+++ b/libavcodec/v4l2_req_hevc_v4.c
@@ -0,0 +1,3 @@
+#define HEVC_CTRLS_VERSION 4
+#include "v4l2_req_hevc_vx.c"
+
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 611fa21cc3..761c5b2dc7 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -6,8 +6,6 @@
#include "internal.h"
#include "thread.h"
-#include "v4l2_request_hevc.h"
-
#if HEVC_CTRLS_VERSION == 1
#include "hevc-ctrls-v1.h"
@@ -18,10 +16,37 @@
#include "hevc-ctrls-v2.h"
#elif HEVC_CTRLS_VERSION == 3
#include "hevc-ctrls-v3.h"
+#elif HEVC_CTRLS_VERSION == 4
+#include <linux/v4l2-controls.h>
+#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
+#include "hevc-ctrls-v4.h"
+#endif
#else
#error Unknown HEVC_CTRLS_VERSION
#endif
+#ifndef V4L2_CID_STATELESS_HEVC_SPS
+#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS
+#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS
+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
+#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
+
+#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
+#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
+#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
+#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
+#endif
+
+// Should be in videodev2 but we might not have a good enough one
+#ifndef V4L2_PIX_FMT_HEVC_SLICE
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+#endif
+
+#include "v4l2_request_hevc.h"
+
#include "libavutil/hwcontext_drm.h"
#include <semaphore.h>
@@ -259,9 +284,13 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const
#endif
entry->field_pic = frame->frame->interlaced_frame;
+#if HEVC_CTRLS_VERSION <= 3
/* TODO: Interleaved: Get the POC for each field. */
entry->pic_order_cnt[0] = frame->poc;
entry->pic_order_cnt[1] = frame->poc;
+#else
+ entry->pic_order_cnt_val = frame->poc;
+#endif
}
}
return n;
@@ -287,8 +316,11 @@ static void fill_slice_params(const HEVCContext * const h,
*slice_params = (struct v4l2_ctrl_hevc_slice_params) {
.bit_size = bit_size,
+#if HEVC_CTRLS_VERSION <= 3
.data_bit_offset = bit_offset,
-
+#else
+ .data_byte_offset = bit_offset / 8 + 1,
+#endif
/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
.slice_segment_addr = sh->slice_segment_addr,
@@ -376,8 +408,10 @@ static void fill_slice_params(const HEVCContext * const h,
av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
}
+#if HEVC_CTRLS_VERSION <= 3
for (i = 0; i < slice_params->num_entry_point_offsets; i++)
slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
+#endif
}
#if HEVC_CTRLS_VERSION >= 2
@@ -761,30 +795,30 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
struct v4l2_ext_control control[] = {
{
- .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
+ .id = V4L2_CID_STATELESS_HEVC_SPS,
.ptr = &controls->sps,
.size = sizeof(controls->sps),
},
{
- .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS,
+ .id = V4L2_CID_STATELESS_HEVC_PPS,
.ptr = &controls->pps,
.size = sizeof(controls->pps),
},
#if HEVC_CTRLS_VERSION >= 2
{
- .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
+ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
.ptr = dec,
.size = sizeof(*dec),
},
#endif
{
- .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
+ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
.ptr = slices + slice_no,
.size = sizeof(*slices) * slice_count,
},
// Optional
{
- .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
+ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
.ptr = &controls->scaling_matrix,
.size = sizeof(controls->scaling_matrix),
},
@@ -1000,12 +1034,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
// Check for var slice array
struct v4l2_query_ext_ctrl qc[] = {
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS },
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS },
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS },
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX },
+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
+ { .id = V4L2_CID_STATELESS_HEVC_SPS },
+ { .id = V4L2_CID_STATELESS_HEVC_PPS },
+ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
#if HEVC_CTRLS_VERSION >= 2
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS },
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
#endif
};
// Order & size must match!
@@ -1042,12 +1076,13 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
fill_sps(&ctrl_sps, sps);
- if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
return AVERROR(EINVAL);
}
ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
+ av_log(avctx, AV_LOG_INFO, "%s SPS multi-slice\n", ctx->multi_slice ? "Has" : "No");
return 0;
}
@@ -1058,29 +1093,29 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
int ret;
struct v4l2_query_ext_ctrl querys[] = {
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, },
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
};
struct v4l2_ext_control ctrls[] = {
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
- { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
};
mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
ctx->decode_mode = querys[0].default_value;
- if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED &&
- ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) {
+ if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
+ ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
return AVERROR(EINVAL);
}
ctx->start_code = querys[1].default_value;
- if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE &&
- ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
+ if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE &&
+ ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
return AVERROR(EINVAL);
}
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index 20e4e0ab15..cd79aad563 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
goto fail4;
}
- if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
+ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
+ ctx->fns = &V2(ff_v4l2_req_hevc, 4);
+ }
+ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
ctx->fns = &V2(ff_v4l2_req_hevc, 3);
}
diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
index ed48d62e2d..d4adb3f812 100644
--- a/libavcodec/v4l2_request_hevc.h
+++ b/libavcodec/v4l2_request_hevc.h
@@ -99,5 +99,6 @@ typedef struct v4l2_req_decode_fns {
extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
#endif
From 21a348ae3282318fa96d3a6e2c70f3d4b90a7d52 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sun, 3 Jul 2022 13:40:41 +0000
Subject: [PATCH 057/161] v4l2_req: Observe limit on size of slice_array
This in fact provides some minor simplifications by combining the
multi-slice and single-slice paths.
(cherry picked from commit 7631e6d1a66fca9048605c214f3464c90d37932c)
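
A rough standalone sketch of the resulting submission loop (illustrative
only; the function names here are invented and are not the FFmpeg or
driver API):

    #include <stdio.h>

    /* Stand-in for queueing one request covering slices [first, last). */
    static void send_block(unsigned int first, unsigned int last)
    {
        printf("submit slices [%u..%u)\n", first, last);
    }

    /* Walk all slices in blocks of at most max_slices.  With
     * max_slices == 1 this is the old one-slice-per-request path, so a
     * single loop covers both the multi-slice and single-slice cases. */
    static void send_all(unsigned int num_slices, unsigned int max_slices)
    {
        for (unsigned int i = 0; i < num_slices; i += max_slices) {
            const unsigned int e =
                i + max_slices < num_slices ? i + max_slices : num_slices;
            send_block(i, e);
        }
    }

    int main(void)
    {
        send_all(10, 4);    /* prints [0..4) [4..8) [8..10) */
        return 0;
    }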
---
libavcodec/v4l2_req_hevc_vx.c | 39 ++++++++++++++--------------------
libavcodec/v4l2_request_hevc.h | 5 +----
2 files changed, 17 insertions(+), 27 deletions(-)
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 761c5b2dc7..9d08d13d9e 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -840,18 +840,21 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
int bcount = get_bits_count(&h->HEVClc->gb);
uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
+ const unsigned int n = rd->num_slices;
+ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices;
+
int rv;
struct slice_info * si;
if ((rv = slice_add(rd)) != 0)
return rv;
- si = rd->slices + rd->num_slices - 1;
+ si = rd->slices + n;
si->ptr = buffer;
si->len = size;
- if (ctx->multi_slice && rd->num_slices > 1) {
- struct slice_info *const si0 = rd->slices;
+ if (n != block_start) {
+ struct slice_info *const si0 = rd->slices + block_start;
const size_t offset = (buffer - si0->ptr);
boff += offset * 8;
size += offset;
@@ -859,11 +862,11 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
}
#if HEVC_CTRLS_VERSION >= 2
- if (rd->num_slices == 1)
+ if (n == 0)
fill_decode_params(h, &rd->dec);
- fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff);
+ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
#else
- fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff);
+ fill_slice_params(h, rd->slice_params + n, size * 8, boff);
#endif
return 0;
@@ -997,18 +1000,11 @@ static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
}
// Send as slices
- if (ctx->multi_slice)
- {
- if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0)
+ for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
+ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
+ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
goto fail;
}
- else
- {
- for (i = 0; i != rd->num_slices; ++i) {
- if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0)
- goto fail;
- }
- }
// Set the drm_prime desriptor
drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
@@ -1081,8 +1077,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
return AVERROR(EINVAL);
}
- ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
- av_log(avctx, AV_LOG_INFO, "%s SPS multi-slice\n", ctx->multi_slice ? "Has" : "No");
return 0;
}
@@ -1120,11 +1114,10 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
return AVERROR(EINVAL);
}
- ctx->max_slices = querys[2].elems;
- if (ctx->max_slices > MAX_SLICES) {
- av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
- return AVERROR(EINVAL);
- }
+ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
+ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
+ 1 : querys[2].dims[0];
+ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
ctrls[0].value = ctx->decode_mode;
ctrls[1].value = ctx->start_code;
diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
index d4adb3f812..0029e23309 100644
--- a/libavcodec/v4l2_request_hevc.h
+++ b/libavcodec/v4l2_request_hevc.h
@@ -46,8 +46,6 @@
#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800
#endif
-#define MAX_SLICES 128
-
#define VCAT(name, version) name##_v##version
#define V2(n,v) VCAT(n, v)
#define V(n) V2(n, HEVC_CTRLS_VERSION)
@@ -64,10 +62,9 @@ typedef struct V4L2RequestContextHEVC {
unsigned int timestamp; // ?? maybe uint64_t
- int multi_slice;
int decode_mode;
int start_code;
- int max_slices;
+ unsigned int max_slices;
req_decode_q decode_q;
From 4f1d74cc8eea6a1bd6f2317a10c0ecf620315dec Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 4 Jul 2022 14:43:20 +0100
Subject: [PATCH 058/161] v4l2_req: Add entry point offsets array control
---
libavcodec/v4l2_req_hevc_vx.c | 88 +++++++++++++++++++++++++++-------
libavcodec/v4l2_request_hevc.h | 3 +-
2 files changed, 72 insertions(+), 19 deletions(-)
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 9d08d13d9e..43ef6631ed 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -82,11 +82,16 @@ typedef struct V4L2MediaReqDescriptor {
struct v4l2_ctrl_hevc_slice_params * slice_params;
struct slice_info * slices;
+ size_t num_offsets;
+ size_t alloced_offsets;
+ uint32_t *offsets;
+
} V4L2MediaReqDescriptor;
struct slice_info {
const uint8_t * ptr;
size_t len; // bytes
+ size_t n_offsets;
};
// Handy container for accumulating controls before setting
@@ -245,7 +250,7 @@ static int slice_add(V4L2MediaReqDescriptor * const rd)
if (rd->num_slices >= rd->alloced_slices) {
struct v4l2_ctrl_hevc_slice_params * p2;
struct slice_info * s2;
- size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2;
+ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2;
p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
if (p2 == NULL)
@@ -263,6 +268,23 @@ static int slice_add(V4L2MediaReqDescriptor * const rd)
return 0;
}
+static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
+{
+ if (rd->num_offsets + n > rd->alloced_offsets) {
+ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2;
+ void * p2;
+ while (rd->num_offsets + n > n2)
+ n2 *= 2;
+ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
+ return AVERROR(ENOMEM);
+ rd->offsets = p2;
+ rd->alloced_offsets = n2;
+ }
+ for (size_t i = 0; i != n; ++i)
+ rd->offsets[rd->num_offsets++] = offsets[i] - 1;
+ return 0;
+}
+
static unsigned int
fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
{
@@ -403,12 +425,12 @@ static void fill_slice_params(const HEVCContext * const h,
fill_pred_table(h, &slice_params->pred_weight_table);
slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
+#if HEVC_CTRLS_VERSION <= 3
if (slice_params->num_entry_point_offsets > 256) {
slice_params->num_entry_point_offsets = 256;
av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
}
-#if HEVC_CTRLS_VERSION <= 3
for (i = 0; i < slice_params->num_entry_point_offsets; i++)
slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
#endif
@@ -787,13 +809,17 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
#if HEVC_CTRLS_VERSION >= 2
struct v4l2_ctrl_hevc_decode_params * const dec,
#endif
- struct v4l2_ctrl_hevc_slice_params * const slices,
- const unsigned int slice_no,
- const unsigned int slice_count)
+ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
+ void * const offsets, const size_t offset_count)
{
int rv;
+#if HEVC_CTRLS_VERSION >= 2
+ unsigned int n = 4;
+#else
+ unsigned int n = 3;
+#endif
- struct v4l2_ext_control control[] = {
+ struct v4l2_ext_control control[6] = {
{
.id = V4L2_CID_STATELESS_HEVC_SPS,
.ptr = &controls->sps,
@@ -813,21 +839,28 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
#endif
{
.id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
- .ptr = slices + slice_no,
+ .ptr = slices,
.size = sizeof(*slices) * slice_count,
},
- // Optional
- {
+ };
+
+ if (controls->has_scaling)
+ control[n++] = (struct v4l2_ext_control) {
.id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
.ptr = &controls->scaling_matrix,
.size = sizeof(controls->scaling_matrix),
- },
- };
+ };
+
+#if HEVC_CTRLS_VERSION >= 4
+ if (offsets)
+ control[n++] = (struct v4l2_ext_control) {
+ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
+ .ptr = offsets,
+ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
+ };
+#endif
- rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control,
- controls->has_scaling ?
- FF_ARRAY_ELEMS(control) :
- FF_ARRAY_ELEMS(control) - 1);
+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
return rv;
}
@@ -852,6 +885,7 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
si = rd->slices + n;
si->ptr = buffer;
si->len = size;
+ si->n_offsets = rd->num_offsets;
if (n != block_start) {
struct slice_info *const si0 = rd->slices + block_start;
@@ -868,6 +902,9 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
#else
fill_slice_params(h, rd->slice_params + n, size * 8, boff);
#endif
+ if (ctx->max_offsets != 0 &&
+ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
+ return rv;
return 0;
}
@@ -893,10 +930,13 @@ static int send_slice(AVCodecContext * const avctx,
{
V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
+ const int is_last = (j == rd->num_slices);
struct slice_info *const si = rd->slices + i;
struct media_request * req = NULL;
struct qent_src * src = NULL;
MediaBufsStatus stat;
+ void * offsets = rd->offsets + rd->slices[i].n_offsets;
+ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets;
if ((req = media_request_get(ctx->mpool)) == NULL) {
av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
@@ -908,8 +948,8 @@ static int send_slice(AVCodecContext * const avctx,
#if HEVC_CTRLS_VERSION >= 2
&rd->dec,
#endif
- rd->slice_params,
- i, j - i)) {
+ rd->slice_params + i, j - i,
+ offsets, n_offsets)) {
av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
goto fail1;
}
@@ -935,7 +975,7 @@ static int send_slice(AVCodecContext * const avctx,
stat = mediabufs_start_request(ctx->mbufs, &req, &src,
i == 0 ? rd->qe_dst : NULL,
- j == rd->num_slices);
+ is_last);
if (stat != MEDIABUFS_STATUS_SUCCESS) {
av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
@@ -1090,6 +1130,9 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
{ .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
{ .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
{ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
+#if HEVC_CTRLS_VERSION >= 4
+ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
+#endif
};
struct v4l2_ext_control ctrls[] = {
@@ -1119,6 +1162,14 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
1 : querys[2].dims[0];
av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
+#if HEVC_CTRLS_VERSION >= 4
+ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
+ 0 : querys[3].dims[0];
+ av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
+#else
+ ctx->max_offsets = 0;
+#endif
+
ctrls[0].value = ctx->decode_mode;
ctrls[1].value = ctx->start_code;
@@ -1141,6 +1192,7 @@ static void v4l2_req_frame_free(void *opaque, uint8_t *data)
av_freep(&rd->slices);
av_freep(&rd->slice_params);
+ av_freep(&rd->offsets);
av_free(rd);
}
diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
index 0029e23309..99c90064ea 100644
--- a/libavcodec/v4l2_request_hevc.h
+++ b/libavcodec/v4l2_request_hevc.h
@@ -64,7 +64,8 @@ typedef struct V4L2RequestContextHEVC {
int decode_mode;
int start_code;
- unsigned int max_slices;
+ unsigned int max_slices; // 0 => not wanted (frame mode)
+ unsigned int max_offsets; // 0 => not wanted
req_decode_q decode_q;
From d0e5ed2dff1b8f8909ceb968cb3afe2b20093fda Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 4 Jul 2022 16:22:54 +0100
Subject: [PATCH 059/161] v4l2_req: Support Annex B
---
libavcodec/v4l2_req_hevc_vx.c | 61 +++++++++++++++++++++++------------
1 file changed, 41 insertions(+), 20 deletions(-)
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 43ef6631ed..5e0db9850a 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -879,6 +879,18 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
int rv;
struct slice_info * si;
+ // This looks dodgy but we know that FFmpeg has parsed this from a buffer
+ // that contains the entire frame including the start code
+ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
+ buffer -= 3;
+ size += 3;
+ boff += 24;
+ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
+ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
+ buffer[0], buffer[1], buffer[2]);
+ }
+ }
+
if ((rv = slice_add(rd)) != 0)
return rv;
@@ -969,10 +981,6 @@ static int send_slice(AVCodecContext * const avctx,
goto fail2;
}
-#warning ANNEX_B start code
-// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
-// }
-
stat = mediabufs_start_request(ctx->mbufs, &req, &src,
i == 0 ? rd->qe_dst : NULL,
is_last);
@@ -1120,6 +1128,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
return 0;
}
+static inline int
+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
+{
+ return v >= c->minimum && v <= c->maximum;
+}
+
// Final init
static int
set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
@@ -1142,21 +1156,6 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
- ctx->decode_mode = querys[0].default_value;
-
- if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
- ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
- av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
- return AVERROR(EINVAL);
- }
-
- ctx->start_code = querys[1].default_value;
- if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE &&
- ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
- av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
- return AVERROR(EINVAL);
- }
-
ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
1 : querys[2].dims[0];
@@ -1165,11 +1164,33 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
#if HEVC_CTRLS_VERSION >= 4
ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
0 : querys[3].dims[0];
- av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
+ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
#else
ctx->max_offsets = 0;
#endif
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
+
+ if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
+ {
+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
+
+ // Prefer NONE as it doesn't require the slightly dodgy look
+ // backwards in our raw buffer
+ if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
+ else {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
+ return AVERROR(EINVAL);
+ }
+ }
+ else
+ {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
+ }
+
ctrls[0].value = ctx->decode_mode;
ctrls[1].value = ctx->start_code;
From a75506e18a964c9f50efa224a3fa4179c9ef2127 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 4 Jul 2022 18:24:03 +0100
Subject: [PATCH 060/161] v4l2_req: Add frame mode decode
---
libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------
1 file changed, 46 insertions(+), 23 deletions(-)
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 5e0db9850a..ada53d0d44 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -814,9 +814,9 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
{
int rv;
#if HEVC_CTRLS_VERSION >= 2
- unsigned int n = 4;
-#else
unsigned int n = 3;
+#else
+ unsigned int n = 2;
#endif
struct v4l2_ext_control control[6] = {
@@ -837,12 +837,14 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
.size = sizeof(*dec),
},
#endif
- {
+ };
+
+ if (slices)
+ control[n++] = (struct v4l2_ext_control) {
.id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
.ptr = slices,
.size = sizeof(*slices) * slice_count,
- },
- };
+ };
if (controls->has_scaling)
control[n++] = (struct v4l2_ext_control) {
@@ -865,6 +867,8 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
return rv;
}
+// This only works because we started out from a single coded frame buffer
+// that will remain intact until after end_frame
static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
{
const HEVCContext * const h = avctx->priv_data;
@@ -891,6 +895,17 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
}
}
+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
+ if (rd->slices == NULL) {
+ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
+ return AVERROR(ENOMEM);
+ rd->slices->ptr = buffer;
+ rd->num_slices = 1;
+ }
+ rd->slices->len = buffer - rd->slices->ptr + size;
+ return 0;
+ }
+
if ((rv = slice_add(rd)) != 0)
return rv;
@@ -1169,28 +1184,36 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
ctx->max_offsets = 0;
#endif
- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
-
- if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
- {
+ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED ||
+ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
+ ctx->decode_mode = querys[0].default_value;
+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
-
- // Prefer NONE as it doesn't require the slightly dodgy look
- // backwards in our raw buffer
- if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
- else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
- else {
- av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
- return AVERROR(EINVAL);
- }
- }
- else
- {
+ else {
av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
+ return AVERROR(EINVAL);
}
+ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE ||
+ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
+ ctx->start_code = querys[1].default_value;
+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
+ else {
+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
+ return AVERROR(EINVAL);
+ }
+
+ // If we are in slice mode & START_CODE_NONE supported then pick that
+ // as it doesn't require the slightly dodgy look backwards in our raw buffer
+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
+ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
+
ctrls[0].value = ctx->decode_mode;
ctrls[1].value = ctx->start_code;
From 9cf01f1485dcf71bcad7981d45029425d9abf115 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 5 Jul 2022 12:54:22 +0000
Subject: [PATCH 061/161] v4l2_req: Fix probe for frame based decode
---
libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++----------
1 file changed, 23 insertions(+), 10 deletions(-)
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index ada53d0d44..5d083016f8 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -1082,6 +1082,12 @@ fail:
return rv;
}
+static inline int
+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
+{
+ return v >= c->minimum && v <= c->maximum;
+}
+
// Initial check & init
static int
probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
@@ -1094,6 +1100,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
// Check for var slice array
struct v4l2_query_ext_ctrl qc[] = {
{ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
{ .id = V4L2_CID_STATELESS_HEVC_SPS },
{ .id = V4L2_CID_STATELESS_HEVC_PPS },
{ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
@@ -1104,6 +1111,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
// Order & size must match!
static const size_t ctrl_sizes[] = {
sizeof(struct v4l2_ctrl_hevc_slice_params),
+ sizeof(int32_t),
sizeof(struct v4l2_ctrl_hevc_sps),
sizeof(struct v4l2_ctrl_hevc_pps),
sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
@@ -1121,11 +1129,22 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
return AVERROR(EINVAL);
#endif
- if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
- av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
+ i = 0;
+#if HEVC_CTRLS_VERSION >= 4
+ // Skip slice check if no slice mode
+ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
+ i = 1;
+#else
+ // Fail frame mode silently for anything prior to V4
+ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
return AVERROR(EINVAL);
- }
- for (i = 0; i != noof_ctrls; ++i) {
+#endif
+ for (; i != noof_ctrls; ++i) {
+ if (qc[i].type == 0) {
+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
+ return AVERROR(EINVAL);
+ }
if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
@@ -1143,12 +1162,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
return 0;
}
-static inline int
-ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
-{
- return v >= c->minimum && v <= c->maximum;
-}
-
// Final init
static int
set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
From e7a62226f26073149d35c89268f56e17c8f45d76 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 26 Jul 2022 15:46:14 +0000
Subject: [PATCH 062/161] vf_deinterlace_v4l2m2m: Support NV12 through
deinterlace
Supports NV12 (though not yet NV12M) through deinterlace.
Also improves error handling such that attempting to deinterlace an
unsupported drm format causes an error.
No longer leaks frame structures.
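
For reference, the format gating amounts to roughly the sketch below
(illustrative only, assuming libdrm's drm_fourcc.h and linux/videodev2.h;
this is not the filter's own helper): only NV12 or YUV420 maps to a V4L2
pixel format, anything else is rejected up front.

    #include <stdint.h>
    #include <drm_fourcc.h>          /* DRM_FORMAT_NV12, DRM_FORMAT_YUV420 */
    #include <linux/videodev2.h>     /* V4L2_PIX_FMT_NV12, V4L2_PIX_FMT_YUV420 */

    /* Map a DRM layer format onto the matching V4L2 pixel format.
     * Returns 0 for anything the deinterlacer cannot pass through, so the
     * caller can fail cleanly instead of mis-describing the frame. */
    static uint32_t v4l2_fmt_from_drm(uint32_t drm_format)
    {
        switch (drm_format) {
        case DRM_FORMAT_NV12:   return V4L2_PIX_FMT_NV12;
        case DRM_FORMAT_YUV420: return V4L2_PIX_FMT_YUV420;
        default:                return 0;   /* unsupported */
        }
    }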
---
libavfilter/vf_deinterlace_v4l2m2m.c | 160 ++++++++++++++++++---------
1 file changed, 107 insertions(+), 53 deletions(-)
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 1a933b7e0a..1a3bef5bcb 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -373,14 +373,16 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue)
fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
- if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 ||
+ if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 &&
+ fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) ||
fmt->fmt.pix_mp.field != field) {
av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
return AVERROR(EINVAL);
}
} else {
- if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 ||
+ if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 &&
+ fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) ||
fmt->fmt.pix.field != field) {
av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
@@ -391,7 +393,7 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue)
return 0;
}
-static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize)
+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
{
struct v4l2_format *fmt = &queue->format;
DeintV4L2M2MContextShared *ctx = queue->ctx;
@@ -402,13 +404,16 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width,
.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
};
+ // This works for most single object 4:2:0 types
if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.pixelformat = pixelformat;
fmt->fmt.pix_mp.field = field;
fmt->fmt.pix_mp.width = width;
fmt->fmt.pix_mp.height = ysize / pitch;
fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
} else {
+ fmt->fmt.pix.pixelformat = pixelformat;
fmt->fmt.pix.field = field;
fmt->fmt.pix.width = width;
fmt->fmt.pix.height = height;
@@ -417,12 +422,22 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width,
}
ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
- if (ret)
+ if (ret) {
+ ret = AVERROR(errno);
av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
+ return ret;
+ }
+
+ if (pixelformat != fmt->fmt.pix.pixelformat) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
+ return AVERROR(EINVAL);
+ }
ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
- if (ret)
- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret);
+ if (ret) {
+ ret = AVERROR(errno);
+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
+ }
sel.r.width = width;
sel.r.height = height;
@@ -432,10 +447,12 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width,
sel.flags = V4L2_SEL_FLAG_LE;
ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
- if (ret)
- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret);
+ if (ret) {
+ ret = AVERROR(errno);
+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
+ }
- return ret;
+ return 0;
}
static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
@@ -517,10 +534,25 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
return 0;
}
-static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
{
struct v4l2_exportbuffer expbuf;
int i, ret;
+ uint64_t mod = DRM_FORMAT_MOD_LINEAR;
+ uint32_t fmt = 0;
+
+ switch (pixelformat) {
+ case V4L2_PIX_FMT_NV12:
+ fmt = DRM_FORMAT_NV12;
+ break;
+ case V4L2_PIX_FMT_YUV420:
+ fmt = DRM_FORMAT_YUV420;
+ break;
+ default:
+ return AVERROR(EINVAL);
+ }
+
+ avbuf->drm_frame.layers[0].format = fmt;
for (i = 0; i < avbuf->num_planes; i++) {
memset(&expbuf, 0, sizeof(expbuf));
@@ -539,12 +571,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
/* drm frame */
avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
avbuf->drm_frame.objects[i].fd = expbuf.fd;
- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ avbuf->drm_frame.objects[i].format_modifier = mod;
} else {
/* drm frame */
avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
avbuf->drm_frame.objects[0].fd = expbuf.fd;
- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ avbuf->drm_frame.objects[0].format_modifier = mod;
}
}
@@ -629,7 +661,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
if (ret)
goto fail;
- ret = v4l2_buffer_export_drm(buf);
+ ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
if (ret)
goto fail;
}
@@ -878,7 +910,6 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused)
static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
{
- int av_pix_fmt = AV_PIX_FMT_YUV420P;
AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
AVDRMLayerDescriptor *layer;
@@ -895,20 +926,13 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
}
- switch (av_pix_fmt) {
- case AV_PIX_FMT_YUYV422:
-
- layer->format = DRM_FORMAT_YUYV;
+ switch (layer->format) {
+ case DRM_FORMAT_YUYV:
layer->nb_planes = 1;
-
break;
- case AV_PIX_FMT_NV12:
- case AV_PIX_FMT_NV21:
-
- layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ?
- DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
-
+ case DRM_FORMAT_NV12:
+ case DRM_FORMAT_NV21:
if (avbuf->num_planes > 1)
break;
@@ -920,10 +944,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
break;
- case AV_PIX_FMT_YUV420P:
-
- layer->format = DRM_FORMAT_YUV420;
-
+ case DRM_FORMAT_YUV420:
if (avbuf->num_planes > 1)
break;
@@ -1032,6 +1053,26 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
return 0;
}
+static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
+{
+ const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ||
+ drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID);
+
+ switch (drm_desc->layers[0].format) {
+ case DRM_FORMAT_YUV420:
+ if (is_linear)
+ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0;
+ break;
+ case DRM_FORMAT_NV12:
+ if (is_linear)
+ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
{
AVFilterContext *avctx = link->dst;
@@ -1047,23 +1088,27 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
if (ctx->field_order == V4L2_FIELD_ANY) {
- AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0];
+ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
+ const uint32_t pixelformat = desc_pixelformat(drm_desc);
+
+ if (pixelformat == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
+ av_fourcc2str(drm_desc->layers[0].format),
+ drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
+ return AVERROR(EINVAL);
+ }
+
ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
- if (in->top_field_first)
- ctx->field_order = V4L2_FIELD_INTERLACED_TB;
- else
- ctx->field_order = V4L2_FIELD_INTERLACED_BT;
-
- ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
+ ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
if (ret)
return ret;
- ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
+ ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
if (ret)
return ret;
@@ -1082,6 +1127,12 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
ret = deint_v4l2m2m_streamon(output);
if (ret)
return ret;
+
+ if (in->top_field_first)
+ ctx->field_order = V4L2_FIELD_INTERLACED_TB;
+ else
+ ctx->field_order = V4L2_FIELD_INTERLACED_BT;
+
}
ret = deint_v4l2m2m_enqueue_frame(output, in);
@@ -1157,28 +1208,31 @@ again:
return 0;
}
- {
+ recycle_q(&s->output);
+ n = count_enqueued(&s->output);
+
+ while (n < 6) {
AVFrame * frame;
int rv;
- recycle_q(&s->output);
- n = count_enqueued(&s->output);
+ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
+ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
+ return rv;
+ }
- while (n < 6) {
- if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
- av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
- return rv;
- }
+ if (frame == NULL) {
+ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
+ break;
+ }
- if (frame == NULL) {
- av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
- break;
- }
+ rv = deint_v4l2m2m_filter_frame(inlink, frame);
+ av_frame_free(&frame);
- deint_v4l2m2m_filter_frame(inlink, frame);
- av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
- ++n;
- }
+ if (rv != 0)
+ return rv;
+
+ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
+ ++n;
}
if (n < 6) {
From 3d07826bcf588ad0384d00b210415664aa4489fb Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 19 Aug 2022 15:29:11 +0000
Subject: [PATCH 063/161] v4l2_req: Enable use of MMAP for buffer alloc
Use MMAP rather than DMABUF if either the dmabuf device can't be opened
or create_buf doesn't set the capability.
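
Roughly, the memory-type decision reduces to the sketch below (illustrative
only; helper and enum names are invented, and V4L2_BUF_CAP_SUPPORTS_DMABUF
is assumed to come from a sufficiently new linux/videodev2.h):

    #include <stdbool.h>
    #include <stdint.h>
    #include <linux/videodev2.h>

    enum buf_memtype { BUF_MEM_MMAP, BUF_MEM_DMABUF };

    /* Prefer DMABUF only when the dmabuf allocator could be opened and the
     * queue advertises V4L2_BUF_CAP_SUPPORTS_DMABUF (capabilities reported
     * by VIDIOC_CREATE_BUFS / VIDIOC_REQBUFS); otherwise fall back to
     * driver-allocated MMAP buffers. */
    static enum buf_memtype pick_memtype(bool have_dmabuf_ctl, uint32_t queue_caps)
    {
        if (have_dmabuf_ctl && (queue_caps & V4L2_BUF_CAP_SUPPORTS_DMABUF))
            return BUF_MEM_DMABUF;
        return BUF_MEM_MMAP;
    }

A capabilities word of 0 (older kernels that don't report capabilities)
also lands on MMAP, which matches the "assume not" behaviour in the code.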
---
libavcodec/v4l2_req_dmabufs.c | 22 +++
libavcodec/v4l2_req_dmabufs.h | 3 +
libavcodec/v4l2_req_media.c | 263 ++++++++++++++++++++++++++++-----
libavcodec/v4l2_req_media.h | 21 ++-
libavcodec/v4l2_request_hevc.c | 42 +++++-
5 files changed, 307 insertions(+), 44 deletions(-)
diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
index ae6c648369..c4bbed18c6 100644
--- a/libavcodec/v4l2_req_dmabufs.c
+++ b/libavcodec/v4l2_req_dmabufs.c
@@ -36,6 +36,26 @@ static unsigned int total_bufs = 0;
static size_t total_size = 0;
#endif
+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size)
+{
+ struct dmabuf_h *dh;
+
+ if (mapptr == MAP_FAILED)
+ return NULL;
+
+ dh = malloc(sizeof(*dh));
+ if (!dh)
+ return NULL;
+
+ *dh = (struct dmabuf_h) {
+ .fd = -1,
+ .size = size,
+ .mapptr = mapptr
+ };
+
+ return dh;
+}
+
struct dmabuf_h * dmabuf_import(int fd, size_t size)
{
struct dmabuf_h *dh;
@@ -122,6 +142,8 @@ int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
struct dma_buf_sync sync = {
.flags = flags
};
+ if (dh->fd == -1)
+ return 0;
while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
const int err = errno;
if (errno == EINTR)
diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
index cfb17e801d..c1d3d8c8d7 100644
--- a/libavcodec/v4l2_req_dmabufs.h
+++ b/libavcodec/v4l2_req_dmabufs.h
@@ -18,6 +18,9 @@ static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t s
}
/* Create from existing fd - dups(fd) */
struct dmabuf_h * dmabuf_import(int fd, size_t size);
+/* Import an MMAP - return NULL if mapptr == MAP_FAILED */
+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size);
+
void * dmabuf_map(struct dmabuf_h * const dh);
/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */
diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
index 980b306b8a..910ac77bb6 100644
--- a/libavcodec/v4l2_req_media.c
+++ b/libavcodec/v4l2_req_media.c
@@ -33,9 +33,11 @@
#include <string.h>
#include <unistd.h>
#include <linux/media.h>
+#include <linux/mman.h>
#include <sys/ioctl.h>
#include <sys/select.h>
#include <sys/ioctl.h>
+#include <sys/mman.h>
#include <linux/videodev2.h>
@@ -95,6 +97,32 @@ struct media_request {
struct polltask * pt;
};
+static inline enum v4l2_memory
+mediabufs_memory_to_v4l2(const enum mediabufs_memory m)
+{
+ return (enum v4l2_memory)m;
+}
+
+const char *
+mediabufs_memory_name(const enum mediabufs_memory m)
+{
+ switch (m) {
+ case MEDIABUFS_MEMORY_UNSET:
+ return "Unset";
+ case MEDIABUFS_MEMORY_MMAP:
+ return "MMap";
+ case MEDIABUFS_MEMORY_USERPTR:
+ return "UserPtr";
+ case MEDIABUFS_MEMORY_OVERLAY:
+ return "Overlay";
+ case MEDIABUFS_MEMORY_DMABUF:
+ return "DMABuf";
+ default:
+ break;
+ }
+ return "Unknown";
+}
+
static inline int do_trywait(sem_t *const sem)
{
@@ -115,14 +143,14 @@ static inline int do_wait(sem_t *const sem)
}
static int request_buffers(int video_fd, unsigned int type,
- enum v4l2_memory memory, unsigned int buffers_count)
+ enum mediabufs_memory memory, unsigned int buffers_count)
{
struct v4l2_requestbuffers buffers;
int rc;
memset(&buffers, 0, sizeof(buffers));
buffers.type = type;
- buffers.memory = memory;
+ buffers.memory = mediabufs_memory_to_v4l2(memory);
buffers.count = buffers_count;
rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
@@ -324,6 +352,7 @@ struct qent_base {
struct qent_base *next;
struct qent_base *prev;
enum qent_status status;
+ enum mediabufs_memory memtype;
uint32_t index;
struct dmabuf_h *dh[VIDEO_MAX_PLANES];
struct timeval timestamp;
@@ -348,9 +377,9 @@ struct qe_list_head {
};
struct buf_pool {
+ enum mediabufs_memory memtype;
pthread_mutex_t lock;
sem_t free_sem;
- enum v4l2_buf_type buf_type;
struct qe_list_head free;
struct qe_list_head inuse;
};
@@ -367,9 +396,10 @@ static inline struct qent_src *base_to_src(struct qent_base *be)
}
-#define QENT_BASE_INITIALIZER {\
+#define QENT_BASE_INITIALIZER(mtype) {\
.ref_count = ATOMIC_VAR_INIT(0),\
.status = QENT_NEW,\
+ .memtype = (mtype),\
.index = INDEX_UNSET\
}
@@ -390,13 +420,13 @@ static void qe_src_free(struct qent_src *const be_src)
free(be_src);
}
-static struct qent_src * qe_src_new(void)
+static struct qent_src * qe_src_new(enum mediabufs_memory mtype)
{
struct qent_src *const be_src = malloc(sizeof(*be_src));
if (!be_src)
return NULL;
*be_src = (struct qent_src){
- .base = QENT_BASE_INITIALIZER
+ .base = QENT_BASE_INITIALIZER(mtype)
};
return be_src;
}
@@ -413,13 +443,13 @@ static void qe_dst_free(struct qent_dst *const be_dst)
free(be_dst);
}
-static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype)
{
struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
if (!be_dst)
return NULL;
*be_dst = (struct qent_dst){
- .base = QENT_BASE_INITIALIZER,
+ .base = QENT_BASE_INITIALIZER(memtype),
.lock = PTHREAD_MUTEX_INITIALIZER,
.cond = PTHREAD_COND_INITIALIZER,
.mbc_wl = ff_weak_link_ref(wl)
@@ -553,14 +583,14 @@ static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
return buf;
}
-static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
+static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index)
{
struct qent_base *be;
pthread_mutex_lock(&bp->lock);
/* Expect 1st in Q, but allow anywhere */
for (be = bp->inuse.head; be; be = be->next) {
- if (dmabuf_fd(be->dh[0]) == fd) {
+ if (be->index == index) {
bq_extract_inuse(bp, be);
break;
}
@@ -602,6 +632,8 @@ struct mediabufs_ctl {
struct pollqueue * pq;
struct ff_weak_link_master * this_wlm;
+ enum mediabufs_memory src_memtype;
+ enum mediabufs_memory dst_memtype;
struct v4l2_format src_fmt;
struct v4l2_format dst_fmt;
struct v4l2_capability capability;
@@ -614,7 +646,7 @@ static int qe_v4l2_queue(struct qent_base *const be,
{
struct v4l2_buffer buffer = {
.type = fmt->type,
- .memory = V4L2_MEMORY_DMABUF,
+ .memory = mediabufs_memory_to_v4l2(be->memtype),
.index = be->index
};
struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
@@ -628,7 +660,10 @@ static int qe_v4l2_queue(struct qent_base *const be,
/* *** Really need a pixdesc rather than a format so we can fill in data_offset */
planes[i].length = dmabuf_size(be->dh[i]);
planes[i].bytesused = dmabuf_len(be->dh[i]);
- planes[i].m.fd = dmabuf_fd(be->dh[i]);
+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
+ planes[i].m.fd = dmabuf_fd(be->dh[i]);
+ else
+ planes[i].m.mem_offset = 0;
}
buffer.m.planes = planes;
buffer.length = i;
@@ -639,7 +674,10 @@ static int qe_v4l2_queue(struct qent_base *const be,
buffer.bytesused = dmabuf_len(be->dh[0]);
buffer.length = dmabuf_size(be->dh[0]);
- buffer.m.fd = dmabuf_fd(be->dh[0]);
+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
+ buffer.m.fd = dmabuf_fd(be->dh[0]);
+ else
+ buffer.m.offset = 0;
}
if (!is_dst && mreq) {
@@ -668,14 +706,13 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp,
const int vfd,
const struct v4l2_format * const f)
{
- int fd;
struct qent_base *be;
int rc;
const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
struct v4l2_buffer buffer = {
.type = f->type,
- .memory = V4L2_MEMORY_DMABUF
+ .memory = mediabufs_memory_to_v4l2(bp->memtype)
};
if (mp) {
buffer.length = f->fmt.pix_mp.num_planes;
@@ -690,10 +727,9 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp,
return NULL;
}
- fd = mp ? planes[0].m.fd : buffer.m.fd;
- be = queue_find_extract_fd(bp, fd);
+ be = queue_find_extract_index(bp, buffer.index);
if (!be) {
- request_log("Failed to find fd %d in Q\n", fd);
+ request_log("Failed to find index %d in Q\n", buffer.index);
return NULL;
}
@@ -1104,7 +1140,7 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru
struct v4l2_create_buffers cbuf = {
.count = n,
- .memory = V4L2_MEMORY_DMABUF,
+ .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype),
.format = mbc->dst_fmt,
};
@@ -1125,12 +1161,97 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru
return cbuf.count;
}
+static MediaBufsStatus
+qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt,
+ const unsigned int n, const bool x_dmabuf)
+{
+ struct v4l2_buffer buf = {
+ .index = n,
+ .type = fmt->type,
+ };
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ int ret;
+
+ if (be->dh[0])
+ return 0;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ memset(planes, 0, sizeof(planes));
+ buf.m.planes = planes;
+ buf.length = VIDEO_MAX_PLANES;
+ }
+
+ if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) {
+ request_err(mbc->dc, "VIDIOC_QUERYBUF failed");
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
+ {
+ unsigned int i;
+ for (i = 0; i != buf.length; ++i) {
+ if (x_dmabuf) {
+ struct v4l2_exportbuffer xbuf = {
+ .type = buf.type,
+ .index = buf.index,
+ .plane = i,
+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
+ };
+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
+ be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);
+ }
+ else {
+ be->dh[i] = dmabuf_import_mmap(
+ mmap(NULL, planes[i].length,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ mbc->vfd, planes[i].m.mem_offset),
+ planes[i].length);
+ }
+ /* On failure tidy up and die */
+ if (!be->dh[i]) {
+ while (i--) {
+ dmabuf_free(be->dh[i]);
+ be->dh[i] = NULL;
+ }
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ }
+ }
+ else
+ {
+ if (x_dmabuf) {
+ struct v4l2_exportbuffer xbuf = {
+ .type = buf.type,
+ .index = buf.index,
+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
+ };
+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
+ be->dh[0] = dmabuf_import(xbuf.fd, buf.length);
+ }
+ else {
+ be->dh[0] = dmabuf_import_mmap(
+ mmap(NULL, buf.length,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ mbc->vfd, buf.m.offset),
+ buf.length);
+ }
+ /* On failure tidy up and die */
+ if (!be->dh[0]) {
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+ }
+ }
+
+ return 0;
+}
+
struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
{
struct qent_dst * be_dst;
if (mbc == NULL) {
- be_dst = qe_dst_new(NULL);
+ be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF);
if (be_dst)
be_dst->base.status = QENT_IMPORT;
return be_dst;
@@ -1144,7 +1265,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc
else {
be_dst = base_to_dst(queue_tryget_free(mbc->dst));
if (!be_dst) {
- be_dst = qe_dst_new(mbc->this_wlm);
+ be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype);
if (!be_dst)
return NULL;
@@ -1155,12 +1276,21 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc
}
}
- if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
- /* Given how create buf works we can't uncreate it on alloc failure
- * all we can do is put it on the free Q
- */
- queue_put_free(mbc->dst, &be_dst->base);
- return NULL;
+ if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) {
+ if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) {
+ request_err(mbc->dc, "Failed to export as dmabuf\n");
+ queue_put_free(mbc->dst, &be_dst->base);
+ return NULL;
+ }
+ }
+ else {
+ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
+ /* Given how create buf works we can't uncreate it on alloc failure
+ * all we can do is put it on the free Q
+ */
+ queue_put_free(mbc->dst, &be_dst->base);
+ return NULL;
+ }
}
be_dst->base.status = QENT_PENDING;
@@ -1208,7 +1338,7 @@ MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
// ** This is a mess if we get partial alloc but without any way to remove
// individual V4L2 Q members we are somewhat stuffed
-MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype)
{
unsigned int i;
int a = 0;
@@ -1218,10 +1348,12 @@ MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, cons
if (n > 32)
return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+ mbc->dst->memtype = memtype;
+
// Create qents first as it is hard to get rid of the V4L2 buffers on error
for (qc = 0; qc != n; ++qc)
{
- if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
+ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL)
goto fail;
}
@@ -1260,19 +1392,61 @@ void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src *
queue_put_free(mbc->src, &qe_src->base);
}
+static MediaBufsStatus
+chk_memory_type(struct mediabufs_ctl *const mbc,
+ const struct v4l2_format * const f,
+ const enum mediabufs_memory m)
+{
+ struct v4l2_create_buffers cbuf = {
+ .count = 0,
+ .memory = V4L2_MEMORY_MMAP,
+ .format = *f
+ };
+
+ if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0)
+ return MEDIABUFS_ERROR_OPERATION_FAILED;
+
+ switch (m) {
+ case MEDIABUFS_MEMORY_DMABUF:
+ // 0 = Unknown but assume not in that case
+ if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0)
+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
+ break;
+ case MEDIABUFS_MEMORY_MMAP:
+ break;
+ default:
+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
+ }
+
+ return MEDIABUFS_STATUS_SUCCESS;
+}
+
+MediaBufsStatus
+mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
+{
+ return chk_memory_type(mbc, &mbc->src_fmt, memtype);
+}
+
+MediaBufsStatus
+mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
+{
+ return chk_memory_type(mbc, &mbc->dst_fmt, memtype);
+}
+
/* src format must have been set up before this */
MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
struct dmabufs_ctl * const dbsc,
- unsigned int n)
+ unsigned int n, const enum mediabufs_memory memtype)
{
unsigned int i;
struct v4l2_requestbuffers req = {
.count = n,
.type = mbc->src_fmt.type,
- .memory = V4L2_MEMORY_DMABUF
+ .memory = mediabufs_memory_to_v4l2(memtype)
};
bq_free_all_free_src(mbc->src);
+
while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
if (errno != EINTR) {
request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
@@ -1286,21 +1460,36 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
}
for (i = 0; i != n; ++i) {
- struct qent_src *const be_src = qe_src_new();
+ struct qent_src *const be_src = qe_src_new(memtype);
if (!be_src) {
request_err(mbc->dc, "Failed to create src be %d\n", i);
goto fail;
}
- if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
- qe_src_free(be_src);
+ switch (memtype) {
+ case MEDIABUFS_MEMORY_MMAP:
+ if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) {
+ qe_src_free(be_src);
+ goto fail;
+ }
+ be_src->fixed_size = 1;
+ break;
+ case MEDIABUFS_MEMORY_DMABUF:
+ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
+ qe_src_free(be_src);
+ goto fail;
+ }
+ be_src->fixed_size = !mediabufs_src_resizable(mbc);
+ break;
+ default:
+ request_err(mbc->dc, "Unexpected memory type\n");
goto fail;
}
be_src->base.index = i;
- be_src->fixed_size = !mediabufs_src_resizable(mbc);
queue_put_free(mbc->src, &be_src->base);
}
+ mbc->src->memtype = memtype;
return MEDIABUFS_STATUS_SUCCESS;
fail:
@@ -1437,9 +1626,13 @@ int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_
int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
{
+#if 1
+ return 0;
+#else
// Single planar OUTPUT can only take exact size buffers
// Multiplanar will take larger than negotiated
return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
+#endif
}
static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
index 0307a831de..890947b2e2 100644
--- a/libavcodec/v4l2_req_media.h
+++ b/libavcodec/v4l2_req_media.h
@@ -43,6 +43,7 @@ typedef enum media_buf_status {
MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
MEDIABUFS_ERROR_ALLOCATION_FAILED,
+ MEDIABUFS_ERROR_UNSUPPORTED_MEMORY,
} MediaBufsStatus;
struct media_pool * media_pool_new(const char * const media_path,
@@ -70,6 +71,15 @@ struct qent_dst;
struct dmabuf_h;
struct dmabufs_ctl;
+// 1-1 mammping to V4L2 type - just defined separetely to avoid some include versioning difficulties
+enum mediabufs_memory {
+ MEDIABUFS_MEMORY_UNSET = 0,
+ MEDIABUFS_MEMORY_MMAP = 1,
+ MEDIABUFS_MEMORY_USERPTR = 2,
+ MEDIABUFS_MEMORY_OVERLAY = 3,
+ MEDIABUFS_MEMORY_DMABUF = 4,
+};
+
int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
@@ -93,6 +103,8 @@ MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
unsigned int plane,
int fd, size_t size);
+const char * mediabufs_memory_name(const enum mediabufs_memory m);
+
MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
struct media_request **const pmreq,
struct qent_src **const psrc_be,
@@ -106,7 +118,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
// Create dst slots without alloc
// If fixed true then qent_alloc will only get slots from this pool and will
// block until a qent has been unrefed
-MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype);
MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
@@ -140,7 +152,12 @@ MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
struct dmabufs_ctl * const dbsc,
- unsigned int n);
+ unsigned int n,
+ const enum mediabufs_memory memtype);
+
+// Want to have appropriate formats set first
+MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
+MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index cd79aad563..5cf17dd5e3 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -144,6 +144,8 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
const struct decdev * decdev;
const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
size_t src_size;
+ enum mediabufs_memory src_memtype;
+ enum mediabufs_memory dst_memtype;
av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
@@ -174,8 +176,14 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
decdev_media_path(decdev), decdev_video_path(decdev));
if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
- av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
- goto fail0;
+ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
+ src_memtype = MEDIABUFS_MEMORY_MMAP;
+ dst_memtype = MEDIABUFS_MEMORY_MMAP;
+ }
+ else {
+ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
+ src_memtype = MEDIABUFS_MEMORY_DMABUF;
+ dst_memtype = MEDIABUFS_MEMORY_DMABUF;
}
if ((ctx->pq = pollqueue_new()) == NULL) {
@@ -196,8 +204,9 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
// Ask for an initial bitbuf size of max size / 4
// We will realloc if we need more
// Must use sps->h/w as avctx contains cropped size
+retry_src_memtype:
src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
- if (mediabufs_src_resizable(ctx->mbufs))
+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs))
src_size /= 4;
// Kludge for conformance tests which break Annex A limits
else if (src_size < 0x40000)
@@ -210,6 +219,15 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
goto fail4;
}
+ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) {
+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) {
+ src_memtype = MEDIABUFS_MEMORY_MMAP;
+ goto retry_src_memtype;
+ }
+ av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n");
+ goto fail4;
+ }
+
if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
ctx->fns = &V2(ff_v4l2_req_hevc, 4);
@@ -238,7 +256,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
goto fail4;
}
- if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
+ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) {
av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
goto fail4;
}
@@ -250,8 +268,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
avctx->thread_count, avctx->extra_hw_frames);
+ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) {
+ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n");
+ goto fail4;
+ }
+ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n");
+ dst_memtype = MEDIABUFS_MEMORY_MMAP;
+ }
+
// extra_hw_frames is -1 if unset
- if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
+ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) {
av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
goto fail4;
}
@@ -277,9 +304,10 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
// Set our s/w format
avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
- av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n",
ctx->fns->name,
- decdev_media_path(decdev), decdev_video_path(decdev));
+ decdev_media_path(decdev), decdev_video_path(decdev),
+ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype));
return 0;
From 79c2fcac56586ce9eea0cc8c6b13d2cd54f3e468 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 22 Aug 2022 12:35:40 +0000
Subject: [PATCH 064/161] Set buffer lengths on DQ
---
libavcodec/v4l2_req_media.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
index 910ac77bb6..1a9944774a 100644
--- a/libavcodec/v4l2_req_media.c
+++ b/libavcodec/v4l2_req_media.c
@@ -733,6 +733,14 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp,
return NULL;
}
+ if (mp) {
+ unsigned int i;
+ for (i = 0; i != buffer.length; ++i)
+ dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0);
+ }
+ else
+ dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0);
+
be->timestamp = buffer.timestamp;
be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
return be;
From 8f3245ca1e4b2ec7e13fc2f3bffbc964ee8fc290 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 22 Aug 2022 17:11:24 +0000
Subject: [PATCH 065/161] Fix compile if videodev2.h defines V4L2 HEVC request
API
If videodev2.h does define the HEVC request API it is really hard to set
old variations of the controls, so in that case we only compile against
the system includes and drop the backward-compatibility code.
---
configure | 9 +++++++++
libavcodec/Makefile | 4 ++--
libavcodec/hevc-ctrls-v4.h | 2 ++
libavcodec/v4l2_req_hevc_vx.c | 5 -----
libavcodec/v4l2_request_hevc.c | 6 ++++--
5 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/configure b/configure
index fdc95146bf..5c00a183e3 100755
--- a/configure
+++ b/configure
@@ -1946,6 +1946,7 @@ FEATURE_LIST="
swscale_alpha
vout_drm
vout_egl
+ v4l2_req_hevc_vx
"
# this list should be kept in linking order
@@ -6912,6 +6913,14 @@ fi
check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
+disable v4l2_req_hevc_vx
+if enabled hevc_v4l2request_hwaccel; then
+ enable v4l2_req_hevc_vx
+fi
+if enabled hevc_v4l2_request; then
+ disable v4l2_req_hevc_vx
+fi
+
check_headers sys/videoio.h
test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index d433a71236..11f183c9b9 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -999,8 +999,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o
-OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\
- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o
+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o
+OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o
OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
index 7e05f6e7c3..7829d82084 100644
--- a/libavcodec/hevc-ctrls-v4.h
+++ b/libavcodec/hevc-ctrls-v4.h
@@ -53,6 +53,8 @@
#include <linux/const.h>
#include <linux/types.h>
+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
+
#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401)
#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402)
diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
index 5d083016f8..e1bd5c6a1f 100644
--- a/libavcodec/v4l2_req_hevc_vx.c
+++ b/libavcodec/v4l2_req_hevc_vx.c
@@ -40,11 +40,6 @@
#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
#endif
-// Should be in videodev2 but we might not have a good enough one
-#ifndef V4L2_PIX_FMT_HEVC_SLICE
-#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-#endif
-
#include "v4l2_request_hevc.h"
#include "libavutil/hwcontext_drm.h"
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index 5cf17dd5e3..614a1b4d99 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -17,7 +17,7 @@
*/
-
+#include "config.h"
#include "decode.h"
#include "hevcdec.h"
#include "hwconfig.h"
@@ -142,7 +142,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
const HEVCSPS * const sps = h->ps.sps;
int ret;
const struct decdev * decdev;
- const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
+ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
size_t src_size;
enum mediabufs_memory src_memtype;
enum mediabufs_memory dst_memtype;
@@ -232,6 +232,7 @@ retry_src_memtype:
av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
ctx->fns = &V2(ff_v4l2_req_hevc, 4);
}
+#if CONFIG_V4L2_REQ_HEVC_VX
else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
ctx->fns = &V2(ff_v4l2_req_hevc, 3);
@@ -244,6 +245,7 @@ retry_src_memtype:
av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
ctx->fns = &V2(ff_v4l2_req_hevc, 1);
}
+#endif
else {
av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
ret = AVERROR(EINVAL);
From 35ec6af32c4f05b076f84ab343a8fc0d3263ba44 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 12 Sep 2022 17:59:22 +0100
Subject: [PATCH 066/161] v4l2_m2m_enc: Send headers in pkt side_data
If GLOBAL_HEADERS are requested then we can't provide them at init time,
so send them as NEW_EXTRADATA side data in a similar way to some AV1
encoders.
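
A minimal sketch of the mechanism (the helper names and buffers here are
invented for illustration; av_packet_new_side_data(),
av_packet_get_side_data() and AV_PKT_DATA_NEW_EXTRADATA are the standard
libavcodec API being used):

    #include <errno.h>
    #include <string.h>
    #include <libavcodec/packet.h>
    #include <libavutil/error.h>

    // Producer side: attach the just-built headers to the first packet.
    static int attach_new_extradata(AVPacket *pkt,
                                    const uint8_t *hdrs, size_t hdrs_size)
    {
        uint8_t *side = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
                                                hdrs_size);
        if (!side)
            return AVERROR(ENOMEM);
        memcpy(side, hdrs, hdrs_size);
        return 0;
    }

    // Consumer side: a muxer (or other downstream user) reads them back.
    static const uint8_t *read_new_extradata(const AVPacket *pkt, size_t *size)
    {
        return av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, size);
    }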
---
libavcodec/v4l2_m2m_enc.c | 33 +++++++++++++++++++++++----------
1 file changed, 23 insertions(+), 10 deletions(-)
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index 05ff6ba726..099ad23928 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -544,14 +544,12 @@ dequeue:
av_freep(&avctx->extradata);
avctx->extradata_size = 0;
- if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
- memcpy(data, avpkt->data, len);
+ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
+ goto fail_no_mem;
+ memcpy(data, avpkt->data, len);
av_packet_unref(avpkt);
- if (data == NULL)
- return AVERROR(ENOMEM);
-
// We need to copy the header, but keep local if not global
if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
avctx->extradata = data;
@@ -567,18 +565,28 @@ dequeue:
}
// First frame must be key so mark as such even if encoder forgot
- if (capture->first_buf == 2)
+ if (capture->first_buf == 2) {
avpkt->flags |= AV_PKT_FLAG_KEY;
+ // Add any extradata to the 1st packet we emit as we cannot create it at init
+ if (avctx->extradata_size > 0 && avctx->extradata) {
+ void * const side = av_packet_new_side_data(avpkt,
+ AV_PKT_DATA_NEW_EXTRADATA,
+ avctx->extradata_size);
+ if (!side)
+ goto fail_no_mem;
+
+ memcpy(side, avctx->extradata, avctx->extradata_size);
+ }
+ }
+
// Add SPS/PPS to the start of every key frame if non-global headers
if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
const size_t newlen = s->extdata_size + avpkt->size;
AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
- if (buf == NULL) {
- av_packet_unref(avpkt);
- return AVERROR(ENOMEM);
- }
+ if (buf == NULL)
+ goto fail_no_mem;
memcpy(buf->data, s->extdata_data, s->extdata_size);
memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
@@ -592,6 +600,11 @@ dequeue:
// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
capture->first_buf = 0;
return 0;
+
+fail_no_mem:
+ ret = AVERROR(ENOMEM);
+ av_packet_unref(avpkt);
+ return ret;
}
static av_cold int v4l2_encode_init(AVCodecContext *avctx)
From dfc754491cea9192945b92ca9c8d3919321e30ad Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 14 Sep 2022 15:44:10 +0000
Subject: [PATCH 067/161] matroskaenc: Allow H264 SPS/PPS headers in packet
sidedata
---
libavformat/matroskaenc.c | 26 ++++++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
index 113541bd9a..61e4c976ef 100644
--- a/libavformat/matroskaenc.c
+++ b/libavformat/matroskaenc.c
@@ -77,6 +77,10 @@
#define IS_WEBM(mkv) (CONFIG_WEBM_MUXER && CONFIG_MATROSKA_MUXER ? \
((mkv)->mode == MODE_WEBM) : CONFIG_WEBM_MUXER)
+
+/* Reserved size for H264 headers if not extant at init time */
+#define MAX_H264_HEADER_SIZE 1024
+
#define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \
!(mkv)->is_live)
@@ -1121,8 +1125,12 @@ static int mkv_assemble_native_codecprivate(AVFormatContext *s, AVIOContext *dyn
case AV_CODEC_ID_WAVPACK:
return put_wv_codecpriv(dyn_cp, extradata, extradata_size);
case AV_CODEC_ID_H264:
- return ff_isom_write_avcc(dyn_cp, extradata,
- extradata_size);
+ if (par->extradata_size)
+ return ff_isom_write_avcc(dyn_cp, extradata,
+ extradata_size);
+ else
+ *size_to_reserve = MAX_H264_HEADER_SIZE;
+ break;
case AV_CODEC_ID_HEVC:
return ff_isom_write_hvcc(dyn_cp, extradata,
extradata_size, 0);
@@ -2731,8 +2739,8 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
}
break;
#endif
- // FIXME: Remove the following once libaom starts propagating proper extradata during init()
- // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2208
+ // FIXME: Remove the following once libaom starts propagating extradata during init()
+ // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012
case AV_CODEC_ID_AV1:
if (side_data_size && mkv->track.bc && !par->extradata_size) {
// If the reserved space doesn't suffice, only write
@@ -2744,6 +2752,16 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
} else if (!par->extradata_size)
return AVERROR_INVALIDDATA;
break;
+ // H264 V4L2 has a similar issue
+ case AV_CODEC_ID_H264:
+ if (side_data_size && mkv->track.bc && !par->extradata_size) {
+ ret = mkv_update_codecprivate(s, mkv, side_data, side_data_size,
+ par, mkv->track.bc, track, 0);
+ if (ret < 0)
+ return ret;
+ } else if (!par->extradata_size)
+ return AVERROR_INVALIDDATA;
+ break;
default:
if (side_data_size)
av_log(s, AV_LOG_DEBUG, "Ignoring new extradata in a packet for stream %d.\n", pkt->stream_index);
From 30c6ca4e24ae2acbd7f7f122f5275beb62b625c6 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 14 Sep 2022 15:55:15 +0000
Subject: [PATCH 068/161] movenc: Allow H264 SPS/PPS headers in packet sidedata
---
libavformat/movenc.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index c4fcb5f8b1..891adbf7b2 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -6343,6 +6343,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
if (trk->par->codec_id == AV_CODEC_ID_MP4ALS ||
trk->par->codec_id == AV_CODEC_ID_AAC ||
trk->par->codec_id == AV_CODEC_ID_AV1 ||
+ trk->par->codec_id == AV_CODEC_ID_H264 ||
trk->par->codec_id == AV_CODEC_ID_FLAC) {
size_t side_size;
uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
From 1c7c3e99e9ed90f241aecbe7b2269229587d1e03 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 26 Sep 2022 12:45:05 +0100
Subject: [PATCH 069/161] Allow ffmpeg to select codec internal hwfmts if
no_cvt_hw
This allows the selection of DRM_PRIME from v4l2m2m without forcing it
in the decoder.
Not utterly sure this is the right method for 5.1 but it does work
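
For reference, the hw-config enumeration this touches is the standard
avcodec_get_hw_config() API; a minimal sketch of the idea (not the actual
fftools logic) that checks whether a codec exposes a pix_fmt through a
codec-internal hwaccel:

    #include <libavcodec/avcodec.h>

    // Returns 1 if 'codec' can output 'pix_fmt' via an internal
    // (codec-private) hw method, e.g. DRM_PRIME out of v4l2m2m.
    static int has_internal_hw_pix_fmt(const AVCodec *codec,
                                       enum AVPixelFormat pix_fmt)
    {
        for (int i = 0;; i++) {
            const AVCodecHWConfig *cfg = avcodec_get_hw_config(codec, i);
            if (!cfg)
                return 0;
            if ((cfg->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL) &&
                cfg->pix_fmt == pix_fmt)
                return 1;
        }
    }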
---
fftools/ffmpeg.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index ba0c1898cf..839da7b472 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -2763,12 +2763,15 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat
break;
if (ist->hwaccel_id == HWACCEL_GENERIC ||
- ist->hwaccel_id == HWACCEL_AUTO) {
+ ist->hwaccel_id == HWACCEL_AUTO ||
+ no_cvt_hw) {
for (i = 0;; i++) {
config = avcodec_get_hw_config(s->codec, i);
if (!config)
break;
- if (!(config->methods &
+ if (no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL))
+ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so trying pix_fmt %d with codec internal hwaccel\n", *p);
+ else if (!(config->methods &
AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX))
continue;
if (config->pix_fmt == *p)
From ecf273fd02e8aafe8775b1f291b9664b1b49572e Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 1 Sep 2022 11:42:41 +0000
Subject: [PATCH 070/161] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler
The logic for running an ISP-based scaler is pretty much identical to
that for the deinterlacer, so the scaler is added to the deinterlacer
filter. This requires some rework of the setup code to avoid assumptions
that hold for deinterlacing but not for scaling, but the reworked code
needs only a few switches based on the operation.
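
A quick runtime check of the result (a sketch: it assumes the filter
registers under the name "scale_v4l2m2m" following the usual
ff_vf_<name> convention; avfilter_get_by_name() is standard libavfilter
API):

    #include <stdio.h>
    #include <libavfilter/avfilter.h>

    int main(void)
    {
        const AVFilter *f = avfilter_get_by_name("scale_v4l2m2m");
        if (!f) {
            fprintf(stderr, "scale_v4l2m2m not present in this build\n");
            return 1;
        }
        printf("%s: %s\n", f->name, f->description);
        return 0;
    }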
---
libavfilter/allfilters.c | 1 +
libavfilter/vf_deinterlace_v4l2m2m.c | 1123 ++++++++++++++++++++------
2 files changed, 877 insertions(+), 247 deletions(-)
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 357ff61ca8..d504fa1bc8 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -421,6 +421,7 @@ extern const AVFilter ff_vf_scale;
extern const AVFilter ff_vf_scale_cuda;
extern const AVFilter ff_vf_scale_npp;
extern const AVFilter ff_vf_scale_qsv;
+extern const AVFilter ff_vf_scale_v4l2m2m;
extern const AVFilter ff_vf_scale_vaapi;
extern const AVFilter ff_vf_scale_vulkan;
extern const AVFilter ff_vf_scale2ref;
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 1a3bef5bcb..2df39ec0f1 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -52,31 +52,36 @@
#include "avfilter.h"
#include "formats.h"
#include "internal.h"
+#include "scale_eval.h"
#include "video.h"
+#ifndef DRM_FORMAT_P030
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
+#endif
+
typedef struct V4L2Queue V4L2Queue;
typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
-typedef struct V4L2PlaneInfo {
- int bytesperline;
- size_t length;
-} V4L2PlaneInfo;
+typedef enum filter_type_v4l2_e
+{
+ FILTER_V4L2_DEINTERLACE = 1,
+ FILTER_V4L2_SCALE,
+} filter_type_v4l2_t;
typedef struct V4L2Buffer {
int enqueued;
int reenqueue;
- int fd;
struct v4l2_buffer buffer;
AVFrame frame;
struct v4l2_plane planes[VIDEO_MAX_PLANES];
int num_planes;
- V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
AVDRMFrameDescriptor drm_frame;
V4L2Queue *q;
} V4L2Buffer;
typedef struct V4L2Queue {
struct v4l2_format format;
+ struct v4l2_selection sel;
int num_buffers;
V4L2Buffer *buffers;
DeintV4L2M2MContextShared *ctx;
@@ -111,11 +116,18 @@ typedef struct pts_track_s
typedef struct DeintV4L2M2MContextShared {
void * logctx; // For logging - will be NULL when done
+ filter_type_v4l2_t filter_type;
int fd;
int done;
int width;
int height;
+
+ // from options
+ int output_width;
+ int output_height;
+ enum AVPixelFormat output_format;
+
int orig_width;
int orig_height;
atomic_uint refcount;
@@ -134,8 +146,60 @@ typedef struct DeintV4L2M2MContext {
const AVClass *class;
DeintV4L2M2MContextShared *shared;
+
+ char * w_expr;
+ char * h_expr;
+ char * output_format_string;
+
+ int force_original_aspect_ratio;
+ int force_divisible_by;
+
+ char *colour_primaries_string;
+ char *colour_transfer_string;
+ char *colour_matrix_string;
+ int colour_range;
+ char *chroma_location_string;
+
+ enum AVColorPrimaries colour_primaries;
+ enum AVColorTransferCharacteristic colour_transfer;
+ enum AVColorSpace colour_matrix;
+ enum AVChromaLocation chroma_location;
} DeintV4L2M2MContext;
+// These just list the ones we know we can cope with
+static uint32_t
+fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
+{
+ switch (avfmt) {
+ case AV_PIX_FMT_YUV420P:
+ return V4L2_PIX_FMT_YUV420;
+ case AV_PIX_FMT_NV12:
+ return V4L2_PIX_FMT_NV12;
+ case AV_PIX_FMT_RPI4_8:
+ case AV_PIX_FMT_SAND128:
+ return V4L2_PIX_FMT_NV12_COL128;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static enum AVPixelFormat
+fmt_v4l2_to_av(const uint32_t pixfmt)
+{
+ switch (pixfmt) {
+ case V4L2_PIX_FMT_YUV420:
+ return AV_PIX_FMT_YUV420P;
+ case V4L2_PIX_FMT_NV12:
+ return AV_PIX_FMT_NV12;
+ case V4L2_PIX_FMT_NV12_COL128:
+ return AV_PIX_FMT_RPI4_8;
+ default:
+ break;
+ }
+ return AV_PIX_FMT_NONE;
+}
+
static unsigned int pts_stats_interval(const pts_stats_t * const stats)
{
return stats->last_interval;
@@ -301,6 +365,39 @@ static int pts_track_init(pts_track_t * const trk, void *logctx)
return 0;
}
+static inline uint32_t
+fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline;
+}
+
+static inline uint32_t
+fmt_height(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
+}
+
+static inline uint32_t
+fmt_width(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
+}
+
+static inline uint32_t
+fmt_pixelformat(const struct v4l2_format * const fmt)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
+}
+
+static void
+init_format(V4L2Queue * const q, const uint32_t format_type)
+{
+ memset(&q->format, 0, sizeof(q->format));
+ memset(&q->sel, 0, sizeof(q->sel));
+ q->format.type = format_type;
+ q->sel.type = format_type;
+}
+
static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
{
struct v4l2_capability cap;
@@ -311,80 +408,99 @@ static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
if (ret < 0)
return ret;
- if (!(cap.capabilities & V4L2_CAP_STREAMING))
+ if (ctx->filter_type == FILTER_V4L2_SCALE &&
+ strcmp("bcm2835-codec-isp", cap.card) != 0)
+ {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n");
return AVERROR(EINVAL);
+ }
- if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
- ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
- ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-
- return 0;
+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n");
+ return AVERROR(EINVAL);
}
if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
- ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
- ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-
- return 0;
+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
+ }
+ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE);
+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT);
+ }
+ else {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n");
+ return AVERROR(EINVAL);
}
- return AVERROR(EINVAL);
+ return 0;
}
-static int deint_v4l2m2m_try_format(V4L2Queue *queue)
+// Just use for probe - doesn't modify q format
+static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt)
{
- struct v4l2_format *fmt = &queue->format;
+ struct v4l2_format fmt = {.type = queue->format.type};
DeintV4L2M2MContextShared *ctx = queue->ctx;
int ret, field;
+ // Pick YUV to test with if not otherwise specified
+ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt);
+ enum AVPixelFormat r_avfmt;
+
- ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt);
if (ret)
av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
- if (V4L2_TYPE_IS_OUTPUT(fmt->type))
+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type))
field = V4L2_FIELD_INTERLACED_TB;
else
field = V4L2_FIELD_NONE;
- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
- fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
- fmt->fmt.pix_mp.field = field;
- fmt->fmt.pix_mp.width = ctx->width;
- fmt->fmt.pix_mp.height = ctx->height;
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
+ fmt.fmt.pix_mp.pixelformat = pixelformat;
+ fmt.fmt.pix_mp.field = field;
+ fmt.fmt.pix_mp.width = width;
+ fmt.fmt.pix_mp.height = height;
} else {
- fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
- fmt->fmt.pix.field = field;
- fmt->fmt.pix.width = ctx->width;
- fmt->fmt.pix.height = ctx->height;
+ fmt.fmt.pix.pixelformat = pixelformat;
+ fmt.fmt.pix.field = field;
+ fmt.fmt.pix.width = width;
+ fmt.fmt.pix.height = height;
}
- av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
- fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
- fmt->fmt.pix_mp.pixelformat,
- fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
+ fmt.fmt.pix_mp.pixelformat,
+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
- ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt);
if (ret)
return AVERROR(EINVAL);
- av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
- fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
- fmt->fmt.pix_mp.pixelformat,
- fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
+ fmt.fmt.pix_mp.pixelformat,
+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
- if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 &&
- fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) ||
- fmt->fmt.pix_mp.field != field) {
- av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
+ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt));
+ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
+ return AVERROR(EINVAL);
+ }
+ if (r_avfmt == AV_PIX_FMT_NONE) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
+ return AVERROR(EINVAL);
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
+ if (fmt.fmt.pix_mp.field != field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
return AVERROR(EINVAL);
}
} else {
- if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 &&
- fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) ||
- fmt->fmt.pix.field != field) {
- av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
+ if (fmt.fmt.pix.field != field) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
return AVERROR(EINVAL);
}
@@ -393,68 +509,410 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue)
return 0;
}
-static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
+static int
+do_s_fmt(V4L2Queue * const q)
{
- struct v4l2_format *fmt = &queue->format;
- DeintV4L2M2MContextShared *ctx = queue->ctx;
+ DeintV4L2M2MContextShared * const ctx = q->ctx;
+ const uint32_t pixelformat = fmt_pixelformat(&q->format);
int ret;
- struct v4l2_selection sel = {
- .type = fmt->type,
- .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
- };
-
- // This works for most single object 4:2:0 types
- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
- fmt->fmt.pix_mp.pixelformat = pixelformat;
- fmt->fmt.pix_mp.field = field;
- fmt->fmt.pix_mp.width = width;
- fmt->fmt.pix_mp.height = ysize / pitch;
- fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
- fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
- } else {
- fmt->fmt.pix.pixelformat = pixelformat;
- fmt->fmt.pix.field = field;
- fmt->fmt.pix.width = width;
- fmt->fmt.pix.height = height;
- fmt->fmt.pix.sizeimage = 0;
- fmt->fmt.pix.bytesperline = 0;
- }
-
- ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format);
if (ret) {
ret = AVERROR(errno);
- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret));
return ret;
}
- if (pixelformat != fmt->fmt.pix.pixelformat) {
- av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
+ if (pixelformat != fmt_pixelformat(&q->format)) {
+ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format)));
return AVERROR(EINVAL);
}
- ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
+ q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE;
+ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE;
+
+ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel);
if (ret) {
ret = AVERROR(errno);
- av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret));
}
- sel.r.width = width;
- sel.r.height = height;
- sel.r.left = 0;
- sel.r.top = 0;
- sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
- sel.flags = V4L2_SEL_FLAG_LE;
+ return 0;
+}
- ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
- if (ret) {
- ret = AVERROR(errno);
- av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
+static void
+set_fmt_color(struct v4l2_format *const fmt,
+ const enum AVColorPrimaries avcp,
+ const enum AVColorSpace avcs,
+ const enum AVColorTransferCharacteristic avxc)
+{
+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
+
+ switch (avcp) {
+ case AVCOL_PRI_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ ycbcr = V4L2_YCBCR_ENC_709;
+ break;
+ case AVCOL_PRI_BT470M:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ ycbcr = V4L2_YCBCR_ENC_601;
+ break;
+ case AVCOL_PRI_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_PRI_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_PRI_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_PRI_BT2020:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ case AVCOL_PRI_SMPTE428:
+ case AVCOL_PRI_SMPTE431:
+ case AVCOL_PRI_SMPTE432:
+ case AVCOL_PRI_EBU3213:
+ case AVCOL_PRI_RESERVED:
+ case AVCOL_PRI_FILM:
+ case AVCOL_PRI_UNSPECIFIED:
+ default:
+ break;
+ }
+
+ switch (avcs) {
+ case AVCOL_SPC_RGB:
+ cs = V4L2_COLORSPACE_SRGB;
+ break;
+ case AVCOL_SPC_BT709:
+ cs = V4L2_COLORSPACE_REC709;
+ break;
+ case AVCOL_SPC_FCC:
+ cs = V4L2_COLORSPACE_470_SYSTEM_M;
+ break;
+ case AVCOL_SPC_BT470BG:
+ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
+ break;
+ case AVCOL_SPC_SMPTE170M:
+ cs = V4L2_COLORSPACE_SMPTE170M;
+ break;
+ case AVCOL_SPC_SMPTE240M:
+ cs = V4L2_COLORSPACE_SMPTE240M;
+ break;
+ case AVCOL_SPC_BT2020_CL:
+ cs = V4L2_COLORSPACE_BT2020;
+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
+ break;
+ case AVCOL_SPC_BT2020_NCL:
+ cs = V4L2_COLORSPACE_BT2020;
+ break;
+ default:
+ break;
+ }
+
+ switch (avxc) {
+ case AVCOL_TRC_BT709:
+ xfer = V4L2_XFER_FUNC_709;
+ break;
+ case AVCOL_TRC_IEC61966_2_1:
+ xfer = V4L2_XFER_FUNC_SRGB;
+ break;
+ case AVCOL_TRC_SMPTE240M:
+ xfer = V4L2_XFER_FUNC_SMPTE240M;
+ break;
+ case AVCOL_TRC_SMPTE2084:
+ xfer = V4L2_XFER_FUNC_SMPTE2084;
+ break;
+ default:
+ break;
+ }
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.colorspace = cs;
+ fmt->fmt.pix_mp.ycbcr_enc = ycbcr;
+ fmt->fmt.pix_mp.xfer_func = xfer;
+ } else {
+ fmt->fmt.pix.colorspace = cs;
+ fmt->fmt.pix.ycbcr_enc = ycbcr;
+ fmt->fmt.pix.xfer_func = xfer;
+ }
+}
+
+static void
+set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr)
+{
+ const enum v4l2_quantization q =
+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
+ V4L2_QUANTIZATION_DEFAULT;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.quantization = q;
+ } else {
+ fmt->fmt.pix.quantization = q;
+ }
+}
+
+static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt)
+{
+ enum v4l2_ycbcr_encoding ycbcr;
+ enum v4l2_colorspace cs;
+
+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.colorspace :
+ fmt->fmt.pix.colorspace;
+
+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.ycbcr_enc:
+ fmt->fmt.pix.ycbcr_enc;
+
+ switch(ycbcr) {
+ case V4L2_YCBCR_ENC_XV709:
+ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709;
+ case V4L2_YCBCR_ENC_XV601:
+ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M;
+ default:
+ break;
+ }
+
+ switch(cs) {
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M;
+ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020;
+ default:
+ break;
+ }
+
+ return AVCOL_PRI_UNSPECIFIED;
+}
+
+static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt)
+{
+ enum v4l2_ycbcr_encoding ycbcr;
+ enum v4l2_colorspace cs;
+
+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.colorspace :
+ fmt->fmt.pix.colorspace;
+
+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.ycbcr_enc:
+ fmt->fmt.pix.ycbcr_enc;
+
+ switch(cs) {
+ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
+ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
+ case V4L2_COLORSPACE_BT2020:
+ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
+ return AVCOL_SPC_BT2020_CL;
+ else
+ return AVCOL_SPC_BT2020_NCL;
+ default:
+ break;
+ }
+
+ return AVCOL_SPC_UNSPECIFIED;
+}
+
+static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt)
+{
+ enum v4l2_ycbcr_encoding ycbcr;
+ enum v4l2_xfer_func xfer;
+ enum v4l2_colorspace cs;
+
+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.colorspace :
+ fmt->fmt.pix.colorspace;
+
+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.ycbcr_enc:
+ fmt->fmt.pix.ycbcr_enc;
+
+ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.xfer_func:
+ fmt->fmt.pix.xfer_func;
+
+ switch (xfer) {
+ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709;
+ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1;
+ default:
+ break;
+ }
+
+ switch (cs) {
+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22;
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M;
+ default:
+ break;
+ }
+
+ switch (ycbcr) {
+ case V4L2_YCBCR_ENC_XV709:
+ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG;
+ default:
+ break;
+ }
+
+ return AVCOL_TRC_UNSPECIFIED;
+}
+
+static enum AVColorRange get_color_range(const struct v4l2_format *const fmt)
+{
+ enum v4l2_quantization qt;
+
+ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
+ fmt->fmt.pix_mp.quantization :
+ fmt->fmt.pix.quantization;
+
+ switch (qt) {
+ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
+ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
+ default:
+ break;
+ }
+
+ return AVCOL_RANGE_UNSPECIFIED;
+}
+
+static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
+{
+ struct v4l2_format *const format = &q->format;
+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
+
+ const uint32_t drm_fmt = src->layers[0].format;
+ // Treat INVALID as LINEAR
+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
+ uint32_t pix_fmt = 0;
+ uint32_t w = 0;
+ uint32_t h = 0;
+ uint32_t bpl = src->layers[0].planes[0].pitch;
+
+ // We really don't expect multiple layers
+ // All formats that we currently cope with are single object
+
+ if (src->nb_layers != 1 || src->nb_objects != 1)
+ return AVERROR(EINVAL);
+
+ switch (drm_fmt) {
+ case DRM_FORMAT_YUV420:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 3)
+ break;
+ pix_fmt = V4L2_PIX_FMT_YUV420;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ break;
+
+ case DRM_FORMAT_NV12:
+ if (mod == DRM_FORMAT_MOD_LINEAR) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12;
+ h = src->layers[0].planes[1].offset / bpl;
+ w = bpl;
+ }
+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
+ w = bpl;
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+ break;
+
+ case DRM_FORMAT_P030:
+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
+ if (src->layers[0].nb_planes != 2)
+ break;
+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
+ w = bpl / 2; // Matching lie to how we construct this
+ h = src->layers[0].planes[1].offset / 128;
+ bpl = fourcc_mod_broadcom_param(mod);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (!pix_fmt)
+ return AVERROR(EINVAL);
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->plane_fmt[0].bytesperline = bpl;
+ pix->num_planes = 1;
+ }
+ else {
+ struct v4l2_pix_format *const pix = &format->fmt.pix;
+
+ pix->width = w;
+ pix->height = h;
+ pix->pixelformat = pix_fmt;
+ pix->bytesperline = bpl;
}
+ set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc);
+ set_fmt_color_range(format, frame->color_range);
+
+ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right);
+ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom);
+ q->sel.r.left = frame->crop_left;
+ q->sel.r.top = frame->crop_top;
+
return 0;
}
+
+static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height)
+{
+ struct v4l2_format * const fmt = &queue->format;
+ struct v4l2_selection *const sel = &queue->sel;
+
+ memset(&fmt->fmt, 0, sizeof(fmt->fmt));
+
+ // Align w/h to 16 here in case there are alignment requirements at the next
+ // stage of the filter chain (also RPi deinterlace setup is bust and this
+ // fixes it)
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
+ fmt->fmt.pix_mp.pixelformat = pixelformat;
+ fmt->fmt.pix_mp.field = field;
+ fmt->fmt.pix_mp.width = FFALIGN(width, 16);
+ fmt->fmt.pix_mp.height = FFALIGN(height, 16);
+ } else {
+ fmt->fmt.pix.pixelformat = pixelformat;
+ fmt->fmt.pix.field = field;
+ fmt->fmt.pix.width = FFALIGN(width, 16);
+ fmt->fmt.pix.height = FFALIGN(height, 16);
+ }
+
+ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer);
+ set_fmt_color_range(fmt, priv->colour_range);
+
+ sel->r.width = width;
+ sel->r.height = height;
+ sel->r.left = 0;
+ sel->r.top = 0;
+
+ return do_s_fmt(queue);
+}
+
static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
{
int ret;
@@ -464,16 +922,22 @@ static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node
return AVERROR(errno);
ret = deint_v4l2m2m_prepare_context(ctx);
- if (ret)
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n");
goto fail;
+ }
- ret = deint_v4l2m2m_try_format(&ctx->capture);
- if (ret)
+ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format);
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n");
goto fail;
+ }
- ret = deint_v4l2m2m_try_format(&ctx->output);
- if (ret)
+ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE);
+ if (ret) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n");
goto fail;
+ }
return 0;
@@ -534,26 +998,118 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
return 0;
}
-static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
+static void
+drm_frame_init(AVDRMFrameDescriptor * const d)
+{
+ unsigned int i;
+ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) {
+ d->objects[i].fd = -1;
+ }
+}
+
+static void
+drm_frame_uninit(AVDRMFrameDescriptor * const d)
+{
+ unsigned int i;
+ for (i = 0; i != d->nb_objects; ++i) {
+ if (d->objects[i].fd != -1) {
+ close(d->objects[i].fd);
+ d->objects[i].fd = -1;
+ }
+ }
+}
+
+static void
+avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n)
+{
+ unsigned int i;
+ V4L2Buffer* const avbufs = *ppavbufs;
+
+ if (avbufs == NULL)
+ return;
+ *ppavbufs = NULL;
+
+ for (i = 0; i != n; ++i) {
+ V4L2Buffer* const avbuf = avbufs + i;
+ drm_frame_uninit(&avbuf->drm_frame);
+ }
+
+ av_free(avbufs);
+}
+
+static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
{
struct v4l2_exportbuffer expbuf;
int i, ret;
uint64_t mod = DRM_FORMAT_MOD_LINEAR;
- uint32_t fmt = 0;
- switch (pixelformat) {
- case V4L2_PIX_FMT_NV12:
- fmt = DRM_FORMAT_NV12;
- break;
- case V4L2_PIX_FMT_YUV420:
- fmt = DRM_FORMAT_YUV420;
- break;
- default:
- return AVERROR(EINVAL);
+ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame;
+ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
+ const struct v4l2_format *const fmt = &q->format;
+ const uint32_t height = fmt_height(fmt);
+ const uint32_t width = fmt_width(fmt);
+ ptrdiff_t bpl0;
+
+ /* fill the DRM frame descriptor */
+ drm_desc->nb_layers = 1;
+ layer->nb_planes = avbuf->num_planes;
+
+ for (int i = 0; i < avbuf->num_planes; i++) {
+ layer->planes[i].object_index = i;
+ layer->planes[i].offset = 0;
+ layer->planes[i].pitch = fmt_bpl(fmt, i);
}
+ bpl0 = layer->planes[0].pitch;
+
+ switch (fmt_pixelformat(fmt)) {
+
+ case V4L2_PIX_FMT_NV12_COL128:
+ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
+ layer->format = V4L2_PIX_FMT_NV12;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = height * 128;
+ layer->planes[0].pitch = width;
+ layer->planes[1].pitch = width;
+ break;
- avbuf->drm_frame.layers[0].format = fmt;
+ case DRM_FORMAT_NV12:
+ layer->format = V4L2_PIX_FMT_NV12;
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 2;
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = bpl0 * height;
+ layer->planes[1].pitch = bpl0;
+ break;
+
+ case V4L2_PIX_FMT_YUV420:
+ layer->format = DRM_FORMAT_YUV420;
+
+ if (avbuf->num_planes > 1)
+ break;
+
+ layer->nb_planes = 3;
+ layer->planes[1].object_index = 0;
+ layer->planes[1].offset = bpl0 * height;
+ layer->planes[1].pitch = bpl0 / 2;
+ layer->planes[2].object_index = 0;
+ layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4);
+ layer->planes[2].pitch = bpl0 / 2;
+ break;
+
+ default:
+ drm_desc->nb_layers = 0;
+ return AVERROR(EINVAL);
+ }
+
+ drm_desc->nb_objects = 0;
for (i = 0; i < avbuf->num_planes; i++) {
memset(&expbuf, 0, sizeof(expbuf));
@@ -565,19 +1121,11 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
if (ret < 0)
return AVERROR(errno);
- avbuf->fd = expbuf.fd;
-
- if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
- /* drm frame */
- avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
- avbuf->drm_frame.objects[i].fd = expbuf.fd;
- avbuf->drm_frame.objects[i].format_modifier = mod;
- } else {
- /* drm frame */
- avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
- avbuf->drm_frame.objects[0].fd = expbuf.fd;
- avbuf->drm_frame.objects[0].format_modifier = mod;
- }
+ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ?
+ avbuf->buffer.m.planes[i].length : avbuf->buffer.length;
+ drm_desc->objects[i].fd = expbuf.fd;
+ drm_desc->objects[i].format_modifier = mod;
+ drm_desc->nb_objects = i + 1;
}
return 0;
@@ -588,7 +1136,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
struct v4l2_format *fmt = &queue->format;
DeintV4L2M2MContextShared *ctx = queue->ctx;
struct v4l2_requestbuffers req;
- int ret, i, j, multiplanar;
+ int ret, i, multiplanar;
uint32_t memory;
memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
@@ -617,10 +1165,9 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
}
for (i = 0; i < queue->num_buffers; i++) {
- V4L2Buffer *buf = &queue->buffers[i];
+ V4L2Buffer * const buf = &queue->buffers[i];
buf->enqueued = 0;
- buf->fd = -1;
buf->q = queue;
buf->buffer.type = fmt->type;
@@ -632,6 +1179,12 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
buf->buffer.m.planes = buf->planes;
}
+ drm_frame_init(&buf->drm_frame);
+ }
+
+ for (i = 0; i < queue->num_buffers; i++) {
+ V4L2Buffer * const buf = &queue->buffers[i];
+
ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
if (ret < 0) {
ret = AVERROR(errno);
@@ -639,29 +1192,14 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
goto fail;
}
- if (multiplanar)
- buf->num_planes = buf->buffer.length;
- else
- buf->num_planes = 1;
-
- for (j = 0; j < buf->num_planes; j++) {
- V4L2PlaneInfo *info = &buf->plane_info[j];
-
- if (multiplanar) {
- info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
- info->length = buf->buffer.m.planes[j].length;
- } else {
- info->bytesperline = fmt->fmt.pix.bytesperline;
- info->length = buf->buffer.length;
- }
- }
+ buf->num_planes = multiplanar ? buf->buffer.length : 1;
if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
ret = deint_v4l2m2m_enqueue_buffer(buf);
if (ret)
goto fail;
- ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
+ ret = v4l2_buffer_export_drm(queue, buf);
if (ret)
goto fail;
}
@@ -670,12 +1208,8 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
return 0;
fail:
- for (i = 0; i < queue->num_buffers; i++)
- if (queue->buffers[i].fd >= 0)
- close(queue->buffers[i].fd);
- av_free(queue->buffers);
- queue->buffers = NULL;
-
+ avbufs_delete(&queue->buffers, queue->num_buffers);
+ queue->num_buffers = 0;
return ret;
}
@@ -862,7 +1396,6 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
V4L2Queue *capture = &ctx->capture;
V4L2Queue *output = &ctx->output;
- int i;
av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
@@ -871,12 +1404,7 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
deint_v4l2m2m_streamoff(output);
}
- if (capture->buffers)
- for (i = 0; i < capture->num_buffers; i++) {
- capture->buffers[i].q = NULL;
- if (capture->buffers[i].fd >= 0)
- close(capture->buffers[i].fd);
- }
+ avbufs_delete(&capture->buffers, capture->num_buffers);
deint_v4l2m2m_unref_queued(output);
@@ -908,73 +1436,15 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused)
deint_v4l2m2m_destroy_context(ctx);
}
-static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
-{
- AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
- AVDRMLayerDescriptor *layer;
-
- /* fill the DRM frame descriptor */
- drm_desc->nb_objects = avbuf->num_planes;
- drm_desc->nb_layers = 1;
-
- layer = &drm_desc->layers[0];
- layer->nb_planes = avbuf->num_planes;
-
- for (int i = 0; i < avbuf->num_planes; i++) {
- layer->planes[i].object_index = i;
- layer->planes[i].offset = 0;
- layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
- }
-
- switch (layer->format) {
- case DRM_FORMAT_YUYV:
- layer->nb_planes = 1;
- break;
-
- case DRM_FORMAT_NV12:
- case DRM_FORMAT_NV21:
- if (avbuf->num_planes > 1)
- break;
-
- layer->nb_planes = 2;
-
- layer->planes[1].object_index = 0;
- layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
- height;
- layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
- break;
-
- case DRM_FORMAT_YUV420:
- if (avbuf->num_planes > 1)
- break;
-
- layer->nb_planes = 3;
-
- layer->planes[1].object_index = 0;
- layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
- height;
- layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
-
- layer->planes[2].object_index = 0;
- layer->planes[2].offset = layer->planes[1].offset +
- ((avbuf->plane_info[0].bytesperline *
- height) >> 2);
- layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
- break;
-
- default:
- drm_desc->nb_layers = 0;
- break;
- }
-
- return (uint8_t *) drm_desc;
-}
-
// timeout in ms
static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
{
DeintV4L2M2MContextShared *ctx = queue->ctx;
V4L2Buffer* avbuf;
+ enum AVColorPrimaries color_primaries;
+ enum AVColorSpace colorspace;
+ enum AVColorTransferCharacteristic color_trc;
+ enum AVColorRange color_range;
av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
@@ -985,8 +1455,6 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim
}
// Fill in PTS and anciliary info from src frame
- // we will want to overwrite some fields as only the pts/dts
- // fields are updated with new timing in this fn
pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
@@ -999,18 +1467,36 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim
atomic_fetch_add(&ctx->refcount, 1);
- frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
+ frame->data[0] = (uint8_t *)&avbuf->drm_frame;
frame->format = AV_PIX_FMT_DRM_PRIME;
if (ctx->hw_frames_ctx)
frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
- frame->height = ctx->height;
- frame->width = ctx->width;
-
- // Not interlaced now
- frame->interlaced_frame = 0;
- frame->top_field_first = 0;
- // Pkt duration halved
- frame->pkt_duration /= 2;
+ frame->height = ctx->output_height;
+ frame->width = ctx->output_width;
+
+ color_primaries = get_color_primaries(&ctx->capture.format);
+ colorspace = get_color_space(&ctx->capture.format);
+ color_trc = get_color_trc(&ctx->capture.format);
+ color_range = get_color_range(&ctx->capture.format);
+
+ // If the color parameters are unspecified by V4L2 then leave alone as they
+ // will have been copied from src
+ if (color_primaries != AVCOL_PRI_UNSPECIFIED)
+ frame->color_primaries = color_primaries;
+ if (colorspace != AVCOL_SPC_UNSPECIFIED)
+ frame->colorspace = colorspace;
+ if (color_trc != AVCOL_TRC_UNSPECIFIED)
+ frame->color_trc = color_trc;
+ if (color_range != AVCOL_RANGE_UNSPECIFIED)
+ frame->color_range = color_range;
+
+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) {
+ // Not interlaced now
+ frame->interlaced_frame = 0; // *** Fill in from dst buffer?
+ frame->top_field_first = 0;
+ // Pkt duration halved
+ frame->pkt_duration /= 2;
+ }
if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
@@ -1032,15 +1518,34 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
ctx->height = avctx->inputs[0]->h;
ctx->width = avctx->inputs[0]->w;
- av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
+ if (ctx->filter_type == FILTER_V4L2_SCALE) {
+ if ((ret = ff_scale_eval_dimensions(priv,
+ priv->w_expr, priv->h_expr,
+ inlink, outlink,
+ &ctx->output_width, &ctx->output_height)) < 0)
+ return ret;
+
+ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height,
+ priv->force_original_aspect_ratio, priv->force_divisible_by);
+ }
+ else {
+ ctx->output_width = ctx->width;
+ ctx->output_height = ctx->height;
+ }
+
+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height);
outlink->time_base = inlink->time_base;
- outlink->w = inlink->w;
- outlink->h = inlink->h;
- outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+ outlink->w = ctx->output_width;
+ outlink->h = ctx->output_height;
outlink->format = inlink->format;
outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate
+ if (inlink->sample_aspect_ratio.num)
+ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
+ else
+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
ret = deint_v4l2m2m_find_device(ctx);
if (ret)
return ret;
@@ -1055,18 +1560,19 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
{
- const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ||
- drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID);
+ const uint64_t mod = drm_desc->objects[0].format_modifier;
+ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID);
+
+ // Only currently support single object things
+ if (drm_desc->nb_objects != 1)
+ return 0;
switch (drm_desc->layers[0].format) {
case DRM_FORMAT_YUV420:
- if (is_linear)
- return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0;
- break;
+ return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
case DRM_FORMAT_NV12:
- if (is_linear)
- return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0;
- break;
+ return is_linear ? V4L2_PIX_FMT_NV12 :
+ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0;
default:
break;
}
@@ -1089,7 +1595,7 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
if (ctx->field_order == V4L2_FIELD_ANY) {
const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
- const uint32_t pixelformat = desc_pixelformat(drm_desc);
+ uint32_t pixelformat = desc_pixelformat(drm_desc);
if (pixelformat == 0) {
av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
@@ -1104,29 +1610,49 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
- ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
- if (ret)
+ if ((ret = set_src_fmt(output, in)) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n",
+ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier);
+ return ret;
+ }
+
+ ret = do_s_fmt(output);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n");
return ret;
+ }
- ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
- if (ret)
+ if (ctx->output_format != AV_PIX_FMT_NONE)
+ pixelformat = fmt_av_to_v4l2(ctx->output_format);
+ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height);
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n");
return ret;
+ }
ret = deint_v4l2m2m_allocate_buffers(capture);
- if (ret)
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n");
return ret;
+ }
ret = deint_v4l2m2m_streamon(capture);
- if (ret)
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to set destination streamon: %s\n", av_err2str(ret));
return ret;
+ }
ret = deint_v4l2m2m_allocate_buffers(output);
- if (ret)
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n");
return ret;
+ }
ret = deint_v4l2m2m_streamon(output);
- if (ret)
+ if (ret) {
+ av_log(avctx, AV_LOG_WARNING, "Failed to set src streamon: %s\n", av_err2str(ret));
return ret;
+ }
if (in->top_field_first)
ctx->field_order = V4L2_FIELD_INTERLACED_TB;
@@ -1251,7 +1777,7 @@ again:
return did_something ? 0 : FFERROR_NOT_READY;
}
-static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
+static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type)
{
DeintV4L2M2MContext * const priv = avctx->priv;
DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
@@ -1262,6 +1788,7 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
}
priv->shared = ctx;
ctx->logctx = priv;
+ ctx->filter_type = filter_type;
ctx->fd = -1;
ctx->output.ctx = ctx;
ctx->output.num_buffers = 8;
@@ -1274,9 +1801,52 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
atomic_init(&ctx->refcount, 1);
+ if (priv->output_format_string) {
+ ctx->output_format = av_get_pix_fmt(priv->output_format_string);
+ if (ctx->output_format == AV_PIX_FMT_NONE) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string);
+ return AVERROR(EINVAL);
+ }
+ if (fmt_av_to_v4l2(ctx->output_format) == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format));
+ return AVERROR(EINVAL);
+ }
+ } else {
+ // Use the input format once that is configured.
+ ctx->output_format = AV_PIX_FMT_NONE;
+ }
+
+#define STRING_OPTION(var_name, func_name, default_value) do { \
+ if (priv->var_name ## _string) { \
+ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \
+ if (var < 0) { \
+ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \
+ return AVERROR(EINVAL); \
+ } \
+ priv->var_name = var; \
+ } else { \
+ priv->var_name = default_value; \
+ } \
+ } while (0)
+
+ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED);
+ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED);
+ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED);
+ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED);
+
return 0;
}
+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
+{
+ return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE);
+}
+
+static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
+{
+ return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE);
+}
+
static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
{
DeintV4L2M2MContext *priv = avctx->priv;
@@ -1294,6 +1864,51 @@ static const AVOption deinterlace_v4l2m2m_options[] = {
AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
+#define OFFSET(x) offsetof(DeintV4L2M2MContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption scale_v4l2m2m_options[] = {
+ { "w", "Output video width",
+ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS },
+ { "h", "Output video height",
+ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS },
+ { "format", "Output video format (software format of hardware frames)",
+ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS },
+ // These colour properties match the ones of the same name in vf_scale.
+ { "out_color_matrix", "Output colour matrix coefficient set",
+ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS },
+ { "out_range", "Output colour range",
+ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED },
+ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" },
+ { "full", "Full range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
+ { "limited", "Limited range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
+ { "jpeg", "Full range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
+ { "mpeg", "Limited range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
+ { "tv", "Limited range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
+ { "pc", "Full range",
+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
+ // These colour properties match the ones in the VAAPI scaler
+ { "out_color_primaries", "Output colour primaries",
+ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING,
+ { .str = NULL }, .flags = FLAGS },
+ { "out_color_transfer", "Output colour transfer characteristics",
+ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING,
+ { .str = NULL }, .flags = FLAGS },
+ { "out_chroma_location", "Output chroma sample location",
+ OFFSET(chroma_location_string), AV_OPT_TYPE_STRING,
+ { .str = NULL }, .flags = FLAGS },
+ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
+ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS },
+ { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(scale_v4l2m2m);
+
static const AVFilterPad deint_v4l2m2m_inputs[] = {
{
.name = "default",
@@ -1321,3 +1936,17 @@ AVFilter ff_vf_deinterlace_v4l2m2m = {
.priv_class = &deinterlace_v4l2m2m_class,
.activate = deint_v4l2m2m_activate,
};
+
+AVFilter ff_vf_scale_v4l2m2m = {
+ .name = "scale_v4l2m2m",
+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"),
+ .priv_size = sizeof(DeintV4L2M2MContext),
+ .init = &scale_v4l2m2m_init,
+ .uninit = &deint_v4l2m2m_uninit,
+ FILTER_INPUTS(deint_v4l2m2m_inputs),
+ FILTER_OUTPUTS(deint_v4l2m2m_outputs),
+ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME),
+ .priv_class = &scale_v4l2m2m_class,
+ .activate = deint_v4l2m2m_activate,
+};
+
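As a point of reference, the options declared above can be exercised from an
ordinary filter-graph string. A hypothetical example (values illustrative;
every key used is one of the scale_v4l2m2m options defined in this patch):

    /* Illustrative only - not part of the patch */
    static const char *example_graph =
        "scale_v4l2m2m=w=1280:h=720:format=nv12:"
        "out_color_matrix=bt709:out_range=limited";

As declared via FILTER_SINGLE_SAMPLEFMT above, the filter itself only passes
AV_PIX_FMT_DRM_PRIME frames; the format option selects the software format
carried inside those frames.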
From 7e7147d50bc6e3f13834525dba3a47d170422f07 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 22 Sep 2022 14:54:46 +0000
Subject: [PATCH 071/161] v4l2_m2m: Adjust buffer allocation based on min/max
controls
Clip requested buffer count to min/max declared by driver.
If 0 buffers requested then set to min+2.
This allows encode to keep its src buffer count down to a plausible
minimum, which helps with flow control.
---
libavcodec/v4l2_context.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
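In outline the allocation rule being added is the following (sketch only,
helper name hypothetical - the real code below works directly on
ctx->num_buffers and the VIDIOC_QUERYCTRL result):

    /* av_clip() is from libavutil/common.h */
    static int choose_num_buffers(int requested, int drv_min, int drv_max)
    {
        if (requested < 2)               /* unset (0) or implausibly small */
            requested = drv_min + 2;     /* driver minimum plus a little slack */
        return av_clip(requested, drv_min, drv_max);
    }

If the V4L2_CID_MIN_BUFFERS_FOR_OUTPUT control is not implemented by the
driver the code instead falls back to a fixed default of 4.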
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 6b97eab41e..ba36689ff3 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -1187,6 +1187,7 @@ fail_release:
int ff_v4l2_context_init(V4L2Context* ctx)
{
+ struct v4l2_queryctrl qctrl;
V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
int ret;
@@ -1228,6 +1229,24 @@ int ff_v4l2_context_init(V4L2Context* ctx)
goto fail_unref_hwframes;
}
+ memset(&qctrl, 0, sizeof(qctrl));
+ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT;
+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) {
+ ret = AVERROR(errno);
+ if (ret != AVERROR(EINVAL)) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERYCTRL failed: %s\n", ctx->name, av_err2str(ret));
+ goto fail_unref_hwframes;
+ }
+ // Control unsupported - set default if wanted
+ if (ctx->num_buffers < 2)
+ ctx->num_buffers = 4;
+ }
+ else {
+ if (ctx->num_buffers < 2)
+ ctx->num_buffers = qctrl.minimum + 2;
+ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum);
+ }
+
ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
if (ret < 0)
goto fail_unref_hwframes;
From b69a2707a192ac509174899233a094373a3f5dc9 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 22 Sep 2022 15:00:12 +0000
Subject: [PATCH 072/161] v4l2_m2m_dec: If src Q is full then wait indefinitely
for buffer
If it is not possible to add another buffer to the src Q then always
wait indefinitely for either an output frame or the Q to have space.
This has issues if the Q is stalled due to dst buffer exhaustion and
buffers cannot be returned asynchronously by another thread, but the
current scheme confuses ffmpeg's pipeline scheduling.
---
libavcodec/v4l2_m2m_dec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
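The resulting timeout selection reads as below (condensed from the hunk that
follows; the NQ_* values are the decoder's enqueue status codes):

    const int t =
        src_rv == NQ_Q_FULL   ? -1  :   /* src Q full: wait indefinitely       */
        src_rv == NQ_DRAINING ? 300 :   /* draining: generous but bounded wait */
        prefer_dq             ? 5   :   /* output preferred: short wait        */
                                0;      /* otherwise just poll                 */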
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 485a96f4b4..bb183097f6 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -456,9 +456,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
if (dst_rv != 0 && TRY_DQ(src_rv)) {
// Pick a timeout depending on state
const int t =
+ src_rv == NQ_Q_FULL ? -1 :
src_rv == NQ_DRAINING ? 300 :
- prefer_dq ? 5 :
- src_rv == NQ_Q_FULL ? -1 : 0;
+ prefer_dq ? 5 : 0;
// Dequeue frame will unref any previous contents of frame
// if it returns success so we don't need an explicit unref
From b1d37be81bbf683a0eb16923c9b9f045fd0ea0c0 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 22 Sep 2022 15:12:27 +0000
Subject: [PATCH 073/161] vf_deinterlace_v4l2m2m: Add Q name to structure for
debug
---
libavfilter/vf_deinterlace_v4l2m2m.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 2df39ec0f1..4edecc02bf 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -84,6 +84,7 @@ typedef struct V4L2Queue {
struct v4l2_selection sel;
int num_buffers;
V4L2Buffer *buffers;
+ const char * name;
DeintV4L2M2MContextShared *ctx;
} V4L2Queue;
@@ -1792,8 +1793,10 @@ static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filt
ctx->fd = -1;
ctx->output.ctx = ctx;
ctx->output.num_buffers = 8;
+ ctx->output.name = "OUTPUT";
ctx->capture.ctx = ctx;
ctx->capture.num_buffers = 12;
+ ctx->capture.name = "CAPTURE";
ctx->done = 0;
ctx->field_order = V4L2_FIELD_ANY;
From 794a5bfc3ec74fdc7664508a287a075708d5deef Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 22 Sep 2022 16:08:42 +0000
Subject: [PATCH 074/161] v4l2_m2m_enc: Set src buffer count to min+2 by
default
Set output.num_buffers to 0 by default which will then be set to min+2
by the allocation code. This fixes an issue where the deinterlacer had
fewer dest buffers than the encoder had src buffers and so ran dry,
creating a deadlock in the ffmpeg filter chain.
---
libavcodec/v4l2_m2m_enc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index 099ad23928..b8ba815c37 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -672,9 +672,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx)
#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
#define V4L_M2M_CAPTURE_OPTS \
- V4L_M2M_DEFAULT_OPTS,\
+ { "num_output_buffers", "Number of buffers in the output context",\
+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\
{ "num_capture_buffers", "Number of buffers in the capture context", \
- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS }
+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS }
static const AVOption mpeg4_options[] = {
V4L_M2M_CAPTURE_OPTS,
From 85c42743046a05b347f33b1933e6d52ea1d17e00 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 22 Sep 2022 16:13:57 +0000
Subject: [PATCH 075/161] vf_deinterlace_m2m: For deinterlace set outlink FR to
twice inlink
We used to set the outlink framerate to unknown but it turns out that
ffmpeg's filter pipeline copes with that badly. Otherwise leave it at 0,0
which will copy the FR from inlink to outlink.
---
libavfilter/vf_deinterlace_v4l2m2m.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 4edecc02bf..c52dae1c44 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -1534,13 +1534,16 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
ctx->output_height = ctx->height;
}
- av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height);
+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__,
+ ctx->width, ctx->height, ctx->output_width, ctx->output_height,
+ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den);
outlink->time_base = inlink->time_base;
outlink->w = ctx->output_width;
outlink->h = ctx->output_height;
outlink->format = inlink->format;
- outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate
+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0)
+ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den};
if (inlink->sample_aspect_ratio.num)
outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
From 34a24bc0b0d427c75659d3907cb75afb6a9dc255 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 23 Sep 2022 11:30:56 +0000
Subject: [PATCH 076/161] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from
a Q
Useful where (e.g. in encode) we might have drmprime buffers that we want
to return to the source ASAP.
---
libavcodec/v4l2_context.c | 17 +++++++++++------
libavcodec/v4l2_context.h | 2 ++
2 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index ba36689ff3..4a359bf45e 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -707,17 +707,22 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf)
return avbuf;
}
+void
+ff_v4l2_dq_all(V4L2Context *const ctx)
+{
+ V4L2Buffer * avbuf;
+ do {
+ get_qbuf(ctx, &avbuf, 0);
+ } while (avbuf);
+}
+
static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
{
int i;
/* get back as many output buffers as possible */
- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
- V4L2Buffer * avbuf;
- do {
- get_qbuf(ctx, &avbuf, 0);
- } while (avbuf);
- }
+ if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+ ff_v4l2_dq_all(ctx);
for (i = 0; i < ctx->num_buffers; i++) {
V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 21265f1bd7..523c53e97d 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -218,4 +218,6 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const
*/
int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
+void ff_v4l2_dq_all(V4L2Context *const ctx);
+
#endif // AVCODEC_V4L2_CONTEXT_H
From 95dfc168c74f7b0f282c1b2ad9deb8fba10a7ce5 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 23 Sep 2022 11:38:36 +0000
Subject: [PATCH 077/161] v4l2_m2m_enc: DQ output more frequently
Ensure that we DQ any released src buffers on every op to avoid deadlock
with source.
There is a plausible argument that this patch is inelegant and the drain
should be integrated into dq_buf, but that is a further reaching delta.
---
libavcodec/v4l2_m2m_enc.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index b8ba815c37..a992a3cccc 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -421,6 +421,8 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
V4L2Context *const output = &s->output;
+ ff_v4l2_dq_all(output);
+
// Signal EOF if needed
if (!frame) {
return ff_v4l2_context_enqueue_frame(output, frame);
@@ -492,6 +494,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
AVFrame *frame = s->frame;
int ret;
+ ff_v4l2_dq_all(output);
+
if (s->draining)
goto dequeue;
@@ -528,7 +532,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
}
dequeue:
- if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
+ ff_v4l2_dq_all(output);
+ if (ret)
return ret;
if (capture->first_buf == 1) {
@@ -560,7 +566,9 @@ dequeue:
s->extdata_size = len;
}
- if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
+ ff_v4l2_dq_all(output);
+ if (ret)
return ret;
}
From a40b1c38b0615fce0c0d9eb97510ab9e77b3e1ac Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 26 Sep 2022 18:20:00 +0100
Subject: [PATCH 078/161] conf_native: Remove --enable-rpi from all builds
---
pi-util/conf_native.sh | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
index 37cea71756..f22d531ca4 100755
--- a/pi-util/conf_native.sh
+++ b/pi-util/conf_native.sh
@@ -54,9 +54,9 @@ if [ $MMAL ]; then
RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
- RPIOPTS="--enable-mmal --enable-rpi"
+ RPIOPTS="--enable-mmal"
else
- RPIOPTS="--disable-mmal --enable-sand"
+ RPIOPTS="--disable-mmal"
fi
C=`lsb_release -sc`
@@ -89,6 +89,7 @@ $FFSRC/configure \
$MCOPTS\
--disable-stripping\
--disable-thumb\
+ --enable-sand\
--enable-v4l2-request\
--enable-libdrm\
--enable-vout-egl\
From 8fddfc8f1e3c95caded18705ed29be0ae95517bc Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 29 Sep 2022 19:48:08 +0000
Subject: [PATCH 079/161] v4l2_m2m_dec: Deal correctly with avcC H264 data in
extradata
Decoders expect Annex B style headers; mkv and similar formats have
somewhat oddly wrapped (avcC) extradata. Convert to Annex B style before use.
---
libavcodec/v4l2_m2m.h | 2 +-
libavcodec/v4l2_m2m_dec.c | 177 ++++++++++++++++++++++++++++++++++++--
2 files changed, 169 insertions(+), 10 deletions(-)
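For context, the avcC ("AVCDecoderConfigurationRecord") layout that the new
h264_xd_copy() walks is roughly as follows; each parameter set is re-emitted
behind a 00 00 00 01 start code to give the Annex B form the decoder wants
(informal sketch, not a complete description of the box):

    /* avcC extradata, approximately:
     *   u8   configurationVersion;         always 1 - how avcC is detected
     *   u8   AVCProfileIndication;
     *   u8   profile_compatibility;
     *   u8   AVCLevelIndication;
     *   u8   lengthSizeMinusOne;           low 2 bits
     *   u8   numOfSequenceParameterSets;   low 5 bits
     *   numOfSequenceParameterSets times:
     *     u16  spsLength;                  big endian
     *     u8   sps[spsLength];
     *   u8   numOfPictureParameterSets;
     *   numOfPictureParameterSets times:
     *     u16  ppsLength;
     *     u8   pps[ppsLength];
     */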
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index ee72beb052..babf101d65 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -118,7 +118,7 @@ typedef struct V4L2m2mContext {
/* Ext data sent */
int extdata_sent;
/* Ext data sent in packet - overrides ctx */
- uint8_t * extdata_data;
+ void * extdata_data;
size_t extdata_size;
#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index bb183097f6..6bd9926b3f 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -46,6 +46,71 @@
#define STATS_LAST_COUNT_MAX 64
#define STATS_INTERVAL_MAX (1 << 30)
+#ifndef FF_API_BUFFER_SIZE_T
+#define FF_API_BUFFER_SIZE_T 1
+#endif
+
+#define DUMP_FAILED_EXTRADATA 0
+
+#if DUMP_FAILED_EXTRADATA
+static inline char hex1(unsigned int x)
+{
+ x &= 0xf;
+ return x <= 9 ? '0' + x : 'a' + x - 10;
+}
+
+static inline char * hex2(char * s, unsigned int x)
+{
+ *s++ = hex1(x >> 4);
+ *s++ = hex1(x);
+ return s;
+}
+
+static inline char * hex4(char * s, unsigned int x)
+{
+ s = hex2(s, x >> 8);
+ s = hex2(s, x);
+ return s;
+}
+
+static inline char * dash2(char * s)
+{
+ *s++ = '-';
+ *s++ = '-';
+ return s;
+}
+
+static void
+data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len)
+{
+ size_t i;
+ s = hex4(s, offset);
+ m += offset;
+ for (i = 0; i != 8; ++i) {
+ *s++ = ' ';
+ s = len > i + offset ? hex2(s, *m++) : dash2(s);
+ }
+ *s++ = ' ';
+ *s++ = ':';
+ for (; i != 16; ++i) {
+ *s++ = ' ';
+ s = len > i + offset ? hex2(s, *m++) : dash2(s);
+ }
+ *s++ = 0;
+}
+
+static void
+log_dump(void * logctx, int lvl, const void * const data, const size_t len)
+{
+ size_t i;
+ for (i = 0; i < len; i += 16) {
+ char buf[80];
+ data16(buf, i, data, len);
+ av_log(logctx, lvl, "%s\n", buf);
+ }
+}
+#endif
+
static int64_t pts_stats_guess(const pts_stats_t * const stats)
{
if (stats->last_pts == AV_NOPTS_VALUE ||
@@ -98,6 +163,98 @@ static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char
};
}
+// If abdata == NULL then this just counts space required
+// Unpacks avcC if detected
+static int
+h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata)
+{
+ const uint8_t * const xdend = extradata + extrasize;
+ const uint8_t * p = extradata;
+ uint8_t * d = abdata;
+ unsigned int n;
+ unsigned int len;
+ const unsigned int hdrlen = 4;
+ unsigned int need_pps = 1;
+
+ if (extrasize < 8)
+ return AVERROR(EINVAL);
+
+ if (p[0] == 0 && p[1] == 0) {
+ // Assume a couple of leading zeros are good enough to indicate NAL
+ if (abdata)
+ memcpy(d, p, extrasize);
+ return extrasize;
+ }
+
+ // avcC starts with a 1
+ if (p[0] != 1)
+ return AVERROR(EINVAL);
+
+ p += 5;
+ n = *p++ & 0x1f;
+
+doxps:
+ while (n--) {
+ if (xdend - p < 2)
+ return AVERROR(EINVAL);
+ len = (p[0] << 8) | p[1];
+ p += 2;
+ if (xdend - p < (ptrdiff_t)len)
+ return AVERROR(EINVAL);
+ if (abdata) {
+ d[0] = 0;
+ d[1] = 0;
+ d[2] = 0;
+ d[3] = 1;
+ memcpy(d + 4, p, len);
+ }
+ d += len + hdrlen;
+ p += len;
+ }
+ if (need_pps) {
+ need_pps = 0;
+ if (p >= xdend)
+ return AVERROR(EINVAL);
+ n = *p++;
+ goto doxps;
+ }
+
+ return d - abdata;
+}
+
+static int
+copy_extradata(AVCodecContext * const avctx,
+ const void * const src_data, const int src_len,
+ void ** const pdst_data, size_t * const pdst_len)
+{
+ int len;
+
+ *pdst_len = 0;
+ av_freep(pdst_data);
+
+ if (avctx->codec_id == AV_CODEC_ID_H264)
+ len = h264_xd_copy(src_data, src_len, NULL);
+ else
+ len = src_len < 0 ? AVERROR(EINVAL) : src_len;
+
+ // Zero length is OK but we want to stop - -ve is an error val
+ if (len <= 0)
+ return len;
+
+ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
+ return AVERROR(ENOMEM);
+
+ if (avctx->codec_id == AV_CODEC_ID_H264)
+ h264_xd_copy(src_data, src_len, *pdst_data);
+ else
+ memcpy(*pdst_data, src_data, len);
+ *pdst_len = len;
+
+ return 0;
+}
+
+
+
static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
{
int ret;
@@ -277,13 +434,8 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
if (side_data) {
av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
- av_freep(&s->extdata_data);
- if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
- av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size);
- return AVERROR(ENOMEM);
- }
- memcpy(s->extdata_data, side_data, side_size);
- s->extdata_size = side_size;
+ if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0)
+ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret));
s->extdata_sent = 0;
}
@@ -359,8 +511,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
else if (s->extdata_data)
ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
- else
- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
if (ret == AVERROR(EAGAIN)) {
// Out of input buffers - keep packet
@@ -770,6 +920,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
return ret;
}
+ if (avctx->extradata &&
+ (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret));
+#if DUMP_FAILED_EXTRADATA
+ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size);
+#endif
+ return ret;
+ }
+
if ((ret = v4l2_prepare_decoder(s)) < 0)
return ret;
From 70227ebbc2999bc49075a3b683392d94618ecd89 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 30 Sep 2022 14:20:23 +0000
Subject: [PATCH 080/161] v4l2_request_hevc: Fix up
V4L2_CID_CODEC_STATELESS_BASE if missing
---
libavcodec/hevc-ctrls-v4.h | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
index 7829d82084..c02fdbe5a8 100644
--- a/libavcodec/hevc-ctrls-v4.h
+++ b/libavcodec/hevc-ctrls-v4.h
@@ -53,6 +53,13 @@
#include <linux/const.h>
#include <linux/types.h>
+#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS
+#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */
+#endif
+#ifndef V4L2_CID_CODEC_STATELESS_BASE
+#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900)
+#endif
+
#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
From 22d2000382839dbd04588af1bb20cc9d9b3a4362 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sat, 1 Oct 2022 13:40:57 +0000
Subject: [PATCH 081/161] vf_deinterlace_v4l2m2m: Fix compile on m/c without
V4L2 SAND
---
libavfilter/vf_deinterlace_v4l2m2m.c | 33 +++++++++++++++++++++++-----
1 file changed, 28 insertions(+), 5 deletions(-)
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index c52dae1c44..716789f988 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -35,6 +35,8 @@
#include <sys/mman.h>
#include <unistd.h>
+#include "config.h"
+
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/common.h"
@@ -59,6 +61,16 @@
#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
#endif
+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
+// in drm_fourcc.h hopefully will be sometime in the future but until then...
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
+#endif
+
typedef struct V4L2Queue V4L2Queue;
typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
@@ -176,9 +188,11 @@ fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
return V4L2_PIX_FMT_YUV420;
case AV_PIX_FMT_NV12:
return V4L2_PIX_FMT_NV12;
+#if CONFIG_SAND
case AV_PIX_FMT_RPI4_8:
case AV_PIX_FMT_SAND128:
return V4L2_PIX_FMT_NV12_COL128;
+#endif
default:
break;
}
@@ -193,8 +207,10 @@ fmt_v4l2_to_av(const uint32_t pixfmt)
return AV_PIX_FMT_YUV420P;
case V4L2_PIX_FMT_NV12:
return AV_PIX_FMT_NV12;
+#if CONFIG_SAND
case V4L2_PIX_FMT_NV12_COL128:
return AV_PIX_FMT_RPI4_8;
+#endif
default:
break;
}
@@ -823,6 +839,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
h = src->layers[0].planes[1].offset / bpl;
w = bpl;
}
+#if CONFIG_SAND
else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
if (src->layers[0].nb_planes != 2)
break;
@@ -831,9 +848,11 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
h = src->layers[0].planes[1].offset / 128;
bpl = fourcc_mod_broadcom_param(mod);
}
+#endif
break;
case DRM_FORMAT_P030:
+#if CONFIG_SAND
if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
if (src->layers[0].nb_planes != 2)
break;
@@ -842,6 +861,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
h = src->layers[0].planes[1].offset / 128;
bpl = fourcc_mod_broadcom_param(mod);
}
+#endif
break;
default:
@@ -1048,7 +1068,6 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
const struct v4l2_format *const fmt = &q->format;
const uint32_t height = fmt_height(fmt);
- const uint32_t width = fmt_width(fmt);
ptrdiff_t bpl0;
/* fill the DRM frame descriptor */
@@ -1063,7 +1082,7 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
bpl0 = layer->planes[0].pitch;
switch (fmt_pixelformat(fmt)) {
-
+#if CONFIG_SAND
case V4L2_PIX_FMT_NV12_COL128:
mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
layer->format = V4L2_PIX_FMT_NV12;
@@ -1074,9 +1093,10 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
layer->nb_planes = 2;
layer->planes[1].object_index = 0;
layer->planes[1].offset = height * 128;
- layer->planes[0].pitch = width;
- layer->planes[1].pitch = width;
+ layer->planes[0].pitch = fmt_width(fmt);
+ layer->planes[1].pitch = layer->planes[0].pitch;
break;
+#endif
case DRM_FORMAT_NV12:
layer->format = V4L2_PIX_FMT_NV12;
@@ -1576,7 +1596,10 @@ static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
case DRM_FORMAT_NV12:
return is_linear ? V4L2_PIX_FMT_NV12 :
- fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0;
+#if CONFIG_SAND
+ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 :
+#endif
+ 0;
default:
break;
}
From f06f9ee41bf0f6f74240503f0cb427328cf6792f Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sun, 2 Oct 2022 12:36:43 +0000
Subject: [PATCH 082/161] configure: Fix v4l2_req_hevc_vx setup; set after deps
fixups
---
configure | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/configure b/configure
index 5c00a183e3..94c8161b91 100755
--- a/configure
+++ b/configure
@@ -6914,12 +6914,6 @@ fi
check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
disable v4l2_req_hevc_vx
-if enabled hevc_v4l2request_hwaccel; then
- enable v4l2_req_hevc_vx
-fi
-if enabled hevc_v4l2_request; then
- disable v4l2_req_hevc_vx
-fi
check_headers sys/videoio.h
test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
@@ -7415,6 +7409,9 @@ check_deps $CONFIG_LIST \
enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86"
+# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done
+enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx
+
case $target_os in
haiku)
disable memalign
From 7d7709fb68561711f893269227147974fd6a46f3 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sat, 1 Oct 2022 12:39:45 +0000
Subject: [PATCH 083/161] vf_deinterlace_v4l2m2m: Ensure we get consistent
final frames
On getting EOS at the input of the filter do not simply drop everything
in transit on the floor but attempt to retrieve everything possible from
the capture Q before signalling EOS onward.
If we know that we expect 1 frame in to always produce 1 frame out then
match the CAPTURE frame to the last OUTPUT frame Qed (scale).
If frames out have an unknown relation to source frames (deinterlace) try
an encode stop and wait for the last frame marker to emerge from CAPTURE.
---
libavfilter/vf_deinterlace_v4l2m2m.c | 172 +++++++++++++++++++++++----
1 file changed, 148 insertions(+), 24 deletions(-)
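The drain strategy picked when EOS arrives on the inlink can be summarised
as follows (condensed from ack_inlink() in the hunks below):

    /* drain_state chosen on inlink EOS:
     *   stream never started              -> DRAIN_DONE    nothing can be in flight
     *   one-to-one filter (scale)         -> DRAIN_LAST    done once the PTS tracker empties
     *   driver accepts V4L2_ENC_CMD_STOP  -> DRAIN_EOS     wait for V4L2_BUF_FLAG_LAST on CAPTURE
     *   otherwise                         -> DRAIN_TIMEOUT keep dequeuing until a long DQ times out
     */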
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 716789f988..ce875c2c61 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -94,6 +94,7 @@ typedef struct V4L2Buffer {
typedef struct V4L2Queue {
struct v4l2_format format;
struct v4l2_selection sel;
+ int eos;
int num_buffers;
V4L2Buffer *buffers;
const char * name;
@@ -127,20 +128,41 @@ typedef struct pts_track_s
pts_track_el_t a[PTS_TRACK_SIZE];
} pts_track_t;
+typedef enum drain_state_e
+{
+ DRAIN_NONE = 0, // Not draining
+ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame
+ DRAIN_LAST, // Drain with long timeout; the last frame Qed on OUTPUT is expected back on CAPTURE
+ DRAIN_EOS, // Drain with long timeout EOS expected
+ DRAIN_DONE // Drained
+} drain_state_t;
+
typedef struct DeintV4L2M2MContextShared {
void * logctx; // For logging - will be NULL when done
filter_type_v4l2_t filter_type;
int fd;
- int done;
+ int done; // fd closed - awaiting all refs dropped
int width;
int height;
+ int drain; // EOS received (inlink status)
+ drain_state_t drain_state;
+ int64_t drain_pts; // PTS associated with inlink status
+
+ unsigned int frames_rx;
+ unsigned int frames_tx;
+
// from options
int output_width;
int output_height;
enum AVPixelFormat output_format;
+ int has_enc_stop;
+ // We expect to get exactly the same number of frames out as we put in
+ // We can drain by matching input to output
+ int one_to_one;
+
int orig_width;
int orig_height;
atomic_uint refcount;
@@ -179,6 +201,12 @@ typedef struct DeintV4L2M2MContext {
enum AVChromaLocation chroma_location;
} DeintV4L2M2MContext;
+
+static inline int drain_frame_expected(const drain_state_t d)
+{
+ return d == DRAIN_EOS || d == DRAIN_LAST;
+}
+
// These just list the ones we know we can cope with
static uint32_t
fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
@@ -334,6 +362,13 @@ fail:
return 0;
}
+// We are only ever expecting in-order frames so nothing more clever is required
+static unsigned int
+pts_track_count(const pts_track_t * const trk)
+{
+ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1);
+}
+
static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
{
const uint32_t n = pts_track_next_n(trk);
@@ -406,6 +441,12 @@ fmt_pixelformat(const struct v4l2_format * const fmt)
return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
}
+static inline uint32_t
+buf_bytesused0(const struct v4l2_buffer * const buf)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->m.planes[0].bytesused : buf->bytesused;
+}
+
static void
init_format(V4L2Queue * const q, const uint32_t format_type)
{
@@ -1469,12 +1510,24 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim
av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ if (queue->eos) {
+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__);
+ return AVERROR_EOF;
+ }
+
avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
if (!avbuf) {
av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
return AVERROR(EAGAIN);
}
+ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) {
+ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0)
+ queue->eos = 1;
+ if (buf_bytesused0(&avbuf->buffer) == 0)
+ return queue->eos ? AVERROR_EOF : AVERROR(EINVAL);
+ }
+
// Fill in PTS and anciliary info from src frame
pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
@@ -1686,6 +1739,20 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
else
ctx->field_order = V4L2_FIELD_INTERLACED_BT;
+ {
+ struct v4l2_encoder_cmd ecmd = {
+ .cmd = V4L2_ENC_CMD_STOP
+ };
+ ctx->has_enc_stop = 0;
+ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n");
+ ctx->has_enc_stop = 1;
+ }
+ else {
+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno)));
+ }
+
+ }
}
ret = deint_v4l2m2m_enqueue_frame(output, in);
@@ -1694,6 +1761,41 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
return ret;
}
+static int
+ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s,
+ AVFilterLink * const inlink)
+{
+ int instatus;
+ int64_t inpts;
+
+ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0)
+ return 0;
+
+ s->drain = instatus;
+ s->drain_pts = inpts;
+ s->drain_state = DRAIN_TIMEOUT;
+
+ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started
+ s->drain_state = DRAIN_DONE;
+ }
+ else if (s->one_to_one) {
+ s->drain_state = DRAIN_LAST;
+ }
+ else if (s->has_enc_stop) {
+ struct v4l2_encoder_cmd ecmd = {
+ .cmd = V4L2_ENC_CMD_STOP
+ };
+ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) {
+ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n");
+ s->drain_state = DRAIN_EOS;
+ }
+ else {
+ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno)));
+ }
+ }
+ return 1;
+}
+
static int deint_v4l2m2m_activate(AVFilterContext *avctx)
{
DeintV4L2M2MContext * const priv = avctx->priv;
@@ -1702,15 +1804,13 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx)
AVFilterLink * const inlink = avctx->inputs[0];
int n = 0;
int cn = 99;
- int instatus = 0;
- int64_t inpts = 0;
int did_something = 0;
av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
- ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
+ ack_inlink(avctx, s, inlink);
if (!ff_outlink_frame_wanted(outlink)) {
av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
@@ -1720,7 +1820,6 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx)
AVFrame * frame = av_frame_alloc();
int rv;
-again:
recycle_q(&s->output);
n = count_enqueued(&s->output);
@@ -1729,10 +1828,21 @@ again:
return AVERROR(ENOMEM);
}
- rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
+ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame,
+ drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0);
if (rv != 0) {
av_frame_free(&frame);
- if (rv != AVERROR(EAGAIN)) {
+ if (rv == AVERROR_EOF) {
+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__);
+ s->drain_state = DRAIN_DONE;
+ }
+ else if (rv == AVERROR(EAGAIN)) {
+ if (s->drain_state != DRAIN_NONE) {
+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__);
+ s->drain_state = DRAIN_DONE;
+ }
+ }
+ else {
av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
return rv;
}
@@ -1742,29 +1852,30 @@ again:
// frame is always consumed by filter_frame - even on error despite
// a somewhat confusing comment in the header
rv = ff_filter_frame(outlink, frame);
-
- if (instatus != 0) {
- av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
- goto again;
- }
+ ++s->frames_tx;
av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
did_something = 1;
+
+ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) {
+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__);
+ s->drain_state = DRAIN_DONE;
+ }
}
cn = count_enqueued(&s->capture);
}
- if (instatus != 0) {
- ff_outlink_set_status(outlink, instatus, inpts);
- av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
+ if (s->drain_state == DRAIN_DONE) {
+ ff_outlink_set_status(outlink, s->drain, s->drain_pts);
+ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain));
return 0;
}
recycle_q(&s->output);
n = count_enqueued(&s->output);
- while (n < 6) {
+ while (n < 6 && !s->drain) {
AVFrame * frame;
int rv;
@@ -1775,8 +1886,13 @@ again:
if (frame == NULL) {
av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
+ if (!ack_inlink(avctx, s, inlink)) {
+ ff_inlink_request_frame(inlink);
+ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
+ }
break;
}
+ ++s->frames_rx;
rv = deint_v4l2m2m_filter_frame(inlink, frame);
av_frame_free(&frame);
@@ -1785,16 +1901,11 @@ again:
return rv;
av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
- ++n;
- }
-
- if (n < 6) {
- ff_inlink_request_frame(inlink);
did_something = 1;
- av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
+ ++n;
}
- if (n > 4 && ff_outlink_frame_wanted(outlink)) {
+ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) {
ff_filter_set_ready(avctx, 1);
did_something = 1;
av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
@@ -1873,7 +1984,18 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
{
- return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE);
+ int rv;
+ DeintV4L2M2MContext * priv;
+ DeintV4L2M2MContextShared * ctx;
+
+ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0)
+ return rv;
+
+ priv = avctx->priv;
+ ctx = priv->shared;
+
+ ctx->one_to_one = 1;
+ return 0;
}
static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
@@ -1881,6 +2003,8 @@ static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
DeintV4L2M2MContext *priv = avctx->priv;
DeintV4L2M2MContextShared *ctx = priv->shared;
+ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n",
+ ctx->frames_rx, ctx->frames_tx);
ctx->done = 1;
ctx->logctx = NULL; // Log to NULL works, log to missing crashes
pts_track_uninit(&ctx->track);
From f893891df8f4e7738b2d9b49df4386fb160eb25f Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 5 Oct 2022 16:12:02 +0000
Subject: [PATCH 084/161] v4l2_m2m_dec: Rework decode pending heuristic
The old code measured the length of the entire Q in the decoder and
attempted to dynamically guess an appropriate length. This was prone to
failure when the guesswork became confused.
The new code attempts to measure the Q length before insertion into decode
which, after all, is what we actually care about. It does this by
asserting that the decoder must have consumed all packets that came
before the one associated with the most recent CAPTURE frame. This
avoids all need for reorder buffer size guesswork.
---
libavcodec/v4l2_m2m.h | 2 -
libavcodec/v4l2_m2m_dec.c | 77 +++++++++++++++++----------------------
2 files changed, 34 insertions(+), 45 deletions(-)
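A minimal sketch of the new measure (names hypothetical; the real
xlat_pending() below also copes with flushed and never-pending entries):

    #include <stdint.h>
    #include "libavutil/avutil.h"   /* AV_NOPTS_VALUE */

    /* Walk back from the newest submitted packet; once we reach one whose
     * DTS is <= the PTS of the latest CAPTURE frame, it and everything
     * older must already be inside the decoder, so only the packets in
     * front of it are still pending. */
    static int pending_guess(const int64_t *dts_ring, unsigned int newest,
                             unsigned int ring_size, int64_t last_capture_pts)
    {
        unsigned int n = newest % ring_size;
        int i;

        for (i = 0; i < (int)ring_size; ++i, n = (n + ring_size - 1) % ring_size) {
            if (last_capture_pts != AV_NOPTS_VALUE &&
                dts_ring[n] != AV_NOPTS_VALUE &&
                last_capture_pts >= dts_ring[n])
                break;
        }
        return i;
    }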
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index babf101d65..26a7161042 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -107,8 +107,6 @@ typedef struct V4L2m2mContext {
/* Frame tracking */
xlat_track_t xlat;
- int pending_hw;
- int pending_n;
pts_stats_t pts_stat;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 6bd9926b3f..bec9b22fcf 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -349,41 +349,54 @@ static void
xlat_flush(xlat_track_t * const x)
{
unsigned int i;
+ // Do not reset track_no - this ensures that any frames left in the decoder
+ // that turn up later get discarded.
+
+ x->last_pts = AV_NOPTS_VALUE;
+ x->last_opaque = 0;
for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
x->track_els[i].pending = 0;
x->track_els[i].discard = 1;
}
- x->last_pts = AV_NOPTS_VALUE;
+}
+
+static void
+xlat_init(xlat_track_t * const x)
+{
+ memset(x, 0, sizeof(*x));
+ xlat_flush(x);
}
static int
xlat_pending(const xlat_track_t * const x)
{
unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
- unsigned int i;
- int r = 0;
- int64_t now = AV_NOPTS_VALUE;
+ int i;
+ const int64_t now = x->last_pts;
- for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
+ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
const V4L2m2mTrackEl * const t = x->track_els + n;
+ // Discard only set on never-set or flushed entries
+ // So if we get here we've never successfully decoded a frame so allow
+ // more frames into the buffer before stalling
+ if (t->discard)
+ return i - 16;
+
+ // If we've got this frame out then everything before this point
+ // must have entered the decoder
if (!t->pending)
- continue;
+ break;
+ // If we've never seen a pts all we can do is count frames
if (now == AV_NOPTS_VALUE)
- now = t->dts;
+ continue;
- if (t->pts == AV_NOPTS_VALUE ||
- ((now == AV_NOPTS_VALUE || t->pts <= now) &&
- (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
- ++r;
+ if (t->dts != AV_NOPTS_VALUE && now >= t->dts)
+ break;
}
- // If we never get any ideas about PTS vs DTS allow a lot more buffer
- if (now == AV_NOPTS_VALUE)
- r -= 16;
-
- return r;
+ return i;
}
static inline int stream_started(const V4L2m2mContext * const s) {
@@ -557,18 +570,6 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
return rv;
}
-// Number of frames over what xlat_pending returns that we keep *16
-// This is a min value - if it appears to be too small the threshold should
-// adjust dynamically.
-#define PENDING_HW_MIN (3 * 16)
-// Offset to use when setting dynamically
-// Set to %16 == 15 to avoid the threshold changing immediately as we relax
-#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1)
-// Number of consecutive times we've failed to get a frame when we prefer it
-// before we increase the prefer threshold (5ms * N = max expected decode
-// time)
-#define PENDING_N_THRESHOLD 6
-
static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
@@ -578,9 +579,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
do {
const int pending = xlat_pending(&s->xlat);
- const int prefer_dq = (pending > s->pending_hw / 16);
+ const int prefer_dq = (pending > 3);
const int last_src_rv = src_rv;
+ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
+
// Enqueue another pkt for decode if
// (a) We don't have a lot of stuff in the buffer already OR
// (b) ... we (think we) do but we've failed to get a frame already OR
@@ -625,20 +628,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
}
- // Adjust dynamic pending threshold
- if (dst_rv == 0) {
- if (--s->pending_hw < PENDING_HW_MIN)
- s->pending_hw = PENDING_HW_MIN;
- s->pending_n = 0;
-
+ if (dst_rv == 0)
set_best_effort_pts(avctx, &s->pts_stat, frame);
- }
- else if (dst_rv == AVERROR(EAGAIN)) {
- if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
- s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
- s->pending_n = 0;
- }
- }
if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
@@ -857,8 +848,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
if (ret < 0)
return ret;
+ xlat_init(&s->xlat);
pts_stats_init(&s->pts_stat, avctx, "decoder");
- s->pending_hw = PENDING_HW_MIN;
capture = &s->capture;
output = &s->output;
From 7048e7e6b8621cf09b96cc7e44b8d82ba8619913 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 21 Oct 2022 13:48:07 +0000
Subject: [PATCH 085/161] pthread_frame: Fix MT hwaccel. Recent change broke
it.
Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the
hwaccel is marked MT_SAFE.
---
libavcodec/pthread_frame.c | 48 ++++++++++++++++++++++++++++----------
1 file changed, 36 insertions(+), 12 deletions(-)
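The recurring pattern across the touched sites is the same guard; as a sketch
(helper name hypothetical), only hwaccels without the MT_SAFE capability keep
the old stash-and-swap behaviour:

    static inline int hwaccel_is_mt_safe(const AVCodecContext *avctx)
    {
        return avctx->hwaccel &&
               (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) != 0;
    }

    /* assertions become:  av_assert0(!avctx->hwaccel || hwaccel_is_mt_safe(avctx));
     * stash/FFSWAP blocks run only when avctx->hwaccel && !hwaccel_is_mt_safe(avctx) */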
diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
index 2cc89a41f5..b14f8e9360 100644
--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@@ -231,7 +231,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
p->hwaccel_serializing = 0;
pthread_mutex_unlock(&p->parent->hwaccel_mutex);
}
- av_assert0(!avctx->hwaccel);
+ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
if (p->async_serializing) {
p->async_serializing = 0;
@@ -319,6 +319,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
}
dst->hwaccel_flags = src->hwaccel_flags;
+ if (src->hwaccel &&
+ (src->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ dst->hwaccel = src->hwaccel;
+ dst->hwaccel_context = src->hwaccel_context;
+ dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
+ }
err = av_buffer_replace(&dst->internal->pool, src->internal->pool);
if (err < 0)
@@ -434,10 +440,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx,
}
/* transfer the stashed hwaccel state, if any */
- av_assert0(!p->avctx->hwaccel);
- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel);
- FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context);
- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
+ av_assert0(!p->avctx->hwaccel || (p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
+ if (p->avctx->hwaccel &&
+ !(p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel);
+ FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context);
+ FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
+ }
av_packet_unref(p->avpkt);
ret = av_packet_ref(p->avpkt, avpkt);
@@ -610,9 +619,12 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
* this is done here so that this worker thread can wipe its own hwaccel
* state after decoding, without requiring synchronization */
av_assert0(!p->parent->stash_hwaccel);
- p->parent->stash_hwaccel = avctx->hwaccel;
- p->parent->stash_hwaccel_context = avctx->hwaccel_context;
- p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data;
+ if (avctx->hwaccel &&
+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ p->parent->stash_hwaccel = avctx->hwaccel;
+ p->parent->stash_hwaccel_context = avctx->hwaccel_context;
+ p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data;
+ }
pthread_mutex_lock(&p->progress_mutex);
if(atomic_load(&p->state) == STATE_SETUP_FINISHED){
@@ -667,6 +679,15 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
park_frame_worker_threads(fctx, thread_count);
+ if (fctx->prev_thread &&
+ avctx->hwaccel && (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
+ avctx->internal->hwaccel_priv_data !=
+ fctx->prev_thread->avctx->internal->hwaccel_priv_data) {
+ if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n");
+ }
+ }
+
for (i = 0; i < thread_count; i++) {
PerThreadContext *p = &fctx->threads[i];
AVCodecContext *ctx = p->avctx;
@@ -710,10 +731,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
/* if we have stashed hwaccel state, move it to the user-facing context,
* so it will be freed in avcodec_close() */
- av_assert0(!avctx->hwaccel);
- FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel);
- FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context);
- FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
+ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
+ if (avctx->hwaccel &&
+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel);
+ FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context);
+ FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
+ }
av_freep(&avctx->internal->thread_ctx);
}
From 033056bd8ec63b16fe081446f70f41b5d5789b81 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 18 Oct 2022 13:18:27 +0000
Subject: [PATCH 086/161] v4l2_req: Add swfmt to init logging
(cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf)
---
libavcodec/v4l2_request_hevc.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index 614a1b4d99..767ecb036a 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -26,6 +26,7 @@
#include "v4l2_request_hevc.h"
#include "libavutil/hwcontext_drm.h"
+#include "libavutil/pixdesc.h"
#include "v4l2_req_devscan.h"
#include "v4l2_req_dmabufs.h"
@@ -306,10 +307,11 @@ retry_src_memtype:
// Set our s/w format
avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
- av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n",
+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n",
ctx->fns->name,
decdev_media_path(decdev), decdev_video_path(decdev),
- mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype));
+ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype),
+ av_get_pix_fmt_name(avctx->sw_pix_fmt));
return 0;
From 70779e742b93015e3e8aaa8f945a12d35917844d Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 18 Oct 2022 13:39:54 +0000
Subject: [PATCH 087/161] v4l2_m2m: Avoid polling on a queue that is streamoff
(cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b)
---
libavcodec/v4l2_context.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 4a359bf45e..b296dc111c 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -578,6 +578,11 @@ get_event(V4L2m2mContext * const m)
return 0;
}
+static inline int
+dq_ok(const V4L2Context * const c)
+{
+ return c->streamon && atomic_load(&c->q_count) != 0;
+}
// Get a buffer
// If output then just gets the buffer in the expected way
@@ -613,13 +618,13 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout
}
// If capture && timeout == -1 then also wait for rx buffer free
- if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
+ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining)
pfd.events |= poll_out;
// If nothing Qed all we will get is POLLERR - avoid that
- if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
- (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
- (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
+ if ((pfd.events == poll_out && !dq_ok(&m->output)) ||
+ (pfd.events == poll_cap && !dq_ok(&m->capture)) ||
+ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) {
av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
return AVERROR(ENOSPC);
}
From 438fed3702eb689f836c885ebbd813e48d4d4c4a Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 18 Oct 2022 14:07:04 +0000
Subject: [PATCH 088/161] v4l2_m2m: Add function to get number of queued
buffers
(cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4)
---
libavcodec/v4l2_context.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 523c53e97d..8e4f681643 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -220,4 +220,15 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
void ff_v4l2_dq_all(V4L2Context *const ctx);
+/**
+ * Returns the number of buffers currently queued
+ *
+ * @param[in] ctx The V4L2Context to evaluate
+ */
+static inline int
+ff_v4l2_context_q_count(const V4L2Context* const ctx)
+{
+ return atomic_load(&ctx->q_count);
+}
+
#endif // AVCODEC_V4L2_CONTEXT_H
From 95ff4a65ed4c88ea7e02ee55e260e37a0ce2ba88 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 18 Oct 2022 14:48:20 +0000
Subject: [PATCH 089/161] v4l2_m2m: Add timeouts to dq_all and dequeue_packet
Add timeouts and use them to have better flow control in encode
(cherry picked from commit c6173cad7f21697e12887982bda796de9719bb32)
---
libavcodec/v4l2_context.c | 16 +++++++++++-----
libavcodec/v4l2_context.h | 15 +++++++++++++--
libavcodec/v4l2_m2m_enc.c | 28 +++++++++++++++++++---------
3 files changed, 43 insertions(+), 16 deletions(-)
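The net effect on the encoder's send path is the flow-control pattern below
(condensed from the v4l2_m2m_enc.c hunk that follows):

    int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers;

    /* If every src buffer is still queued, block (up to 500ms) for one to
     * come back; otherwise just reclaim whatever has already completed. */
    if ((rv = ff_v4l2_dq_all(output, needs_slot ? 500 : 0)) != 0)
        return rv;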
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index b296dc111c..7031f3d340 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -712,13 +712,19 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf)
return avbuf;
}
-void
-ff_v4l2_dq_all(V4L2Context *const ctx)
+int
+ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1)
{
V4L2Buffer * avbuf;
+ if (timeout1 != 0) {
+ int rv = get_qbuf(ctx, &avbuf, timeout1);
+ if (rv != 0)
+ return rv;
+ }
do {
get_qbuf(ctx, &avbuf, 0);
} while (avbuf);
+ return 0;
}
static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
@@ -727,7 +733,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
/* get back as many output buffers as possible */
if (V4L2_TYPE_IS_OUTPUT(ctx->type))
- ff_v4l2_dq_all(ctx);
+ ff_v4l2_dq_all(ctx, 0);
for (i = 0; i < ctx->num_buffers; i++) {
V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
@@ -1047,7 +1053,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
return 0;
}
-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout)
{
V4L2m2mContext *s = ctx_to_m2mctx(ctx);
AVCodecContext *const avctx = s->avctx;
@@ -1055,7 +1061,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
int rv;
do {
- if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
return rv;
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 8e4f681643..5afed3e6ec 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -179,7 +179,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd);
* @param[inout] pkt The AVPacket to dequeue to.
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
*/
-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout);
/**
* Dequeues a buffer from a V4L2Context to an AVFrame.
@@ -218,7 +218,18 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const
*/
int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
-void ff_v4l2_dq_all(V4L2Context *const ctx);
+/**
+ * Dequeue all buffers on this queue
+ *
+ * Used to recycle output buffers
+ *
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[in] timeout1 A timeout on dequeuing the 1st buffer,
+ * all others have a timeout of zero
+ * @return AVERROR(EAGAIN) if timeout1 non-zero then the return
+ * of the first dequeue operation, 0 otherwise.
+ */
+int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1);
/**
* Returns the number of buffers currently queued
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index a992a3cccc..d0d27e5bc2 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -420,16 +420,24 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
{
V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
V4L2Context *const output = &s->output;
+ int rv;
+ int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers;
- ff_v4l2_dq_all(output);
+ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
- // Signal EOF if needed
+ // Signal EOF if needed (doesn't need q slot)
if (!frame) {
return ff_v4l2_context_enqueue_frame(output, frame);
}
+ if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) {
+ // We should be able to return AVERROR(EAGAIN) to indicate buffer
+ // exhaustion, but ffmpeg currently treats that as fatal.
+ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv));
+ return rv;
+ }
+
if (s->input_drm && !output->streamon) {
- int rv;
struct v4l2_format req_format = {.type = output->format.type};
// Set format when we first get a buffer
@@ -494,7 +502,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
AVFrame *frame = s->frame;
int ret;
- ff_v4l2_dq_all(output);
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+
+ ff_v4l2_dq_all(output, 0);
if (s->draining)
goto dequeue;
@@ -532,10 +542,10 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
}
dequeue:
- ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
- ff_v4l2_dq_all(output);
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0);
+ ff_v4l2_dq_all(output, 0);
if (ret)
- return ret;
+ return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
if (capture->first_buf == 1) {
uint8_t * data;
@@ -566,8 +576,8 @@ dequeue:
s->extdata_size = len;
}
- ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
- ff_v4l2_dq_all(output);
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0);
+ ff_v4l2_dq_all(output, 0);
if (ret)
return ret;
}
From e6654c1997a6f4dfd43b0f74b0168f5d644c1c74 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 18 Oct 2022 14:23:32 +0000
Subject: [PATCH 090/161] v4l2_m2m_enc: Improve debug trace
(cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5)
---
libavcodec/v4l2_m2m_enc.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index d0d27e5bc2..c8c2de3d47 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -427,6 +427,7 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
// Signal EOF if needed (doesn't need q slot)
if (!frame) {
+ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__);
return ff_v4l2_context_enqueue_frame(output, frame);
}
@@ -491,7 +492,12 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
#endif
- return ff_v4l2_context_enqueue_frame(output, frame);
+ rv = ff_v4l2_context_enqueue_frame(output, frame);
+ if (rv) {
+ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv));
+ }
+
+ return rv;
}
static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
@@ -502,7 +508,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
AVFrame *frame = s->frame;
int ret;
- av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
+ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__,
+ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture));
ff_v4l2_dq_all(output, 0);
@@ -615,11 +622,11 @@ dequeue:
avpkt->size = newlen;
}
-// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
capture->first_buf = 0;
return 0;
fail_no_mem:
+ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n");
ret = AVERROR(ENOMEM);
av_packet_unref(avpkt);
return ret;
From 02dca2b845125af7ec6dfb68bdc34726a45fee9c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 18 Oct 2022 13:22:36 +0000
Subject: [PATCH 091/161] v4l2_m2m_enc: Copy dest packets to memory if short of
v4l2 buffers
(cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5)
---
libavcodec/v4l2_m2m_enc.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index c8c2de3d47..c23187e6e6 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -621,6 +621,22 @@ dequeue:
avpkt->data = buf->data;
avpkt->size = newlen;
}
+ else if (ff_v4l2_context_q_count(capture) < 2) {
+ // Avoid running out of capture buffers
+ // In most cases the buffers will be returned quickly in which case
+ // we don't copy and can use the v4l2 buffers directly but sometimes
+ // ffmpeg seems to hold onto all of them for a long time (.mkv
+ // creation?) so avoid deadlock in those cases.
+ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
+ if (buf == NULL)
+ goto fail_no_mem;
+
+ memcpy(buf->data, avpkt->data, avpkt->size);
+ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer
+
+ avpkt->buf = buf;
+ avpkt->data = buf->data;
+ }
capture->first_buf = 0;
return 0;
From ced9a7d442a04be08fc23e0af310312299a5d5a0 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 19 Oct 2022 11:00:16 +0000
Subject: [PATCH 092/161] v4l2_m2m_dec: Fix pts_best_effort guessing for
initial pts
(cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67)
---
libavcodec/v4l2_m2m_dec.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index bec9b22fcf..47b2735f82 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -113,6 +113,8 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len)
static int64_t pts_stats_guess(const pts_stats_t * const stats)
{
+ if (stats->last_count <= 1)
+ return stats->last_pts;
if (stats->last_pts == AV_NOPTS_VALUE ||
stats->last_interval == 0 ||
stats->last_count >= STATS_LAST_COUNT_MAX)
From 3e3cf6ed7280d8ad4f3eed17a6d18c2df3c0cd31 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 19 Oct 2022 14:47:04 +0000
Subject: [PATCH 093/161] v4l2_m2m_enc: Wait for frame or space in src Q in
rx_pkt
In receive_packet we should ensure that there is space in the source Q
if we return EAGAIN, so wait for either an output packet or space if
the source Q is currently full.
(cherry picked from commit 82f0c55782a67a8cc665d937647706c2a75f5548)
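For reference, a minimal sketch of that wait loop, using the same names as
the diff below; it simply restates the change and is not additional code to
apply:

    // If the output (source) queue is full, wait (t = -1) for either a coded
    // packet or an output buffer to come free; on EAGAIN after such a wait,
    // give the output side up to 300ms to recycle a buffer, then retry.
    for (;;) {
        int t = q_full(output) ? -1 : s->draining ? 300 : 0;
        ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t);
        t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300;
        if (t == 0 || ff_v4l2_dq_all(output, t) != 0)
            break;
    }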
---
libavcodec/v4l2_m2m_enc.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index c23187e6e6..524e9424a5 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -415,13 +415,17 @@ static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format *
return 1;
}
+static inline int q_full(const V4L2Context *const output)
+{
+ return ff_v4l2_context_q_count(output) == output->num_buffers;
+}
static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
{
V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
V4L2Context *const output = &s->output;
int rv;
- int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers;
+ const int needs_slot = q_full(output);
av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
@@ -549,8 +553,20 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
}
dequeue:
- ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0);
- ff_v4l2_dq_all(output, 0);
+ // Dequeue a frame
+ for (;;) {
+ int t = q_full(output) ? -1 : s->draining ? 300 : 0;
+ int rv2;
+
+ // If output is full wait for either a packet or output to become not full
+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t);
+
+ // If output was full retry packet dequeue
+ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300;
+ rv2 = ff_v4l2_dq_all(output, t);
+ if (t == 0 || rv2 != 0)
+ break;
+ }
if (ret)
return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
From de9ec2bf6421b199aad9ea9dc7896a46c8813d94 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 19 Oct 2022 14:54:29 +0000
Subject: [PATCH 094/161] vf_deinterlace_v4l2m2m: Print dts rather than NOPTS
in trace
(cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a)
---
libavfilter/vf_deinterlace_v4l2m2m.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index ce875c2c61..7c6751b69c 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -1668,8 +1668,8 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
V4L2Queue *output = &ctx->output;
int ret;
- av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
- __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
+ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n",
+ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
From d71a0a173240e18d518ae0b921ac43849524bd66 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 19 Oct 2022 14:55:21 +0000
Subject: [PATCH 095/161] vf_deinterlace_v4l2m2m: Ignore "wanted" when
processing input
If we gate sending a frame to the outlink on its frame_wanted flag then
we will sometimes stall, as the flag may not get set by ffmpeg's filter
processing. So stuff the output whether or not it wants it, which works
much better.
(cherry picked from commit 808254cc04e5e6574cbab9af254b6c2f3d4142e3)
---
libavfilter/vf_deinterlace_v4l2m2m.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 7c6751b69c..a173a291f8 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -1812,10 +1812,7 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx)
ack_inlink(avctx, s, inlink);
- if (!ff_outlink_frame_wanted(outlink)) {
- av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
- }
- else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup!
+ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup!
{
AVFrame * frame = av_frame_alloc();
int rv;
From 842e0a00288f9a2a862720990791b8eca9546955 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 19 Oct 2022 15:00:43 +0000
Subject: [PATCH 096/161] conf_native: Add --enable-gpl
(cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15)
---
pi-util/conf_native.sh | 1 +
1 file changed, 1 insertion(+)
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
index f22d531ca4..082d9b5832 100755
--- a/pi-util/conf_native.sh
+++ b/pi-util/conf_native.sh
@@ -94,6 +94,7 @@ $FFSRC/configure \
--enable-libdrm\
--enable-vout-egl\
--enable-vout-drm\
+ --enable-gpl\
$SHARED_LIBS\
$RPIOPTS\
--extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
From bf9aaf30818308a4651e00a2a64a0f65dc9a36e5 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 15 Nov 2022 13:33:00 +0000
Subject: [PATCH 097/161] egl_vout: Make formatting consistent - no code
changes
---
libavdevice/egl_vout.c | 741 ++++++++++++++++++++---------------------
1 file changed, 369 insertions(+), 372 deletions(-)
diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
index 7b9c610ace..a52cabb082 100644
--- a/libavdevice/egl_vout.c
+++ b/libavdevice/egl_vout.c
@@ -48,20 +48,20 @@
#define TRACE_ALL 0
struct egl_setup {
- int conId;
-
- Display *dpy;
- EGLDisplay egl_dpy;
- EGLContext ctx;
- EGLSurface surf;
- Window win;
-
- uint32_t crtcId;
- int crtcIdx;
- uint32_t planeId;
- struct {
- int x, y, width, height;
- } compose;
+ int conId;
+
+ Display *dpy;
+ EGLDisplay egl_dpy;
+ EGLContext ctx;
+ EGLSurface surf;
+ Window win;
+
+ uint32_t crtcId;
+ int crtcIdx;
+ uint32_t planeId;
+ struct {
+ int x, y, width, height;
+ } compose;
};
typedef struct egl_aux_s {
@@ -70,8 +70,7 @@ typedef struct egl_aux_s {
} egl_aux_t;
-typedef struct egl_display_env_s
-{
+typedef struct egl_display_env_s {
AVClass *class;
struct egl_setup setup;
@@ -89,8 +88,8 @@ typedef struct egl_display_env_s
sem_t display_start_sem;
sem_t q_sem;
int q_terminate;
- AVFrame * q_this;
- AVFrame * q_next;
+ AVFrame *q_this;
+ AVFrame *q_next;
} egl_display_env_t;
@@ -99,45 +98,44 @@ typedef struct egl_display_env_s
* Remove window border/decorations.
*/
static void
-no_border( Display *dpy, Window w)
+no_border(Display *dpy, Window w)
{
- static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
- static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
-
- typedef struct
- {
- unsigned long flags;
- unsigned long functions;
- unsigned long decorations;
- long inputMode;
- unsigned long status;
- } PropMotifWmHints;
-
- PropMotifWmHints motif_hints;
- Atom prop, proptype;
- unsigned long flags = 0;
-
- /* setup the property */
- motif_hints.flags = MWM_HINTS_DECORATIONS;
- motif_hints.decorations = flags;
-
- /* get the atom for the property */
- prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
- if (!prop) {
- /* something went wrong! */
- return;
- }
-
- /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
- proptype = prop;
-
- XChangeProperty( dpy, w, /* display, window */
+ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
+ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
+
+ typedef struct {
+ unsigned long flags;
+ unsigned long functions;
+ unsigned long decorations;
+ long inputMode;
+ unsigned long status;
+ } PropMotifWmHints;
+
+ PropMotifWmHints motif_hints;
+ Atom prop, proptype;
+ unsigned long flags = 0;
+
+ /* setup the property */
+ motif_hints.flags = MWM_HINTS_DECORATIONS;
+ motif_hints.decorations = flags;
+
+ /* get the atom for the property */
+ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True);
+ if (!prop) {
+ /* something went wrong! */
+ return;
+ }
+
+ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
+ proptype = prop;
+
+ XChangeProperty(dpy, w, /* display, window */
prop, proptype, /* property, type */
32, /* format: 32-bit datums */
PropModeReplace, /* mode */
- (unsigned char *) &motif_hints, /* data */
+ (unsigned char *)&motif_hints, /* data */
PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */
- );
+ );
}
@@ -146,247 +144,247 @@ no_border( Display *dpy, Window w)
* Return the window and context handles.
*/
static int
-make_window(struct AVFormatContext * const s,
- egl_display_env_t * const de,
+make_window(struct AVFormatContext *const s,
+ egl_display_env_t *const de,
Display *dpy, EGLDisplay egl_dpy, const char *name,
Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
{
- int scrnum = DefaultScreen( dpy );
- XSetWindowAttributes attr;
- unsigned long mask;
- Window root = RootWindow( dpy, scrnum );
- Window win;
- EGLContext ctx;
- const int fullscreen = de->fullscreen;
- EGLConfig config;
- int x = de->window_x;
- int y = de->window_y;
- int width = de->window_width ? de->window_width : 1280;
- int height = de->window_height ? de->window_height : 720;
-
-
- if (fullscreen) {
- int scrnum = DefaultScreen(dpy);
-
- x = 0; y = 0;
- width = DisplayWidth(dpy, scrnum);
- height = DisplayHeight(dpy, scrnum);
- }
-
- {
- EGLint num_configs;
- static const EGLint attribs[] = {
- EGL_RED_SIZE, 1,
- EGL_GREEN_SIZE, 1,
- EGL_BLUE_SIZE, 1,
- EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
- EGL_NONE
- };
-
- if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
- av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
- return -1;
- }
- }
-
- {
- EGLint vid;
- if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
- av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
- return -1;
- }
-
- {
- XVisualInfo visTemplate = {
- .visualid = vid,
- };
- int num_visuals;
- XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
- &visTemplate, &num_visuals);
-
- /* window attributes */
- attr.background_pixel = 0;
- attr.border_pixel = 0;
- attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
- attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
- /* XXX this is a bad way to get a borderless window! */
- mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
-
- win = XCreateWindow( dpy, root, x, y, width, height,
- 0, visinfo->depth, InputOutput,
- visinfo->visual, mask, &attr );
- XFree(visinfo);
- }
- }
-
- if (fullscreen)
- no_border(dpy, win);
-
- /* set hints and properties */
- {
- XSizeHints sizehints;
- sizehints.x = x;
- sizehints.y = y;
- sizehints.width = width;
- sizehints.height = height;
- sizehints.flags = USSize | USPosition;
- XSetNormalHints(dpy, win, &sizehints);
- XSetStandardProperties(dpy, win, name, name,
- None, (char **)NULL, 0, &sizehints);
- }
-
- eglBindAPI(EGL_OPENGL_ES_API);
-
- {
- static const EGLint ctx_attribs[] = {
- EGL_CONTEXT_CLIENT_VERSION, 2,
- EGL_NONE
- };
- ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
- if (!ctx) {
- av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
- return -1;
- }
- }
-
-
- XMapWindow(dpy, win);
-
- {
- EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
- if (!surf) {
- av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
- return -1;
- }
-
- if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
- av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
- return -1;
- }
-
- *winRet = win;
- *ctxRet = ctx;
- *surfRet = surf;
- }
-
- return 0;
+ int scrnum = DefaultScreen(dpy);
+ XSetWindowAttributes attr;
+ unsigned long mask;
+ Window root = RootWindow(dpy, scrnum);
+ Window win;
+ EGLContext ctx;
+ const int fullscreen = de->fullscreen;
+ EGLConfig config;
+ int x = de->window_x;
+ int y = de->window_y;
+ int width = de->window_width ? de->window_width : 1280;
+ int height = de->window_height ? de->window_height : 720;
+
+
+ if (fullscreen) {
+ int scrnum = DefaultScreen(dpy);
+
+ x = 0; y = 0;
+ width = DisplayWidth(dpy, scrnum);
+ height = DisplayHeight(dpy, scrnum);
+ }
+
+ {
+ EGLint num_configs;
+ static const EGLint attribs[] = {
+ EGL_RED_SIZE, 1,
+ EGL_GREEN_SIZE, 1,
+ EGL_BLUE_SIZE, 1,
+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
+ EGL_NONE
+ };
+
+ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
+ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
+ return -1;
+ }
+ }
+
+ {
+ EGLint vid;
+ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
+ return -1;
+ }
+
+ {
+ XVisualInfo visTemplate = {
+ .visualid = vid,
+ };
+ int num_visuals;
+ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
+ &visTemplate, &num_visuals);
+
+ /* window attributes */
+ attr.background_pixel = 0;
+ attr.border_pixel = 0;
+ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone);
+ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
+ /* XXX this is a bad way to get a borderless window! */
+ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
+
+ win = XCreateWindow(dpy, root, x, y, width, height,
+ 0, visinfo->depth, InputOutput,
+ visinfo->visual, mask, &attr);
+ XFree(visinfo);
+ }
+ }
+
+ if (fullscreen)
+ no_border(dpy, win);
+
+ /* set hints and properties */
+ {
+ XSizeHints sizehints;
+ sizehints.x = x;
+ sizehints.y = y;
+ sizehints.width = width;
+ sizehints.height = height;
+ sizehints.flags = USSize | USPosition;
+ XSetNormalHints(dpy, win, &sizehints);
+ XSetStandardProperties(dpy, win, name, name,
+ None, (char **)NULL, 0, &sizehints);
+ }
+
+ eglBindAPI(EGL_OPENGL_ES_API);
+
+ {
+ static const EGLint ctx_attribs[] = {
+ EGL_CONTEXT_CLIENT_VERSION, 2,
+ EGL_NONE
+ };
+ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs);
+ if (!ctx) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
+ return -1;
+ }
+ }
+
+
+ XMapWindow(dpy, win);
+
+ {
+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
+ if (!surf) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
+ return -1;
+ }
+
+ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
+ return -1;
+ }
+
+ *winRet = win;
+ *ctxRet = ctx;
+ *surfRet = surf;
+ }
+
+ return 0;
}
static GLint
-compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
+compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source)
{
- GLuint s = glCreateShader(target);
+ GLuint s = glCreateShader(target);
- if (s == 0) {
- av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
- return 0;
- }
+ if (s == 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
+ return 0;
+ }
- glShaderSource(s, 1, (const GLchar **) &source, NULL);
- glCompileShader(s);
+ glShaderSource(s, 1, (const GLchar **)&source, NULL);
+ glCompileShader(s);
- {
- GLint ok;
- glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
+ {
+ GLint ok;
+ glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
- if (!ok) {
- GLchar *info;
- GLint size;
+ if (!ok) {
+ GLchar *info;
+ GLint size;
- glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
- info = malloc(size);
+ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
+ info = malloc(size);
- glGetShaderInfoLog(s, size, NULL, info);
- av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
+ glGetShaderInfoLog(s, size, NULL, info);
+ av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
- return 0;
- }
- }
+ return 0;
+ }
+ }
- return s;
+ return s;
}
-static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
+static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs)
{
- GLuint prog = glCreateProgram();
-
- if (prog == 0) {
- av_log(s, AV_LOG_ERROR, "Failed to create program\n");
- return 0;
- }
-
- glAttachShader(prog, vs);
- glAttachShader(prog, fs);
- glLinkProgram(prog);
-
- {
- GLint ok;
- glGetProgramiv(prog, GL_LINK_STATUS, &ok);
- if (!ok) {
- /* Some drivers return a size of 1 for an empty log. This is the size
- * of a log that contains only a terminating NUL character.
- */
- GLint size;
- GLchar *info = NULL;
- glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
- if (size > 1) {
- info = malloc(size);
- glGetProgramInfoLog(prog, size, NULL, info);
- }
+ GLuint prog = glCreateProgram();
- av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
- (info != NULL) ? info : "<empty log>");
- return 0;
- }
- }
+ if (prog == 0) {
+ av_log(s, AV_LOG_ERROR, "Failed to create program\n");
+ return 0;
+ }
+
+ glAttachShader(prog, vs);
+ glAttachShader(prog, fs);
+ glLinkProgram(prog);
+
+ {
+ GLint ok;
+ glGetProgramiv(prog, GL_LINK_STATUS, &ok);
+ if (!ok) {
+ /* Some drivers return a size of 1 for an empty log. This is the size
+ * of a log that contains only a terminating NUL character.
+ */
+ GLint size;
+ GLchar *info = NULL;
+ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
+ if (size > 1) {
+ info = malloc(size);
+ glGetProgramInfoLog(prog, size, NULL, info);
+ }
- return prog;
+ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
+ (info != NULL) ? info : "<empty log>");
+ return 0;
+ }
+ }
+
+ return prog;
}
static int
-gl_setup(struct AVFormatContext * const s)
+gl_setup(struct AVFormatContext *const s)
{
- const char *vs =
- "attribute vec4 pos;\n"
- "varying vec2 texcoord;\n"
- "\n"
- "void main() {\n"
- " gl_Position = pos;\n"
- " texcoord.x = (pos.x + 1.0) / 2.0;\n"
- " texcoord.y = (-pos.y + 1.0) / 2.0;\n"
- "}\n";
- const char *fs =
- "#extension GL_OES_EGL_image_external : enable\n"
- "precision mediump float;\n"
- "uniform samplerExternalOES s;\n"
- "varying vec2 texcoord;\n"
- "void main() {\n"
- " gl_FragColor = texture2D(s, texcoord);\n"
- "}\n";
-
- GLuint vs_s;
- GLuint fs_s;
- GLuint prog;
-
- if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
- !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
- !(prog = link_program(s, vs_s, fs_s)))
- return -1;
-
- glUseProgram(prog);
-
- {
- static const float verts[] = {
- -1, -1,
- 1, -1,
- 1, 1,
- -1, 1,
- };
- glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
- }
-
- glEnableVertexAttribArray(0);
- return 0;
+ const char *vs =
+ "attribute vec4 pos;\n"
+ "varying vec2 texcoord;\n"
+ "\n"
+ "void main() {\n"
+ " gl_Position = pos;\n"
+ " texcoord.x = (pos.x + 1.0) / 2.0;\n"
+ " texcoord.y = (-pos.y + 1.0) / 2.0;\n"
+ "}\n";
+ const char *fs =
+ "#extension GL_OES_EGL_image_external : enable\n"
+ "precision mediump float;\n"
+ "uniform samplerExternalOES s;\n"
+ "varying vec2 texcoord;\n"
+ "void main() {\n"
+ " gl_FragColor = texture2D(s, texcoord);\n"
+ "}\n";
+
+ GLuint vs_s;
+ GLuint fs_s;
+ GLuint prog;
+
+ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
+ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
+ !(prog = link_program(s, vs_s, fs_s)))
+ return -1;
+
+ glUseProgram(prog);
+
+ {
+ static const float verts[] = {
+ -1, -1,
+ 1, -1,
+ 1, 1,
+ -1, 1,
+ };
+ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
+ }
+
+ glEnableVertexAttribArray(0);
+ return 0;
}
static int egl_vout_write_trailer(AVFormatContext *s)
@@ -400,12 +398,12 @@ static int egl_vout_write_trailer(AVFormatContext *s)
static int egl_vout_write_header(AVFormatContext *s)
{
- const AVCodecParameters * const par = s->streams[0]->codecpar;
+ const AVCodecParameters *const par = s->streams[0]->codecpar;
#if TRACE_ALL
av_log(s, AV_LOG_INFO, "%s\n", __func__);
#endif
- if ( s->nb_streams > 1
+ if (s->nb_streams > 1
|| par->codec_type != AVMEDIA_TYPE_VIDEO
|| par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
@@ -416,10 +414,10 @@ static int egl_vout_write_header(AVFormatContext *s)
}
-static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
+static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame)
{
- const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
- egl_aux_t * da = NULL;
+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0];
+ egl_aux_t *da = NULL;
unsigned int i;
#if TRACE_ALL
@@ -440,26 +438,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
if (da->texture == 0) {
EGLint attribs[50];
- EGLint * a = attribs;
+ EGLint *a = attribs;
int i, j;
static const EGLint anames[] = {
- EGL_DMA_BUF_PLANE0_FD_EXT,
- EGL_DMA_BUF_PLANE0_OFFSET_EXT,
- EGL_DMA_BUF_PLANE0_PITCH_EXT,
- EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
- EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
- EGL_DMA_BUF_PLANE1_FD_EXT,
- EGL_DMA_BUF_PLANE1_OFFSET_EXT,
- EGL_DMA_BUF_PLANE1_PITCH_EXT,
- EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
- EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
- EGL_DMA_BUF_PLANE2_FD_EXT,
- EGL_DMA_BUF_PLANE2_OFFSET_EXT,
- EGL_DMA_BUF_PLANE2_PITCH_EXT,
- EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
- EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE0_FD_EXT,
+ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE0_PITCH_EXT,
+ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE1_FD_EXT,
+ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE1_PITCH_EXT,
+ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
+ EGL_DMA_BUF_PLANE2_FD_EXT,
+ EGL_DMA_BUF_PLANE2_OFFSET_EXT,
+ EGL_DMA_BUF_PLANE2_PITCH_EXT,
+ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
+ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
};
- const EGLint * b = anames;
+ const EGLint *b = anames;
*a++ = EGL_WIDTH;
*a++ = av_frame_cropped_width(frame);
@@ -470,8 +468,8 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
for (i = 0; i < desc->nb_layers; ++i) {
for (j = 0; j < desc->layers[i].nb_planes; ++j) {
- const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
- const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
+ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j;
+ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index;
*a++ = *b++;
*a++ = obj->fd;
*a++ = *b++;
@@ -479,13 +477,13 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
*a++ = *b++;
*a++ = p->pitch;
if (obj->format_modifier == 0) {
- b += 2;
+ b += 2;
}
else {
- *a++ = *b++;
- *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
- *a++ = *b++;
- *a++ = (EGLint)(obj->format_modifier >> 32);
+ *a++ = *b++;
+ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
+ *a++ = *b++;
+ *a++ = (EGLint)(obj->format_modifier >> 32);
}
}
}
@@ -494,26 +492,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
#if TRACE_ALL
for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
- av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
+ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
}
#endif
{
- const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
- EGL_NO_CONTEXT,
- EGL_LINUX_DMA_BUF_EXT,
- NULL, attribs);
- if (!image) {
- av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
- return -1;
- }
-
- glGenTextures(1, &da->texture);
- glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
- glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
- glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
- glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
-
- eglDestroyImageKHR(de->setup.egl_dpy, image);
+ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
+ EGL_NO_CONTEXT,
+ EGL_LINUX_DMA_BUF_EXT,
+ NULL, attribs);
+ if (!image) {
+ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
+ return -1;
+ }
+
+ glGenTextures(1, &da->texture);
+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
+
+ eglDestroyImageKHR(de->setup.egl_dpy, image);
}
da->fd = desc->objects[0].fd;
@@ -540,7 +538,7 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
(long long)modifiers[1],
(long long)modifiers[2],
(long long)modifiers[3]
- );
+ );
#endif
}
@@ -558,55 +556,55 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
return 0;
}
-static void * display_thread(void * v)
+static void* display_thread(void *v)
{
- AVFormatContext * const s = v;
- egl_display_env_t * const de = s->priv_data;
+ AVFormatContext *const s = v;
+ egl_display_env_t *const de = s->priv_data;
#if TRACE_ALL
av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
#endif
{
- EGLint egl_major, egl_minor;
-
- de->setup.dpy = XOpenDisplay(NULL);
- if (!de->setup.dpy) {
- av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
- goto fail;
- }
-
- de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
- if (!de->setup.egl_dpy) {
- av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
- goto fail;
- }
-
- if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
- av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
- goto fail;
- }
-
- av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
-
- if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
- av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
- goto fail;
- }
+ EGLint egl_major, egl_minor;
+
+ de->setup.dpy = XOpenDisplay(NULL);
+ if (!de->setup.dpy) {
+ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
+ goto fail;
+ }
+
+ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
+ if (!de->setup.egl_dpy) {
+ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
+ goto fail;
+ }
+
+ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
+ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
+ goto fail;
+ }
+
+ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
+
+ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
+ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
+ goto fail;
+ }
}
if (!de->window_width || !de->window_height) {
- de->window_width = 1280;
- de->window_height = 720;
+ de->window_width = 1280;
+ de->window_height = 720;
}
if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
&de->setup.win, &de->setup.ctx, &de->setup.surf)) {
- av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
- goto fail;
+ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
+ goto fail;
}
if (gl_setup(s)) {
- av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
- goto fail;
+ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
+ goto fail;
}
#if TRACE_ALL
@@ -615,7 +613,7 @@ static void * display_thread(void * v)
sem_post(&de->display_start_sem);
for (;;) {
- AVFrame * frame;
+ AVFrame *frame;
while (sem_wait(&de->q_sem) != 0) {
av_assert0(errno == EINTR);
@@ -653,9 +651,9 @@ fail:
static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
{
- const AVFrame * const src_frame = (AVFrame *)pkt->data;
- AVFrame * frame;
- egl_display_env_t * const de = s->priv_data;
+ const AVFrame *const src_frame = (AVFrame *)pkt->data;
+ AVFrame *frame;
+ egl_display_env_t *const de = s->priv_data;
#if TRACE_ALL
av_log(s, AV_LOG_INFO, "%s\n", __func__);
@@ -668,8 +666,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
else if (src_frame->format == AV_PIX_FMT_VAAPI) {
frame = av_frame_alloc();
frame->format = AV_PIX_FMT_DRM_PRIME;
- if (av_hwframe_map(frame, src_frame, 0) != 0)
- {
+ if (av_hwframe_map(frame, src_frame, 0) != 0) {
av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
av_frame_free(&frame);
return AVERROR(EINVAL);
@@ -682,12 +679,12 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
// Really hacky sync
while (de->show_all && de->q_next) {
- usleep(3000);
+ usleep(3000);
}
pthread_mutex_lock(&de->q_lock);
{
- AVFrame * const t = de->q_next;
+ AVFrame *const t = de->q_next;
de->q_next = frame;
frame = t;
}
@@ -702,7 +699,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
}
static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
- unsigned flags)
+ unsigned flags)
{
av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
return AVERROR_PATCHWELCOME;
@@ -713,7 +710,7 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si
#if TRACE_ALL
av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
#endif
- switch(type) {
+ switch (type) {
case AV_APP_TO_DEV_WINDOW_REPAINT:
return 0;
default:
@@ -723,14 +720,14 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si
}
// deinit is called if init fails so no need to clean up explicity here
-static int egl_vout_init(struct AVFormatContext * s)
+static int egl_vout_init(struct AVFormatContext *s)
{
- egl_display_env_t * const de = s->priv_data;
+ egl_display_env_t *const de = s->priv_data;
unsigned int i;
av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
- de->setup = (struct egl_setup){0};
+ de->setup = (struct egl_setup) { 0 };
for (i = 0; i != 32; ++i) {
de->aux[i].fd = -1;
@@ -744,8 +741,8 @@ static int egl_vout_init(struct AVFormatContext * s)
sem_wait(&de->display_start_sem);
if (de->q_terminate) {
- av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
- return -1;
+ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
+ return -1;
}
av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
@@ -753,9 +750,9 @@ static int egl_vout_init(struct AVFormatContext * s)
return 0;
}
-static void egl_vout_deinit(struct AVFormatContext * s)
+static void egl_vout_deinit(struct AVFormatContext *s)
{
- egl_display_env_t * const de = s->priv_data;
+ egl_display_env_t *const de = s->priv_data;
av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
@@ -773,11 +770,11 @@ static void egl_vout_deinit(struct AVFormatContext * s)
#define OFFSET(x) offsetof(egl_display_env_t, x)
static const AVOption options[] = {
- { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
- { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
- { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
- { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
- { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
{ NULL }
};
From 4d3a3973a07994b0a6ec35626e514fc40f439fe3 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 12 Dec 2022 16:49:43 +0000
Subject: [PATCH 098/161] v4l2m2m: reorganise get_raw_format for loop logic
---
libavcodec/v4l2_context.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 7031f3d340..79a31cf930 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -828,28 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
return 0;
}
- for (;;) {
+ for (;; ++fdesc.index) {
ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
if (ret)
return AVERROR(EINVAL);
if (priv->pix_fmt != AV_PIX_FMT_NONE) {
- if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
- fdesc.index++;
+ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt))
continue;
- }
}
pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
ret = v4l2_try_raw_format(ctx, pixfmt);
- if (ret){
- fdesc.index++;
- continue;
+ if (ret == 0) {
+ *p = pixfmt;
+ return 0;
}
-
- *p = pixfmt;
-
- return 0;
}
return AVERROR(EINVAL);
From 123c5ef429ec6bd7d1875d621df88bb2ad7af0bd Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 12 Dec 2022 17:49:12 +0000
Subject: [PATCH 099/161] drm_vout: Set zpos on the plane we pick to ensure it
is at the front
---
libavdevice/drm_vout.c | 38 +++++++++++++++++++++++++++++++++-----
1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
index cfb33ce7c3..9bd9e04421 100644
--- a/libavdevice/drm_vout.c
+++ b/libavdevice/drm_vout.c
@@ -115,9 +115,11 @@ static int find_plane(struct AVFormatContext * const avctx,
{
drmModePlaneResPtr planes;
drmModePlanePtr plane;
+ drmModeObjectPropertiesPtr props = NULL;
+ drmModePropertyPtr prop = NULL;
unsigned int i;
unsigned int j;
- int ret = 0;
+ int ret = -1;
planes = drmModeGetPlaneResources(drmfd);
if (!planes)
@@ -154,11 +156,37 @@ static int find_plane(struct AVFormatContext * const avctx,
break;
}
- if (i == planes->count_planes)
- ret = -1;
+ if (i == planes->count_planes) {
+ ret = -1;
+ goto fail;
+ }
- drmModeFreePlaneResources(planes);
- return ret;
+ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE);
+ if (!props)
+ goto fail;
+ for (i = 0; i != props->count_props; ++i) {
+ if (prop)
+ drmModeFreeProperty(prop);
+ prop = drmModeGetProperty(drmfd, props->props[i]);
+ if (!prop)
+ goto fail;
+ if (strcmp("zpos", prop->name) == 0) {
+ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0)
+ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]);
+ else
+ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n");
+ break;
+ }
+ }
+
+ ret = 0;
+fail:
+ if (props)
+ drmModeFreeObjectProperties(props);
+ if (prop)
+ drmModeFreeProperty(prop);
+ drmModeFreePlaneResources(planes);
+ return ret;
}
static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
From 0ee1c3b41774d05595376f8d25de2a901dbb12c7 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 12 Dec 2022 17:51:46 +0000
Subject: [PATCH 100/161] drm_vout: Only set modifier flag and pass modifiers
if there are some
---
libavdevice/drm_vout.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
index 9bd9e04421..a56adea866 100644
--- a/libavdevice/drm_vout.c
+++ b/libavdevice/drm_vout.c
@@ -34,6 +34,7 @@
#include <xf86drm.h>
#include <xf86drmMode.h>
+#include <drm_fourcc.h>
#define TRACE_ALL 0
@@ -249,6 +250,7 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A
uint32_t offsets[4] = {0};
uint64_t modifiers[4] = {0};
uint32_t bo_handles[4] = {0};
+ int has_mods = 0;
int i, j, n;
da->frame = frame;
@@ -258,6 +260,9 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A
av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
return -1;
}
+ if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR &&
+ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID)
+ has_mods = 1;
}
n = 0;
@@ -299,11 +304,13 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A
#endif
if (drmModeAddFB2WithModifiers(de->drm_fd,
- av_frame_cropped_width(frame),
- av_frame_cropped_height(frame),
- desc->layers[0].format, bo_handles,
- pitches, offsets, modifiers,
- &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
+ av_frame_cropped_width(frame),
+ av_frame_cropped_height(frame),
+ desc->layers[0].format, bo_handles,
+ pitches, offsets,
+ has_mods ? modifiers : NULL,
+ &da->fb_handle,
+ has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) {
av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
return -1;
}
From 4534e6981c1718eaeec4c5f58cdf5592ee7f0329 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 12 Dec 2022 17:52:58 +0000
Subject: [PATCH 101/161] drm_vout: Fix typo in error message
---
libavdevice/drm_vout.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
index a56adea866..351abf1d60 100644
--- a/libavdevice/drm_vout.c
+++ b/libavdevice/drm_vout.c
@@ -596,7 +596,7 @@ static int drm_vout_init(struct AVFormatContext * s)
sem_init(&de->q_sem_out, 0, 0);
if (pthread_create(&de->q_thread, NULL, display_thread, s)) {
rv = AVERROR(errno);
- av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
+ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
goto fail_close;
}
From 0469d1fb132a0d55593611c56e83733efe58045b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 12 Dec 2022 18:00:41 +0000
Subject: [PATCH 102/161] drm_vout: Add option to name the drm_module to use
---
libavdevice/drm_vout.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
index 351abf1d60..491e1dc608 100644
--- a/libavdevice/drm_vout.c
+++ b/libavdevice/drm_vout.c
@@ -70,7 +70,9 @@ typedef struct drm_display_env_s
uint32_t con_id;
struct drm_setup setup;
enum AVPixelFormat avfmt;
+
int show_all;
+ const char * drm_module;
unsigned int ano;
drm_aux_t aux[AUX_SIZE];
@@ -569,7 +571,6 @@ static int drm_vout_init(struct AVFormatContext * s)
{
drm_display_env_t * const de = s->priv_data;
int rv;
- const char * drm_module = DRM_MODULE;
av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
@@ -578,10 +579,10 @@ static int drm_vout_init(struct AVFormatContext * s)
de->setup = (struct drm_setup){0};
de->q_terminate = 0;
- if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
+ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0)
{
rv = AVERROR(errno);
- av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv));
return rv;
}
@@ -641,6 +642,7 @@ static void drm_vout_deinit(struct AVFormatContext * s)
#define OFFSET(x) offsetof(drm_display_env_t, x)
static const AVOption options[] = {
{ "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+ { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
{ NULL }
};
From 61cb9fc3ce06e0ecaeeec3add143bc3a82956853 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 13:01:00 +0000
Subject: [PATCH 103/161] dmabufs: Rework to allow for non-CMA backends
---
libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++----------
1 file changed, 116 insertions(+), 45 deletions(-)
diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
index c4bbed18c6..1c3a5e861f 100644
--- a/libavcodec/v4l2_req_dmabufs.c
+++ b/libavcodec/v4l2_req_dmabufs.c
@@ -1,3 +1,4 @@
+#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -19,9 +20,21 @@
#define TRACE_ALLOC 0
+struct dmabufs_ctl;
+struct dmabuf_h;
+
+struct dmabuf_fns {
+ int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size);
+ void (*buf_free)(struct dmabuf_h * dh);
+ int (*ctl_new)(struct dmabufs_ctl * dbsc);
+ void (*ctl_free)(struct dmabufs_ctl * dbsc);
+};
+
struct dmabufs_ctl {
int fd;
size_t page_size;
+ void * v;
+ const struct dmabuf_fns * fns;
};
struct dmabuf_h {
@@ -29,6 +42,8 @@ struct dmabuf_h {
size_t size;
size_t len;
void * mapptr;
+ void * v;
+ const struct dmabuf_fns * fns;
};
#if TRACE_ALLOC
@@ -88,15 +103,8 @@ struct dmabuf_h * dmabuf_import(int fd, size_t size)
struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
{
struct dmabuf_h * dh;
- struct dma_heap_allocation_data data = {
- .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
- .fd = 0,
- .fd_flags = O_RDWR,
- .heap_flags = 0
- };
-
if (old != NULL) {
- if (old->size == data.len) {
+ if (old->size >= size) {
return old;
}
dmabuf_free(old);
@@ -106,24 +114,16 @@ struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * ol
(dh = malloc(sizeof(*dh))) == NULL)
return NULL;
- while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
- int err = errno;
- request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
- (uint64_t)data.len,
- dbsc->fd,
- err,
- strerror(err));
- if (err == EINTR)
- continue;
- goto fail;
- }
-
*dh = (struct dmabuf_h){
- .fd = data.fd,
- .size = (size_t)data.len,
- .mapptr = MAP_FAILED
+ .fd = -1,
+ .mapptr = MAP_FAILED,
+ .fns = dbsc->fns
};
+ if (dh->fns->buf_alloc(dbsc, dh, size) != 0)
+ goto fail;
+
+
#if TRACE_ALLOC
++total_bufs;
total_size += dh->size;
@@ -220,8 +220,6 @@ void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
dh->len = len;
}
-
-
void dmabuf_free(struct dmabuf_h * dh)
{
if (!dh)
@@ -233,20 +231,63 @@ void dmabuf_free(struct dmabuf_h * dh)
request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
#endif
- if (dh->mapptr != MAP_FAILED)
+ dh->fns->buf_free(dh);
+
+ if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
munmap(dh->mapptr, dh->size);
- while (close(dh->fd) == -1 && errno == EINTR)
- /* loop */;
+ if (dh->fd != -1)
+ while (close(dh->fd) == -1 && errno == EINTR)
+ /* loop */;
free(dh);
}
-struct dmabufs_ctl * dmabufs_ctl_new(void)
+static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns)
{
- struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
+ struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc));
if (!dbsc)
return NULL;
+ dbsc->fd = -1;
+ dbsc->fns = fns;
+ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
+
+ if (fns->ctl_new(dbsc) != 0)
+ goto fail;
+
+ return dbsc;
+
+fail:
+ free(dbsc);
+ return NULL;
+}
+
+static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
+{
+ request_debug(NULL, "Free dmabuf ctl\n");
+
+ dbsc->fns->ctl_free(dbsc);
+
+ free(dbsc);
+}
+
+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
+{
+ struct dmabufs_ctl * const dbsc = *pDbsc;
+
+ if (!dbsc)
+ return;
+ *pDbsc = NULL;
+
+ dmabufs_ctl_free(dbsc);
+}
+
+//-----------------------------------------------------------------------------
+//
+// Alloc dmabuf via CMA
+
+static int ctl_cma_new(struct dmabufs_ctl * dbsc)
+{
while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
errno == EINTR)
/* Loop */;
@@ -258,31 +299,61 @@ struct dmabufs_ctl * dmabufs_ctl_new(void)
if (dbsc->fd == -1) {
request_log("Unable to open either %s or %s\n",
DMABUF_NAME1, DMABUF_NAME2);
- goto fail;
+ return -1;
}
}
+ return 0;
+}
- dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
-
- return dbsc;
+static void ctl_cma_free(struct dmabufs_ctl * dbsc)
+{
+ if (dbsc->fd != -1)
+ while (close(dbsc->fd) == -1 && errno == EINTR)
+ /* loop */;
-fail:
- free(dbsc);
- return NULL;
}
-void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
+static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size)
{
- struct dmabufs_ctl * const dbsc = *pDbsc;
+ struct dma_heap_allocation_data data = {
+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
+ .fd = 0,
+ .fd_flags = O_RDWR,
+ .heap_flags = 0
+ };
- if (!dbsc)
- return;
- *pDbsc = NULL;
+ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
+ int err = errno;
+ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
+ (uint64_t)data.len,
+ dbsc->fd,
+ err,
+ strerror(err));
+ if (err == EINTR)
+ continue;
+ return -err;
+ }
- while (close(dbsc->fd) == -1 && errno == EINTR)
- /* loop */;
+ dh->fd = data.fd;
+ dh->size = (size_t)data.len;
+ return 0;
+}
- free(dbsc);
+static void buf_cma_free(struct dmabuf_h * dh)
+{
+ // Nothing needed
}
+static const struct dmabuf_fns dmabuf_cma_fns = {
+ .buf_alloc = buf_cma_alloc,
+ .buf_free = buf_cma_free,
+ .ctl_new = ctl_cma_new,
+ .ctl_free = ctl_cma_free,
+};
+
+struct dmabufs_ctl * dmabufs_ctl_new(void)
+{
+ request_debug(NULL, "Dmabufs using CMA\n");;
+ return dmabufs_ctl_new2(&dmabuf_cma_fns);
+}
From 288807720443bbddf4c83c3589d1877c7fd418c3 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 13:07:58 +0000
Subject: [PATCH 104/161] dmabufs: Use unref rather than delete on dmabufs_ctl
---
libavcodec/v4l2_req_dmabufs.c | 12 +++++++++++-
libavcodec/v4l2_req_dmabufs.h | 3 ++-
libavcodec/v4l2_request_hevc.c | 4 ++--
3 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
index 1c3a5e861f..acc0366e76 100644
--- a/libavcodec/v4l2_req_dmabufs.c
+++ b/libavcodec/v4l2_req_dmabufs.c
@@ -31,6 +31,7 @@ struct dmabuf_fns {
};
struct dmabufs_ctl {
+ atomic_int ref_count;
int fd;
size_t page_size;
void * v;
@@ -271,7 +272,7 @@ static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
free(dbsc);
}
-void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc)
{
struct dmabufs_ctl * const dbsc = *pDbsc;
@@ -279,9 +280,18 @@ void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
return;
*pDbsc = NULL;
+ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0)
+ return;
+
dmabufs_ctl_free(dbsc);
}
+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc)
+{
+ atomic_fetch_add(&dbsc->ref_count, 1);
+ return dbsc;
+}
+
//-----------------------------------------------------------------------------
//
// Alloc dmabuf via CMA
diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
index c1d3d8c8d7..381ba2708d 100644
--- a/libavcodec/v4l2_req_dmabufs.h
+++ b/libavcodec/v4l2_req_dmabufs.h
@@ -7,7 +7,8 @@ struct dmabufs_ctl;
struct dmabuf_h;
struct dmabufs_ctl * dmabufs_ctl_new(void);
-void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc);
+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc);
// Need not preserve old contents
// On NULL return old buffer is freed
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index 767ecb036a..db7ed13b6d 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -105,7 +105,7 @@ static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
mediabufs_ctl_unref(&ctx->mbufs);
media_pool_delete(&ctx->mpool);
pollqueue_unref(&ctx->pq);
- dmabufs_ctl_delete(&ctx->dbufs);
+ dmabufs_ctl_unref(&ctx->dbufs);
devscan_delete(&ctx->devscan);
decode_q_uninit(&ctx->decode_q);
@@ -324,7 +324,7 @@ fail3:
fail2:
pollqueue_unref(&ctx->pq);
fail1:
- dmabufs_ctl_delete(&ctx->dbufs);
+ dmabufs_ctl_unref(&ctx->dbufs);
fail0:
devscan_delete(&ctx->devscan);
return ret;
From 9115f40c5f55873102312085f2e328d1a2101ae4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 14:21:40 +0000
Subject: [PATCH 105/161] egl_vout: Remove redundant & completely broken debug
---
libavdevice/egl_vout.c | 25 -------------------------
1 file changed, 25 deletions(-)
diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
index a52cabb082..afc7afd13e 100644
--- a/libavdevice/egl_vout.c
+++ b/libavdevice/egl_vout.c
@@ -515,31 +515,6 @@ static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVF
}
da->fd = desc->objects[0].fd;
-
-#if 0
- av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
- " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
- av_frame_cropped_width(frame),
- av_frame_cropped_height(frame),
- desc->layers[0].format,
- bo_plane_handles[0],
- bo_plane_handles[1],
- bo_plane_handles[2],
- bo_plane_handles[3],
- pitches[0],
- pitches[1],
- pitches[2],
- pitches[3],
- offsets[0],
- offsets[1],
- offsets[2],
- offsets[3],
- (long long)modifiers[0],
- (long long)modifiers[1],
- (long long)modifiers[2],
- (long long)modifiers[3]
- );
-#endif
}
glClearColor(0.5, 0.5, 0.5, 0.5);
From 34711d5a1429213b6f4cf8ad163e8e8d108626e7 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 16:12:12 +0000
Subject: [PATCH 106/161] v4l2m2m: Use offset from querybuf rather than always
0
---
libavcodec/v4l2_buffers.c | 4 +++-
libavcodec/v4l2_buffers.h | 3 ++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 9ef2f40e39..5ca58ea593 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -379,7 +379,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
for (int i = 0; i < avbuf->num_planes; i++) {
layer->planes[i].object_index = i;
- layer->planes[i].offset = 0;
+ layer->planes[i].offset = avbuf->plane_info[i].offset;
layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
}
@@ -934,6 +934,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
+ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset;
if (want_mmap)
avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
@@ -941,6 +942,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
} else {
avbuf->plane_info[i].length = avbuf->buf.length;
+ avbuf->plane_info[i].offset = 0;
if (want_mmap)
avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index 1ac32c5989..d91d5d1dd0 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -66,7 +66,8 @@ typedef struct V4L2Buffer {
/* keep track of the mmap address and mmap length */
struct V4L2Plane_info {
- int bytesperline;
+ size_t bytesperline;
+ size_t offset;
void * mm_addr;
size_t length;
} plane_info[VIDEO_MAX_PLANES];
From 15458be3fe79c14f4fdcc2ad786508d1b647c914 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 17:57:27 +0000
Subject: [PATCH 107/161] v4l2m2m: Fix crash if init errors out before setting
avctx
---
libavcodec/v4l2_m2m.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index 1e30d15fd8..ac6bae0dc3 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -278,7 +278,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
- if (av_codec_is_decoder(s->avctx->codec))
+ if (s->avctx && av_codec_is_decoder(s->avctx->codec))
av_packet_unref(&s->buf_pkt);
if (s->fd >= 0) {
From 9f7f94c680b8aaedede9b3bcad37b645216cfcff Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 18:10:30 +0000
Subject: [PATCH 108/161] v4l2_buffers: Add and use ctx_to_m2mctx + error debug
---
libavcodec/v4l2_buffers.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index 5ca58ea593..e28ef2d1e8 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -41,11 +41,16 @@
#define USEC_PER_SEC 1000000
static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
+{
+ return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+ container_of(ctx, V4L2m2mContext, output) :
+ container_of(ctx, V4L2m2mContext, capture);
+}
+
static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
{
- return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
- container_of(buf->context, V4L2m2mContext, output) :
- container_of(buf->context, V4L2m2mContext, capture);
+ return ctx_to_m2mctx(buf->context);
}
static inline AVCodecContext *logger(const V4L2Buffer * const buf)
@@ -883,6 +888,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
int ret, i;
V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
AVBufferRef * bufref;
+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
*pbufref = NULL;
if (avbuf == NULL)
@@ -910,7 +916,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
avbuf->buf.m.planes = avbuf->planes;
}
- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
+ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf);
if (ret < 0)
goto fail;
@@ -969,10 +975,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
}
if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
- if (buf_to_m2mctx(avbuf)->output_drm) {
+ if (s->output_drm) {
ret = v4l2_buffer_export_drm(avbuf);
- if (ret)
- goto fail;
+ if (ret) {
+ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n");
+ goto fail;
+ }
}
}
From 6b8bb2c41828351cd3a6f40be353696ae36450b7 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 18:53:22 +0000
Subject: [PATCH 109/161] v4l2m2m: Add ability to use cma alloced dmabufs as
well as v4l2 mmap
---
libavcodec/Makefile | 2 +-
libavcodec/v4l2_buffers.c | 65 ++++++++++++++++++++++++++-------------
libavcodec/v4l2_buffers.h | 2 ++
libavcodec/v4l2_m2m.c | 6 +++-
libavcodec/v4l2_m2m.h | 4 +++
libavcodec/v4l2_m2m_dec.c | 16 ++++++++++
6 files changed, 71 insertions(+), 24 deletions(-)
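
The allocator is selected through the new "dmabuf_alloc" decoder private option
added below ("cma" selects CMA-heap allocation; unset or "v4l2" keeps the
existing V4L2 mmap behaviour). A minimal sketch of a caller selecting it
(hypothetical application code, assuming the h264_v4l2m2m decoder is built in;
not part of the diff below):

    #include <libavcodec/avcodec.h>
    #include <libavutil/opt.h>

    static AVCodecContext *open_v4l2m2m_with_cma(void)
    {
        const AVCodec *dec = avcodec_find_decoder_by_name("h264_v4l2m2m");
        AVCodecContext *avctx = dec ? avcodec_alloc_context3(dec) : NULL;
        if (avctx)
            av_opt_set(avctx->priv_data, "dmabuf_alloc", "cma", 0); /* private option */
        return avctx;
    }
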
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 11f183c9b9..8b1d669834 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -170,7 +170,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
- weak_link.o
+ weak_link.o v4l2_req_dmabufs.o
OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
v4l2_req_devscan.o weak_link.o
OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
index e28ef2d1e8..8d80d19788 100644
--- a/libavcodec/v4l2_buffers.c
+++ b/libavcodec/v4l2_buffers.c
@@ -36,6 +36,7 @@
#include "v4l2_context.h"
#include "v4l2_buffers.h"
#include "v4l2_m2m.h"
+#include "v4l2_req_dmabufs.h"
#include "weak_link.h"
#define USEC_PER_SEC 1000000
@@ -477,33 +478,46 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data)
av_buffer_unref(&bufref);
}
+static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i)
+{
+ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length;
+}
+
static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
{
- struct v4l2_exportbuffer expbuf;
int i, ret;
+ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf);
for (i = 0; i < avbuf->num_planes; i++) {
- memset(&expbuf, 0, sizeof(expbuf));
-
- expbuf.index = avbuf->buf.index;
- expbuf.type = avbuf->buf.type;
- expbuf.plane = i;
+ int dma_fd = -1;
+ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i);
+
+ if (s->db_ctl != NULL) {
+ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL)
+ return AVERROR(ENOMEM);
+ dma_fd = dmabuf_fd(avbuf->dmabuf[i]);
+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type))
+ avbuf->buf.m.planes[i].m.fd = dma_fd;
+ else
+ avbuf->buf.m.fd = dma_fd;
+ }
+ else {
+ struct v4l2_exportbuffer expbuf;
+ memset(&expbuf, 0, sizeof(expbuf));
- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
- if (ret < 0)
- return AVERROR(errno);
+ expbuf.index = avbuf->buf.index;
+ expbuf.type = avbuf->buf.type;
+ expbuf.plane = i;
- if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
- /* drm frame */
- avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
- avbuf->drm_frame.objects[i].fd = expbuf.fd;
- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
- } else {
- /* drm frame */
- avbuf->drm_frame.objects[0].size = avbuf->buf.length;
- avbuf->drm_frame.objects[0].fd = expbuf.fd;
- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
+ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf);
+ if (ret < 0)
+ return AVERROR(errno);
+ dma_fd = expbuf.fd;
}
+
+ avbuf->drm_frame.objects[i].size = blen;
+ avbuf->drm_frame.objects[i].fd = dma_fd;
+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
}
return 0;
@@ -870,9 +884,16 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
munmap(p->mm_addr, p->length);
}
- for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
- if (avbuf->drm_frame.objects[i].fd != -1)
- close(avbuf->drm_frame.objects[i].fd);
+ if (avbuf->dmabuf[0] == NULL) {
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
+ if (avbuf->drm_frame.objects[i].fd != -1)
+ close(avbuf->drm_frame.objects[i].fd);
+ }
+ }
+ else {
+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) {
+ dmabuf_free(avbuf->dmabuf[i]);
+ }
}
av_buffer_unref(&avbuf->ref_buf);
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
index d91d5d1dd0..444ad94b14 100644
--- a/libavcodec/v4l2_buffers.h
+++ b/libavcodec/v4l2_buffers.h
@@ -46,6 +46,7 @@ enum V4L2Buffer_status {
*/
struct V4L2Context;
struct ff_weak_link_client;
+struct dmabuf_h;
typedef struct V4L2Buffer {
/* each buffer needs to have a reference to its context
@@ -80,6 +81,7 @@ typedef struct V4L2Buffer {
enum V4L2Buffer_status status;
+ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here
} V4L2Buffer;
/**
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index ac6bae0dc3..f802687b1b 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -34,6 +34,7 @@
#include "v4l2_context.h"
#include "v4l2_fmt.h"
#include "v4l2_m2m.h"
+#include "v4l2_req_dmabufs.h"
static void
xlat_init(xlat_track_t * const x)
@@ -75,7 +76,7 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
s->capture.done = s->output.done = 0;
s->capture.name = "capture";
- s->capture.buf_mem = V4L2_MEMORY_MMAP;
+ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
s->output.name = "output";
s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
atomic_init(&s->refcount, 0);
@@ -94,12 +95,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
if (v4l2_mplane_video(&cap)) {
s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+ s->output.format.type = s->output.type;
return 0;
}
if (v4l2_splane_video(&cap)) {
s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+ s->output.format.type = s->output.type;
return 0;
}
@@ -293,6 +296,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
ff_v4l2_context_release(&s->output);
+ dmabufs_ctl_unref(&s->db_ctl);
close(s->fd);
s->fd = -1;
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 26a7161042..0f41f94694 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -71,6 +71,8 @@ typedef struct xlat_track_s {
V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
} xlat_track_t;
+struct dmabufs_ctl;
+
typedef struct V4L2m2mContext {
char devname[PATH_MAX];
int fd;
@@ -124,6 +126,7 @@ typedef struct V4L2m2mContext {
/* Quirks */
unsigned int quirks;
+ struct dmabufs_ctl * db_ctl;
} V4L2m2mContext;
typedef struct V4L2m2mPriv {
@@ -134,6 +137,7 @@ typedef struct V4L2m2mPriv {
int num_output_buffers;
int num_capture_buffers;
+ const char * dmabuf_alloc;
enum AVPixelFormat pix_fmt;
} V4L2m2mPriv;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 47b2735f82..4d17057298 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -41,6 +41,7 @@
#include "v4l2_context.h"
#include "v4l2_m2m.h"
#include "v4l2_fmt.h"
+#include "v4l2_req_dmabufs.h"
// Pick 64 for max last count - that is >1sec at 60fps
#define STATS_LAST_COUNT_MAX 64
@@ -896,6 +897,20 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
s->output_drm = 0;
}
+ s->db_ctl = NULL;
+ if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) {
+ if (strcmp(priv->dmabuf_alloc, "cma") == 0)
+ s->db_ctl = dmabufs_ctl_new();
+ else {
+ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc);
+ return AVERROR(EINVAL);
+ }
+ if (!s->db_ctl) {
+ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc);
+ return AVERROR(ENOMEM);
+ }
+ }
+
s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
if (!s->device_ref) {
ret = AVERROR(ENOMEM);
@@ -1000,6 +1015,7 @@ static const AVOption options[] = {
{ "num_capture_buffers", "Number of buffers in the capture context",
OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
{ "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
+ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
{ NULL},
};
From 499bcdc4ed82c737ceab166a07b46e8ed8ccbc88 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Dec 2022 19:05:47 +0000
Subject: [PATCH 110/161] testfilt: Skeleton of hw filter test code
---
pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)
create mode 100755 pi-util/testfilt.py
diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py
new file mode 100755
index 0000000000..b322dac0c2
--- /dev/null
+++ b/pi-util/testfilt.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import string
+import os
+import subprocess
+import re
+import argparse
+import sys
+import csv
+from stat import *
+
+class validator:
+ def __init__(self):
+ self.ok = False
+
+ def isok(self):
+ return self.ok
+
+ def setok(self):
+ self.ok = True
+
+class valid_regex(validator):
+ def __init__(self, regex):
+ super().__init__()
+ self.regex = re.compile(regex)
+
+ def scanline(self, line):
+ if self.isok() or self.regex.search(line):
+ self.setok()
+
+
+def validate(validators, flog):
+ for line in flog:
+ for v in validators:
+ v.scanline(line)
+
+ ok = True
+ for v in validators:
+ if not v.isok():
+ ok = False
+ # complain
+ print("Test failed")
+
+ if ok:
+ print("OK")
+ return ok
+
+def runtest(name, ffmpeg, args, suffix, validators):
+ log_root = os.path.join("/tmp", "testfilt", name)
+ ofilename = os.path.join(log_root, name + suffix)
+
+ if not os.path.exists(log_root):
+ os.makedirs(log_root)
+
+ try:
+ os.remove(ofilename)
+ except:
+ pass
+
+ flog = open(os.path.join(log_root, name + ".log"), "wb")
+ ffargs = [ffmpeg] + args + [ofilename]
+
+ subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False)
+ flog.close()
+
+ flog = open(os.path.join(log_root, name + ".log"), "rt")
+ return validate(validators, flog)
+
+def sayok(log_root, flog):
+ print("Woohoo")
+ return True
+
+if __name__ == '__main__':
+
+ argp = argparse.ArgumentParser(description="FFmpeg filter tester")
+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
+ args = argp.parse_args()
+
+ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i",
+ "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv",
+# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv",
+ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv",
+ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')])
From 50ac318a472fd98e1e58605316ea6a2e8cde0a04 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 5 Jan 2023 14:39:30 +0000
Subject: [PATCH 111/161] pixfmt: Add a #define to indicate presence of SAND
formats
---
libavutil/pixfmt.h | 2 ++
1 file changed, 2 insertions(+)
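
Illustrative only, not part of the diff below: a sketch of how external code can
use the new define to stay buildable against libavutil versions with or without
the SAND formats. The helper name is hypothetical.

    #include <libavutil/frame.h>
    #include <libavutil/pixfmt.h>

    /* Hypothetical helper: detect RPi SAND frames only when libavutil has them */
    static int frame_is_sand(const AVFrame *frame)
    {
    #ifdef AVUTIL_HAVE_PIX_FMT_SAND
        return frame->format == AV_PIX_FMT_SAND128 ||
               frame->format == AV_PIX_FMT_SAND64_10 ||
               frame->format == AV_PIX_FMT_SAND64_16;
    #else
        return 0;
    #endif
    }
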
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 22f70007c3..5cc780e7d5 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -378,6 +378,8 @@ enum AVPixelFormat {
AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
// RPI - not on ifdef so can be got at by calling progs
+// #define so code that uses this can know it is there
+#define AVUTIL_HAVE_PIX_FMT_SAND 1
AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
From 23a3132e094d449ea05657704c0cffc3f0762c28 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 11 Jan 2023 16:30:37 +0000
Subject: [PATCH 112/161] v4l2_m2m_dec: Fix initial pkt send if no extradata
---
libavcodec/v4l2_m2m_dec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 4d17057298..9daf05adfe 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -240,7 +240,7 @@ copy_extradata(AVCodecContext * const avctx,
else
len = src_len < 0 ? AVERROR(EINVAL) : src_len;
- // Zero length is OK but we swant to stop - -ve is error val
+ // Zero length is OK but we want to stop - -ve is error val
if (len <= 0)
return len;
@@ -525,7 +525,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
if (s->extdata_sent)
ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
- else if (s->extdata_data)
+ else
ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
if (ret == AVERROR(EAGAIN)) {
From f4f6b9f1af137153e574c704804033e83f2ed1a8 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 16 Jan 2023 16:05:09 +0000
Subject: [PATCH 113/161] v4l2m2m_dec: Make capture timeout long once pending
count > 31
For some applications (ffmpeg command line) the current heuristic of adding
a short timeout and preferring DQ over Q once we think we have buffers
Qed in V4L2 is insufficient to prevent arbitrary buffer growth.
Unfortunately the current method of guessing the number of Qed buffers isn't
reliable enough to allow for a long timeout with only a few few buffers
believed pending so only do so once the number of buffers believed pending
exceeds plausible inaccuracies caused by buffer reordering.
The limit could be optimised by codec or apparent latency but a simple
number should reduce the unexpected consequences.
---
libavcodec/v4l2_m2m.h | 3 ++-
libavcodec/v4l2_m2m_dec.c | 18 ++++++++++++++----
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index 0f41f94694..ded1478a49 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -66,7 +66,7 @@ typedef struct pts_stats_s
typedef struct xlat_track_s {
unsigned int track_no;
- int64_t last_pts;
+ int64_t last_pts; // Last valid PTS decoded
int64_t last_opaque;
V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
} xlat_track_t;
@@ -88,6 +88,7 @@ typedef struct V4L2m2mContext {
/* null frame/packet received */
int draining;
+ int running;
AVPacket buf_pkt;
/* Reference to a frame. Only used during encoding */
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 9daf05adfe..c8ab883d7e 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -582,7 +582,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
do {
const int pending = xlat_pending(&s->xlat);
- const int prefer_dq = (pending > 3);
+ const int prefer_dq = (pending > 4);
const int last_src_rv = src_rv;
av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
@@ -611,10 +611,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
// (b) enqueue returned a status indicating that decode should be attempted
if (dst_rv != 0 && TRY_DQ(src_rv)) {
// Pick a timeout depending on state
+ // The pending count isn't completely reliable so it is good enough
+ // hint that we want a frame but not good enough to require it in
+ // all cases; however if it has got > 31 that exceeds its margin of
+ // error so require a frame to prevent ridiculous levels of latency
const int t =
src_rv == NQ_Q_FULL ? -1 :
src_rv == NQ_DRAINING ? 300 :
- prefer_dq ? 5 : 0;
+ prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0;
// Dequeue frame will unref any previous contents of frame
// if it returns success so we don't need an explicit unref
@@ -631,8 +635,13 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
}
- if (dst_rv == 0)
+ if (dst_rv == 0) {
set_best_effort_pts(avctx, &s->pts_stat, frame);
+ if (!s->running) {
+ s->running = 1;
+ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n");
+ }
+ }
if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
@@ -998,7 +1007,8 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
// resend extradata
s->extdata_sent = 0;
- // clear EOS status vars
+ // clear status vars
+ s->running = 0;
s->draining = 0;
output->done = 0;
capture->done = 0;
From 39f49cdaefa4483914f703c3f352c8894b3b81fd Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 6 Feb 2023 19:23:16 +0000
Subject: [PATCH 114/161] Initial buffersink alloc callback code
(cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb)
---
libavfilter/buffersink.c | 44 ++++++++++++++++++++++++++++++++++++++++
libavfilter/buffersink.h | 3 +++
2 files changed, 47 insertions(+)
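
Illustrative only, not part of the diff below: a sketch of how a caller might
use the new allocation callback. The callback body, the assumed YUV420P format
and the sink_ctx/my_pool names are placeholders; a real user would hand back
frames from its own (e.g. dmabuf) pool rather than plain av_frame_get_buffer()
frames.

    #include <libavfilter/buffersink.h>
    #include <libavutil/frame.h>
    #include <libavutil/pixfmt.h>

    static AVFrame *my_sink_alloc(AVFilterContext *ctx, void *v, int w, int h)
    {
        /* A real implementation would return a frame backed by caller-owned
         * memory (e.g. a buffer pool passed in via 'v'). */
        AVFrame *f = av_frame_alloc();
        if (!f)
            return NULL;
        f->width  = w;
        f->height = h;
        f->format = AV_PIX_FMT_YUV420P;   /* assumption for this sketch */
        if (av_frame_get_buffer(f, 0) < 0) {
            av_frame_free(&f);
            return NULL;
        }
        return f;
    }

    /* Registration, once the buffersink filter context exists:
     *   av_buffersink_set_alloc_video_frame(sink_ctx, my_sink_alloc, my_pool);
     */
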
diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c
index 306c283f77..d3c82aabf3 100644
--- a/libavfilter/buffersink.c
+++ b/libavfilter/buffersink.c
@@ -62,6 +62,11 @@ typedef struct BufferSinkContext {
int sample_rates_size;
AVFrame *peeked_frame;
+
+ union {
+ av_buffersink_alloc_video_frame * video;
+ } alloc_cb;
+ void * alloc_v;
} BufferSinkContext;
#define NB_ITEMS(list) (list ## _size / sizeof(*list))
@@ -154,6 +159,44 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx,
return get_frame_internal(ctx, frame, 0, nb_samples);
}
+static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h)
+{
+ AVFilterContext * const ctx = link->dst;
+ BufferSinkContext * const bs = ctx->priv;
+ return bs->alloc_cb.video ? bs->alloc_cb.video(ctx, bs->alloc_v, w, h) :
+ ff_default_get_video_buffer(link, w, h);
+}
+
+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v)
+{
+ BufferSinkContext * const bs = ctx->priv;
+ bs->alloc_cb.video = cb;
+ bs->alloc_v = v;
+ return 0;
+}
+
+#if FF_API_BUFFERSINK_ALLOC
+AVBufferSinkParams *av_buffersink_params_alloc(void)
+{
+ static const int pixel_fmts[] = { AV_PIX_FMT_NONE };
+ AVBufferSinkParams *params = av_malloc(sizeof(AVBufferSinkParams));
+ if (!params)
+ return NULL;
+
+ params->pixel_fmts = pixel_fmts;
+ return params;
+}
+
+AVABufferSinkParams *av_abuffersink_params_alloc(void)
+{
+ AVABufferSinkParams *params = av_mallocz(sizeof(AVABufferSinkParams));
+
+ if (!params)
+ return NULL;
+ return params;
+}
+#endif
+
static av_cold int common_init(AVFilterContext *ctx)
{
BufferSinkContext *buf = ctx->priv;
@@ -381,6 +424,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = {
{
.name = "default",
.type = AVMEDIA_TYPE_VIDEO,
+ .get_buffer = {.video = alloc_video_buffer},
},
};
diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h
index 64e08de53e..09737d322f 100644
--- a/libavfilter/buffersink.h
+++ b/libavfilter/buffersink.h
@@ -166,6 +166,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame);
*/
int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples);
+typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h);
+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v);
+
/**
* @}
*/
From a63ae21e74ae48f1aedac53c18142b7596d041ad Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 30 Jan 2023 17:23:12 +0000
Subject: [PATCH 115/161] v4l2_m2m_dec: Add a profile check
Check the profile in avctx against what the v4l2 driver advertises. If
the driver doesn't support the check then just accept anything.
(cherry picked from commit 6dd83dead9ebce419fdea152db0c9f5e9a94e9ef)
---
libavcodec/v4l2_m2m_dec.c | 125 ++++++++++++++++++++++++++++++++++++++
1 file changed, 125 insertions(+)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index c8ab883d7e..098adf4821 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -715,6 +715,127 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
}
#endif
+static uint32_t
+avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile)
+{
+ switch (codec_id) {
+ case AV_CODEC_ID_H264:
+ switch (avprofile) {
+ case FF_PROFILE_H264_BASELINE:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE;
+ case FF_PROFILE_H264_CONSTRAINED_BASELINE:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE;
+ case FF_PROFILE_H264_MAIN:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN;
+ case FF_PROFILE_H264_EXTENDED:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED;
+ case FF_PROFILE_H264_HIGH:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH;
+ case FF_PROFILE_H264_HIGH_10:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10;
+ case FF_PROFILE_H264_HIGH_10_INTRA:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA;
+ case FF_PROFILE_H264_MULTIVIEW_HIGH:
+ case FF_PROFILE_H264_HIGH_422:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422;
+ case FF_PROFILE_H264_HIGH_422_INTRA:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA;
+ case FF_PROFILE_H264_STEREO_HIGH:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH;
+ case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE;
+ case FF_PROFILE_H264_HIGH_444_INTRA:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA;
+ case FF_PROFILE_H264_CAVLC_444:
+ return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA;
+ case FF_PROFILE_H264_HIGH_444:
+ default:
+ break;
+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE = 12,
+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH = 13,
+// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14,
+// V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16,
+// V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17,
+ }
+ break;
+ case AV_CODEC_ID_MPEG2VIDEO:
+ case AV_CODEC_ID_MPEG4:
+ case AV_CODEC_ID_VC1:
+ case AV_CODEC_ID_VP8:
+ case AV_CODEC_ID_VP9:
+ case AV_CODEC_ID_AV1:
+ // Most profiles are a simple number that matches the V4L2 enum
+ return avprofile;
+ default:
+ break;
+ }
+ return ~(uint32_t)0;
+}
+
+// This check mirrors Chrome's profile check by testing to see if the profile
+// exists as a possible value for the V4L2 profile control
+static int
+check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
+{
+ struct v4l2_queryctrl query_ctrl;
+ struct v4l2_querymenu query_menu;
+ uint32_t profile_id;
+
+ // An unset profile is almost certainly zero or -99 - do not reject
+ if (avctx->profile <= 0) {
+ av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n");
+ return 0;
+ }
+
+ memset(&query_ctrl, 0, sizeof(query_ctrl));
+ switch (avctx->codec_id) {
+ case AV_CODEC_ID_MPEG2VIDEO:
+ profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE;
+ break;
+ case AV_CODEC_ID_MPEG4:
+ profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE;
+ break;
+ case AV_CODEC_ID_H264:
+ profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE;
+ break;
+ case AV_CODEC_ID_VP8:
+ profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE;
+ break;
+ case AV_CODEC_ID_VP9:
+ profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE;
+ break;
+#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE
+ case AV_CODEC_ID_AV1:
+ profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE;
+ break;
+#endif
+ default:
+ av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id);
+ return 0;
+ }
+
+ query_ctrl = (struct v4l2_queryctrl){.id = profile_id};
+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) {
+ av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id);
+ }
+ else {
+ av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id);
+
+ query_menu = (struct v4l2_querymenu){
+ .id = query_ctrl.id,
+ .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile),
+ };
+
+ if (query_menu.index > query_ctrl.maximum ||
+ query_menu.index < query_ctrl.minimum ||
+ ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) {
+ return AVERROR(ENOENT);
+ }
+ }
+
+ return 0;
+}
+
static int
check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
{
@@ -955,6 +1076,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
if ((ret = check_size(avctx, s)) != 0)
return ret;
+ if ((ret = check_profile(avctx, s)) != 0) {
+ av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile);
+ return ret;
+ }
return 0;
}
From f734a6ead04a8381fccfae53066866a02a9516d2 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 1 Feb 2023 17:24:39 +0000
Subject: [PATCH 116/161] v4l2_m2m_dec: Add extradata parse for h264 & hevc
If we have extradata we can extract profile & level and potentially
other useful info from it. Use the codec parser to get it if the decoder
is configured.
(cherry picked from commit 6d431e79adeb246c2ed8cebce9011d81175a3906)
---
libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 83 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 098adf4821..e64bc707d3 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -21,6 +21,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
+
#include <linux/videodev2.h>
#include <sys/ioctl.h>
@@ -43,6 +45,13 @@
#include "v4l2_fmt.h"
#include "v4l2_req_dmabufs.h"
+#if CONFIG_H264_DECODER
+#include "h264_parse.h"
+#endif
+#if CONFIG_HEVC_DECODER
+#include "hevc_parse.h"
+#endif
+
// Pick 64 for max last count - that is >1sec at 60fps
#define STATS_LAST_COUNT_MAX 64
#define STATS_INTERVAL_MAX (1 << 30)
@@ -956,6 +965,78 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx)
return size + (1 << 16);
}
+static void
+parse_extradata(AVCodecContext *avctx)
+{
+ if (!avctx->extradata || !avctx->extradata_size)
+ return;
+
+ switch (avctx->codec_id) {
+#if CONFIG_H264_DECODER
+ case AV_CODEC_ID_H264:
+ {
+ H264ParamSets ps = {{NULL}};
+ int is_avc = 0;
+ int nal_length_size = 0;
+ int ret;
+
+ ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
+ &ps, &is_avc, &nal_length_size,
+ avctx->err_recognition, avctx);
+ if (ret > 0) {
+ const SPS * sps = NULL;
+ unsigned int i;
+ for (i = 0; i != MAX_SPS_COUNT; ++i) {
+ if (ps.sps_list[i]) {
+ sps = (const SPS *)ps.sps_list[i]->data;
+ break;
+ }
+ }
+ if (sps) {
+ avctx->profile = ff_h264_get_profile(sps);
+ avctx->level = sps->level_idc;
+ }
+ }
+ ff_h264_ps_uninit(&ps);
+ break;
+ }
+#endif
+#if CONFIG_HEVC_DECODER
+ case AV_CODEC_ID_HEVC:
+ {
+ HEVCParamSets ps = {{NULL}};
+ HEVCSEI sei = {{{{0}}}};
+ int is_nalff = 0;
+ int nal_length_size = 0;
+ int ret;
+
+ ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
+ &ps, &sei, &is_nalff, &nal_length_size,
+ avctx->err_recognition, 0, avctx);
+ if (ret > 0) {
+ const HEVCSPS * sps = NULL;
+ unsigned int i;
+ for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) {
+ if (ps.sps_list[i]) {
+ sps = (const HEVCSPS *)ps.sps_list[i]->data;
+ break;
+ }
+ }
+ if (sps) {
+ avctx->profile = sps->ptl.general_ptl.profile_idc;
+ avctx->level = sps->ptl.general_ptl.level_idc;
+ }
+ }
+ ff_hevc_ps_uninit(&ps);
+ ff_hevc_reset_sei(&sei);
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+}
+
static av_cold int v4l2_decode_init(AVCodecContext *avctx)
{
V4L2Context *capture, *output;
@@ -976,7 +1057,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
avctx->ticks_per_frame = 2;
}
- av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
+ parse_extradata(avctx);
+
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
From e28421e397743a94f5e37327ad234f59b6ae613d Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 20 Mar 2023 18:12:51 +0000
Subject: [PATCH 117/161] clean_usr_libs: Now wipes the include files too
When swapping ffmpeg versions, obsolete makefiles could confuse
configure utilities.
---
pi-util/clean_usr_libs.sh | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh
index b3b2d5509d..01bd6a6a22 100755
--- a/pi-util/clean_usr_libs.sh
+++ b/pi-util/clean_usr_libs.sh
@@ -1,4 +1,20 @@
set -e
+U=/usr/include/arm-linux-gnueabihf
+rm -rf $U/libavcodec
+rm -rf $U/libavdevice
+rm -rf $U/libavfilter
+rm -rf $U/libavformat
+rm -rf $U/libavutil
+rm -rf $U/libswresample
+rm -rf $U/libswscale
+U=/usr/include/aarch64-linux-gnu
+rm -rf $U/libavcodec
+rm -rf $U/libavdevice
+rm -rf $U/libavfilter
+rm -rf $U/libavformat
+rm -rf $U/libavutil
+rm -rf $U/libswresample
+rm -rf $U/libswscale
U=/usr/lib/arm-linux-gnueabihf
rm -f $U/libavcodec.*
rm -f $U/libavdevice.*
From dcabd30310b88b45359609bac27d5d0f9bbc6dc1 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 20 Mar 2023 18:15:08 +0000
Subject: [PATCH 118/161] vulkan: Add missing decode extension defines
When building on bookworm, the video decode extension names
were missing. This adds them. I expect this patch will be
obsolete shortly but it solves a current problem.
---
libavutil/hwcontext_vulkan.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 2a9b5f4aac..11e7945f18 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -57,6 +57,14 @@
#define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x)
#endif
+// Sometimes missing definitions
+#ifndef VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME
+#define VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME "VK_EXT_video_decode_h264"
+#endif
+#ifndef VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME
+#define VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME "VK_EXT_video_decode_h265"
+#endif
+
typedef struct VulkanQueueCtx {
VkFence fence;
VkQueue queue;
From 0231c208843a5badc799590eb5b9de907d1c26b2 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 21 Mar 2023 14:20:05 +0000
Subject: [PATCH 119/161] v4l2_m2m_dec: Fix config file for finding if decoder
enabled
Fixes parsing of extradata for profile testing. 5.x changed where that
info is defined.
---
libavcodec/v4l2_m2m_dec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index e64bc707d3..91136f03da 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -21,7 +21,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "config.h"
+#include "config_components.h"
#include <linux/videodev2.h>
#include <sys/ioctl.h>
From 822baefed69372b3380144ab44226e2c6ad3e298 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 21 Mar 2023 14:23:20 +0000
Subject: [PATCH 120/161] v4l2_m2m_dec: Display profile given if skipped in
debug
---
libavcodec/v4l2_m2m_dec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 91136f03da..d124c7b1fc 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -792,7 +792,7 @@ check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
// An unset profile is almost certainly zero or -99 - do not reject
if (avctx->profile <= 0) {
- av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n");
+ av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile);
return 0;
}
From 6859fc2a8791c0fcc25851b77fed15a691ceb332 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 22 Mar 2023 16:08:08 +0000
Subject: [PATCH 121/161] conf_native: Fix for 64-bit kernel with 32-bit
userspace
(cherry picked from commit 5bb1e09cea95b4215c6904b9b1a726e83bc5d327)
---
pi-util/conf_native.sh | 32 +++++++++++++++++++++-----------
1 file changed, 21 insertions(+), 11 deletions(-)
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
index 082d9b5832..0a7d230f1b 100755
--- a/pi-util/conf_native.sh
+++ b/pi-util/conf_native.sh
@@ -33,18 +33,28 @@ RPI_LIBDIRS=
RPI_DEFINES=
RPI_EXTRALIBS=
-if [ "$MC" == "arm64" ]; then
- echo "M/C aarch64"
- A=aarch64-linux-gnu
- B=arm64
-elif [ "$MC" == "armhf" ]; then
- echo "M/C armv7"
- A=arm-linux-gnueabihf
- B=armv7
- MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
- RPI_DEFINES=-mfpu=neon-vfpv4
+# uname -m gives kernel type which may not have the same
+# 32/64bitness as userspace :-( getconf should provide the answer
+# but use uname to check we are on the right processor
+MC=`uname -m`
+LB=`getconf LONG_BIT`
+if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then
+ if [ "$LB" == "32" ]; then
+ echo "M/C armv7"
+ A=arm-linux-gnueabihf
+ B=armv7
+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
+ RPI_DEFINES=-mfpu=neon-vfpv4
+ elif [ "$LB" == "64" ]; then
+ echo "M/C aarch64"
+ A=aarch64-linux-gnu
+ B=arm64
+ else
+ echo "Unknown LONG_BIT name: $LB"
+ exit 1
+ fi
else
- echo Unexpected architecture $MC
+ echo "Unknown machine name: $MC"
exit 1
fi
From c35f074854a922c0c025159ddddd1abfc562a3d2 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 20 Apr 2023 11:48:25 +0000
Subject: [PATCH 122/161] conf_native: Add install prefix variation
(cherry picked from commit 73c3019b534cb8f4b4e4c21995653f6ce440086d)
---
pi-util/BUILD.txt | 32 ++++++++++++++++++++------------
pi-util/conf_native.sh | 14 ++++++++++++--
2 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
index b050971f63..2b62d660c0 100644
--- a/pi-util/BUILD.txt
+++ b/pi-util/BUILD.txt
@@ -24,6 +24,8 @@ There are a few choices here
paths being confused and therefore running the wrong code, Shared
is what is needed, in most cases, when building for use by other
programs.
+ --usr Set install dir to /usr (i.e. system default) rather than in
+ <builddir>/install
So for a static build
---------------------
@@ -37,23 +39,29 @@ You can now run ffmpeg directly from where it was built
For a shared build
------------------
-$ pi-util/conf_native.sh
-
-You will normally want an install target if shared. Note that the script has
-set this up to be generated in out/<builddir>/install, you don't have to worry
-about overwriting your system libs.
+There are two choices here
+$ pi-util/conf_native.sh
$ make -j8 -C out/<builddir> install
-You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
-built or install the image on the system - you have to be careful to get rid
-of all other ffmpeg libs or confusion may result. There is a little script
-that wipes all other versions - obviously use with care!
+This sets the install prefix to <builddir>/install and is probably what you
+want if you don't want to overwrite the system files.
-$ sudo pi-util/clean_usr_libs.sh
+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
+built. You can copy the contents of <build dir>/install to /usr and that mostly
+works. The only downside is that paths in pkgconfig end up being set to the
+install directory in your build directory which may be less than ideal when
+building other packages.
-Then simply copying from the install to /usr works
+The alternative if you just want to replace the system libs is:
-$ sudo cp -r out/<builddir>/install/* /usr
+$ pi-util/conf_native.sh --usr
+$ make -j8 -C out/<builddir>
+$ sudo pi-util/clean_usr_libs.sh
+$ sudo make -j8 -C out/<builddir> install
+The clean_usr_libs.sh step wipes any existing libs & includes (for all
+architectures) from the system, which helps avoid confusion when running
+other progs: you can be sure you're not running old code, which is
+unfortunately easy to do otherwise.
diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
index 0a7d230f1b..f0ed159594 100755
--- a/pi-util/conf_native.sh
+++ b/pi-util/conf_native.sh
@@ -9,6 +9,7 @@ RPI_KEEPS=""
NOSHARED=
MMAL=
+USR_PREFIX=
while [ "$1" != "" ] ; do
case $1 in
@@ -18,8 +19,14 @@ while [ "$1" != "" ] ; do
--mmal)
MMAL=1
;;
+ --usr)
+ USR_PREFIX=/usr
+ ;;
*)
- echo "Usage $0: [--noshared] [--mmal]"
+ echo "Usage $0: [--noshared] [--mmal] [--usr]"
+ echo " noshared Build static libs and executable - good for testing"
+ echo " mmal Build mmal decoders"
+ echo " usr Set install prefix to /usr [default=<build-dir>/install]"
exit 1
;;
esac
@@ -82,7 +89,9 @@ else
OUT=$BUILDBASE/$B-$C-$V-shared-rel
fi
-USR_PREFIX=$OUT/install
+if [ ! $USR_PREFIX ]; then
+ USR_PREFIX=$OUT/install
+fi
LIB_PREFIX=$USR_PREFIX/lib/$A
INC_PREFIX=$USR_PREFIX/include/$A
@@ -113,6 +122,7 @@ $FFSRC/configure \
--extra-libs="$RPI_EXTRALIBS"\
--extra-version="rpi"
+echo "Configured into $OUT"
# gcc option for getting asm listing
# -Wa,-ahls
From 91ea652a95370a428f1353932b2a55dae7158acc Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 19 Apr 2023 10:47:58 +0000
Subject: [PATCH 123/161] swscale: Add explicit bgr24->yv12 conversion
(cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434)
---
libswscale/rgb2rgb.c | 5 +++++
libswscale/rgb2rgb.h | 7 +++++++
libswscale/rgb2rgb_template.c | 36 ++++++++++++++++++++++++++++++-----
libswscale/swscale_unscaled.c | 22 +++++++++++++++++++++
4 files changed, 65 insertions(+), 5 deletions(-)
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index e98fdac8ea..84bb56e60e 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -83,6 +83,11 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
+void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index f3951d523e..0028ab345f 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -79,6 +79,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
+void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
@@ -128,6 +131,10 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
+extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 42c69801ba..e2437826dd 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
- int chromStride, int srcStride, int32_t *rgb2yuv)
+ int chromStride, int srcStride, int32_t *rgb2yuv,
+ const uint8_t x[9])
{
- int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
- int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
- int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
+ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
+ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
int y;
const int chromWidth = width >> 1;
@@ -707,6 +708,30 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
}
}
+void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ static const uint8_t x[9] = {
+ RY_IDX, GY_IDX, BY_IDX,
+ RU_IDX, GU_IDX, BU_IDX,
+ RV_IDX, GV_IDX, BV_IDX,
+ };
+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
+}
+
+void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ static const uint8_t x[9] = {
+ BY_IDX, GY_IDX, RY_IDX,
+ BU_IDX, GU_IDX, RU_IDX,
+ BV_IDX, GV_IDX, RV_IDX,
+ };
+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
+}
+
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
@@ -980,6 +1005,7 @@ static av_cold void rgb2rgb_init_c(void)
yuy2toyv12 = yuy2toyv12_c;
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
+ ff_bgr24toyv12 = ff_bgr24toyv12_c;
interleaveBytes = interleaveBytes_c;
deinterleaveBytes = deinterleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 9af2e7ecc3..9047030ae4 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1654,6 +1654,23 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
return srcSliceH;
}
+static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ ff_bgr24toyv12(
+ src[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY >> 1) * dstStride[1],
+ dst[2] + (srcSliceY >> 1) * dstStride[2],
+ c->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ c->input_rgb2yuv_table);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
@@ -2037,6 +2054,11 @@ void ff_get_unscaled_swscale(SwsContext *c)
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
!(flags & SWS_ACCURATE_RND) && !(dstW&1))
c->convert_unscaled = bgr24ToYv12Wrapper;
+ /* rgb24toYV12 */
+ if (srcFormat == AV_PIX_FMT_RGB24 &&
+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+ !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ c->convert_unscaled = rgb24ToYv12Wrapper;
/* RGB/BGR -> RGB/BGR (no dither needed forms) */
if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
From 207ea47b2153b276b53cd5a87528dbc532a9f551 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 20 Apr 2023 11:26:10 +0000
Subject: [PATCH 124/161] swscale: Add unscaled XRGB->YUV420P functions
(cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8)
---
libswscale/rgb2rgb.c | 20 ++++++
libswscale/rgb2rgb.h | 16 +++++
libswscale/rgb2rgb_template.c | 123 ++++++++++++++++++++++++++++++----
libswscale/swscale_unscaled.c | 89 ++++++++++++++++++++++++
4 files changed, 236 insertions(+), 12 deletions(-)
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 84bb56e60e..c3b9079d2b 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -88,6 +88,26 @@ void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
+void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
+void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
+void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
+void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 0028ab345f..a0dd3ffb79 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -135,6 +135,22 @@ extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
+extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
+extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
+extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
+extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index e2437826dd..703de90690 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -708,30 +708,125 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
}
}
+static const uint8_t x_rgb[9] = {
+ RY_IDX, GY_IDX, BY_IDX,
+ RU_IDX, GU_IDX, BU_IDX,
+ RV_IDX, GV_IDX, BV_IDX,
+};
+
+static const uint8_t x_bgr[9] = {
+ BY_IDX, GY_IDX, RY_IDX,
+ BU_IDX, GU_IDX, RU_IDX,
+ BV_IDX, GV_IDX, RV_IDX,
+};
+
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
- static const uint8_t x[9] = {
- RY_IDX, GY_IDX, BY_IDX,
- RU_IDX, GU_IDX, BU_IDX,
- RV_IDX, GV_IDX, BV_IDX,
- };
- rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
}
void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
- static const uint8_t x[9] = {
- BY_IDX, GY_IDX, RY_IDX,
- BU_IDX, GU_IDX, RU_IDX,
- BV_IDX, GV_IDX, RV_IDX,
- };
- rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
+ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
}
+static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv,
+ const uint8_t x[9])
+{
+ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
+ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
+ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
+ int y;
+ const int chromWidth = width >> 1;
+
+ for (y = 0; y < height; y += 2) {
+ int i;
+ for (i = 0; i < chromWidth; i++) {
+ unsigned int b = src[8 * i + 2];
+ unsigned int g = src[8 * i + 1];
+ unsigned int r = src[8 * i + 0];
+
+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
+ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
+
+ udst[i] = U;
+ vdst[i] = V;
+ ydst[2 * i] = Y;
+
+ b = src[8 * i + 6];
+ g = src[8 * i + 5];
+ r = src[8 * i + 4];
+
+ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ ydst[2 * i + 1] = Y;
+ }
+ ydst += lumStride;
+ src += srcStride;
+
+ if (y+1 == height)
+ break;
+
+ for (i = 0; i < chromWidth; i++) {
+ unsigned int b = src[8 * i + 2];
+ unsigned int g = src[8 * i + 1];
+ unsigned int r = src[8 * i + 0];
+
+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+
+ ydst[2 * i] = Y;
+
+ b = src[8 * i + 6];
+ g = src[8 * i + 5];
+ r = src[8 * i + 4];
+
+ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ ydst[2 * i + 1] = Y;
+ }
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+}
+
+static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
+}
+
+static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
+}
+
+// As the general code does no SIMD-like ops, simply adding 1 to the src address
+// will fix the ignored alpha position
+static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
+}
+
+static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
+}
+
+
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
@@ -1006,6 +1101,10 @@ static av_cold void rgb2rgb_init_c(void)
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
ff_bgr24toyv12 = ff_bgr24toyv12_c;
+ ff_rgbxtoyv12 = ff_rgbxtoyv12_c;
+ ff_bgrxtoyv12 = ff_bgrxtoyv12_c;
+ ff_xrgbtoyv12 = ff_xrgbtoyv12_c;
+ ff_xbgrtoyv12 = ff_xbgrtoyv12_c;
interleaveBytes = interleaveBytes_c;
deinterleaveBytes = deinterleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 9047030ae4..053c06adf5 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1671,6 +1671,74 @@ static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
return srcSliceH;
}
+static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ ff_bgrxtoyv12(
+ src[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY >> 1) * dstStride[1],
+ dst[2] + (srcSliceY >> 1) * dstStride[2],
+ c->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ c->input_rgb2yuv_table);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
+static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ ff_rgbxtoyv12(
+ src[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY >> 1) * dstStride[1],
+ dst[2] + (srcSliceY >> 1) * dstStride[2],
+ c->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ c->input_rgb2yuv_table);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
+static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ ff_xbgrtoyv12(
+ src[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY >> 1) * dstStride[1],
+ dst[2] + (srcSliceY >> 1) * dstStride[2],
+ c->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ c->input_rgb2yuv_table);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
+static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ ff_xrgbtoyv12(
+ src[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY >> 1) * dstStride[1],
+ dst[2] + (srcSliceY >> 1) * dstStride[2],
+ c->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ c->input_rgb2yuv_table);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
@@ -2060,6 +2128,27 @@ void ff_get_unscaled_swscale(SwsContext *c)
!(flags & SWS_ACCURATE_RND) && !(dstW&1))
c->convert_unscaled = rgb24ToYv12Wrapper;
+ /* bgrxtoYV12 */
+ if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) ||
+ (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+ !(flags & SWS_ACCURATE_RND))
+ c->convert_unscaled = bgrxToYv12Wrapper;
+ /* rgbx24toYV12 */
+ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
+ (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+ !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ c->convert_unscaled = rgbxToYv12Wrapper;
+ /* xbgrtoYV12 */
+ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
+ (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+ !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ c->convert_unscaled = xbgrToYv12Wrapper;
+ /* xrgb24toYV12 */
+ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
+ (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+ !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ c->convert_unscaled = xrgbToYv12Wrapper;
+
/* RGB/BGR -> RGB/BGR (no dither needed forms) */
if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
&& (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
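As an aside, a minimal sketch of how the new unscaled path added here would be reached from the public API: identical source and destination sizes, RGBA in, YUV420P out, an even width (the odd-width restriction is lifted later in this series) and no SWS_ACCURATE_RND in the flags. Error handling is trimmed and the function is illustrative only:

#include <libswscale/swscale.h>
#include <libavutil/imgutils.h>
#include <libavutil/mem.h>

int rgba_to_yuv420p(const uint8_t *rgba, int w, int h)
{
    uint8_t *dst[4];
    int dst_stride[4];
    const uint8_t *src[4] = { rgba, NULL, NULL, NULL };
    const int src_stride[4] = { w * 4, 0, 0, 0 };

    struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_RGBA,
                                            w, h, AV_PIX_FMT_YUV420P,
                                            SWS_BILINEAR, NULL, NULL, NULL);
    if (!sws)
        return -1;
    if (av_image_alloc(dst, dst_stride, w, h, AV_PIX_FMT_YUV420P, 16) < 0) {
        sws_freeContext(sws);
        return -1;
    }

    /* With matching sizes and no SWS_ACCURATE_RND this is expected to hit
     * rgbxToYv12Wrapper rather than the general scaler. */
    sws_scale(sws, src, src_stride, 0, h, dst, dst_stride);

    av_freep(&dst[0]);
    sws_freeContext(sws);
    return 0;
}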
From b5672a2d361ec4f064ae116a3452282996cc87a0 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 20 Apr 2023 11:35:44 +0000
Subject: [PATCH 125/161] swscale: Add aarch64 unscaled RGB24->YUV420P
(cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160)
---
libswscale/aarch64/rgb2rgb.c | 40 +++++++
libswscale/aarch64/rgb2rgb_neon.S | 181 ++++++++++++++++++++++++++++++
2 files changed, 221 insertions(+)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index a9bf6ff9e0..6d3e0000dc 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -30,6 +30,44 @@
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
+void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv);
+void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv);
+
+// RGB to YUV asm fns process 16 pixels at once so ensure that the output
+// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so
+// don't test for that
+// Fall back to C if we cannot use asm
+
+static inline int chkw(const int width, const int lumStride, const int chromStride)
+{
+ const int aw = FFALIGN(width, 16);
+ return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
+}
+
+static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv)
+{
+ if (chkw(width, lumStride, chromStride))
+ ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
+ else
+ ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
+}
+
+static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *bgr2yuv)
+{
+ if (chkw(width, lumStride, chromStride))
+ ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
+ else
+ ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
+}
+
av_cold void rgb2rgb_init_aarch64(void)
{
@@ -37,5 +75,7 @@ av_cold void rgb2rgb_init_aarch64(void)
if (have_neon(cpu_flags)) {
interleaveBytes = ff_interleave_bytes_neon;
+ ff_rgb24toyv12 = rgb24toyv12_check;
+ ff_bgr24toyv12 = bgr24toyv12_check;
}
}
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index d81110ec57..8cf40b65f5 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -77,3 +77,184 @@ function ff_interleave_bytes_neon, export=1
0:
ret
endfunc
+
+// void ff_rgb24toyv12_aarch64(
+// const uint8_t *src, // x0
+// uint8_t *ydst, // x1
+// uint8_t *udst, // x2
+// uint8_t *vdst, // x3
+// int width, // w4
+// int height, // w5
+// int lumStride, // w6
+// int chromStride, // w7
+// int srcStr, // [sp, #0]
+// int32_t *rgb2yuv); // [sp, #8]
+
+function ff_rgb24toyv12_aarch64, export=1
+ ldr x15, [sp, #8]
+ ld1 {v3.s}[2], [x15], #4
+ ld1 {v3.s}[1], [x15], #4
+ ld1 {v3.s}[0], [x15], #4
+ ld1 {v4.s}[2], [x15], #4
+ ld1 {v4.s}[1], [x15], #4
+ ld1 {v4.s}[0], [x15], #4
+ ld1 {v5.s}[2], [x15], #4
+ ld1 {v5.s}[1], [x15], #4
+ ld1 {v5.s}[0], [x15]
+ b 99f
+endfunc
+
+// void ff_bgr24toyv12_aarch64(
+// const uint8_t *src, // x0
+// uint8_t *ydst, // x1
+// uint8_t *udst, // x2
+// uint8_t *vdst, // x3
+// int width, // w4
+// int height, // w5
+// int lumStride, // w6
+// int chromStride, // w7
+// int srcStr, // [sp, #0]
+// int32_t *rgb2yuv); // [sp, #8]
+
+function ff_bgr24toyv12_aarch64, export=1
+ ldr x15, [sp, #8]
+ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12
+ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12
+ ld3 {v3.s, v4.s, v5.s}[2], [x15]
+99:
+ ldr w14, [sp, #0]
+ movi v18.8b, #128
+ uxtl v17.8h, v18.8b
+
+ // Even line - YUV
+1:
+ mov x10, x0
+ mov x11, x1
+ mov x12, x2
+ mov x13, x3
+ mov w9, w4
+
+0:
+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
+
+ uxtl2 v20.8h, v0.16b
+ uxtl2 v21.8h, v1.16b
+ uxtl2 v22.8h, v2.16b
+
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ // Y0
+ smull v6.4s, v0.4h, v3.h[0]
+ smull2 v7.4s, v0.8h, v3.h[0]
+ smlal v6.4s, v1.4h, v4.h[0]
+ smlal2 v7.4s, v1.8h, v4.h[0]
+ smlal v6.4s, v2.4h, v5.h[0]
+ smlal2 v7.4s, v2.8h, v5.h[0]
+ shrn v6.4h, v6.4s, #12
+ shrn2 v6.8h, v7.4s, #12
+ add v6.8h, v6.8h, v17.8h // +128 (>> 3 = 16)
+ uqrshrn v16.8b, v6.8h, #3
+ // Y1
+ smull v6.4s, v20.4h, v3.h[0]
+ smull2 v7.4s, v20.8h, v3.h[0]
+ smlal v6.4s, v21.4h, v4.h[0]
+ smlal2 v7.4s, v21.8h, v4.h[0]
+ smlal v6.4s, v22.4h, v5.h[0]
+ smlal2 v7.4s, v22.8h, v5.h[0]
+ shrn v6.4h, v6.4s, #12
+ shrn2 v6.8h, v7.4s, #12
+ add v6.8h, v6.8h, v17.8h
+ uqrshrn2 v16.16b, v6.8h, #3
+ // Y0/Y1
+ st1 {v16.16b}, [x11], #16
+
+ uzp1 v0.8h, v0.8h, v20.8h
+ uzp1 v1.8h, v1.8h, v21.8h
+ uzp1 v2.8h, v2.8h, v22.8h
+
+ // U
+ // Vector subscript *2 as we loaded into S but are only using H
+ smull v6.4s, v0.4h, v3.h[2]
+ smull2 v7.4s, v0.8h, v3.h[2]
+ smlal v6.4s, v1.4h, v4.h[2]
+ smlal2 v7.4s, v1.8h, v4.h[2]
+ smlal v6.4s, v2.4h, v5.h[2]
+ smlal2 v7.4s, v2.8h, v5.h[2]
+ shrn v6.4h, v6.4s, #14
+ shrn2 v6.8h, v7.4s, #14
+ sqrshrn v6.8b, v6.8h, #1
+ add v6.8b, v6.8b, v18.8b // +128
+ st1 {v6.8b}, [x12], #8
+
+ // V
+ smull v6.4s, v0.4h, v3.h[4]
+ smull2 v7.4s, v0.8h, v3.h[4]
+ smlal v6.4s, v1.4h, v4.h[4]
+ smlal2 v7.4s, v1.8h, v4.h[4]
+ smlal v6.4s, v2.4h, v5.h[4]
+ smlal2 v7.4s, v2.8h, v5.h[4]
+ shrn v6.4h, v6.4s, #14
+ shrn2 v6.8h, v7.4s, #14
+ sqrshrn v6.8b, v6.8h, #1
+ add v6.8b, v6.8b, v18.8b // +128
+ st1 {v6.8b}, [x13], #8
+
+ subs w9, w9, #16
+ b.gt 0b
+
+ // Odd line - Y only
+
+ add x0, x0, w14, SXTX
+ add x1, x1, w6, SXTX
+ mov x10, x0
+ mov x11, x1
+ mov w9, w4
+
+0:
+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
+
+ uxtl2 v20.8h, v0.16b
+ uxtl2 v21.8h, v1.16b
+ uxtl2 v22.8h, v2.16b
+
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ // Y0
+ smull v6.4s, v0.4h, v3.h[0]
+ smull2 v7.4s, v0.8h, v3.h[0]
+ smlal v6.4s, v1.4h, v4.h[0]
+ smlal2 v7.4s, v1.8h, v4.h[0]
+ smlal v6.4s, v2.4h, v5.h[0]
+ smlal2 v7.4s, v2.8h, v5.h[0]
+ shrn v6.4h, v6.4s, #12
+ shrn2 v6.8h, v7.4s, #12
+ add v6.8h, v6.8h, v17.8h
+ uqrshrn v16.8b, v6.8h, #3
+ // Y1
+ smull v6.4s, v20.4h, v3.h[0]
+ smull2 v7.4s, v20.8h, v3.h[0]
+ smlal v6.4s, v21.4h, v4.h[0]
+ smlal2 v7.4s, v21.8h, v4.h[0]
+ smlal v6.4s, v22.4h, v5.h[0]
+ smlal2 v7.4s, v22.8h, v5.h[0]
+ shrn v6.4h, v6.4s, #12
+ shrn2 v6.8h, v7.4s, #12
+ add v6.8h, v6.8h, v17.8h
+ uqrshrn2 v16.16b, v6.8h, #3
+ // Y0/Y1
+ st1 {v16.16b}, [x11], #16
+
+ subs w9, w9, #16
+ b.gt 0b
+
+ add x0, x0, w14, SXTX
+ add x1, x1, w6, SXTX
+ add x2, x2, w7, SXTX
+ add x3, x3, w7, SXTX
+ subs w5, w5, #2
+ b.gt 1b
+
+ ret
+endfunc
From f62603136ee2eaf781519bd70e445b03f80960da Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 27 Apr 2023 13:03:52 +0000
Subject: [PATCH 126/161] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh
(cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d)
---
libswscale/aarch64/rgb2rgb.c | 5 +-
libswscale/aarch64/rgb2rgb_neon.S | 440 ++++++++++++++++++++++++------
2 files changed, 355 insertions(+), 90 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 6d3e0000dc..f10c4ef2de 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -44,8 +44,9 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
static inline int chkw(const int width, const int lumStride, const int chromStride)
{
- const int aw = FFALIGN(width, 16);
- return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
+// const int aw = FFALIGN(width, 16);
+// return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
+ return 1;
}
static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 8cf40b65f5..978ab443ea 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -116,6 +116,25 @@ endfunc
// int srcStr, // [sp, #0]
// int32_t *rgb2yuv); // [sp, #8]
+// regs
+// v0-2 Src bytes - reused as chroma src
+// v3-5 Coeffs (packed very inefficiently - could be squashed)
+// v6 128b
+// v7 128h
+// v8-15 Reserved
+// v16-18 Lo Src expanded as H
+// v19 -
+// v20-22 Hi Src expanded as H
+// v23 -
+// v24 U out
+// v25 U tmp
+// v26 Y out
+// v27-29 Y tmp
+// v30 V out
+// v31 V tmp
+
+// Assumes Little Endian in tail stores & conversion matrix
+
function ff_bgr24toyv12_aarch64, export=1
ldr x15, [sp, #8]
ld3 {v3.s, v4.s, v5.s}[0], [x15], #12
@@ -123,138 +142,383 @@ function ff_bgr24toyv12_aarch64, export=1
ld3 {v3.s, v4.s, v5.s}[2], [x15]
99:
ldr w14, [sp, #0]
- movi v18.8b, #128
- uxtl v17.8h, v18.8b
-
- // Even line - YUV
+ movi v7.8b, #128
+ uxtl v6.8h, v7.8b
+ // Ensure that if there is nothing to do then we do nothing
+ cmp w4, #0
+ b.le 90f
+ cmp w5, #0
+ b.le 90f
+ // If w % 16 != 0 then subtract 16 so the main loop runs one fewer time,
+ // with the remainder done in the tail
+ tst w4, #15
+ b.eq 1f
+ sub w4, w4, #16
1:
+
+// -------------------- Even line body - YUV
+11:
+ subs w9, w4, #0
mov x10, x0
mov x11, x1
mov x12, x2
mov x13, x3
- mov w9, w4
+ b.lt 12f
-0:
ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
+ subs w9, w9, #16
+ b.le 13f
+
+10:
+ uxtl v16.8h, v0.8b
+ uxtl v17.8h, v1.8b
+ uxtl v18.8h, v2.8b
uxtl2 v20.8h, v0.16b
uxtl2 v21.8h, v1.16b
uxtl2 v22.8h, v2.16b
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- uxtl v2.8h, v2.8b
+ bic v0.8h, #0xff, LSL #8
+ bic v1.8h, #0xff, LSL #8
+ bic v2.8h, #0xff, LSL #8
+
+ // Testing shows it is faster to stack the smull/smlal ops together
+ // rather than interleave them between channels and indeed even the
+ // shift/add sections seem happier not interleaved
+
// Y0
- smull v6.4s, v0.4h, v3.h[0]
- smull2 v7.4s, v0.8h, v3.h[0]
- smlal v6.4s, v1.4h, v4.h[0]
- smlal2 v7.4s, v1.8h, v4.h[0]
- smlal v6.4s, v2.4h, v5.h[0]
- smlal2 v7.4s, v2.8h, v5.h[0]
- shrn v6.4h, v6.4s, #12
- shrn2 v6.8h, v7.4s, #12
- add v6.8h, v6.8h, v17.8h // +128 (>> 3 = 16)
- uqrshrn v16.8b, v6.8h, #3
+ smull v26.4s, v16.4h, v3.h[0]
+ smlal v26.4s, v17.4h, v4.h[0]
+ smlal v26.4s, v18.4h, v5.h[0]
+ smull2 v27.4s, v16.8h, v3.h[0]
+ smlal2 v27.4s, v17.8h, v4.h[0]
+ smlal2 v27.4s, v18.8h, v5.h[0]
// Y1
- smull v6.4s, v20.4h, v3.h[0]
- smull2 v7.4s, v20.8h, v3.h[0]
- smlal v6.4s, v21.4h, v4.h[0]
- smlal2 v7.4s, v21.8h, v4.h[0]
- smlal v6.4s, v22.4h, v5.h[0]
- smlal2 v7.4s, v22.8h, v5.h[0]
- shrn v6.4h, v6.4s, #12
- shrn2 v6.8h, v7.4s, #12
- add v6.8h, v6.8h, v17.8h
- uqrshrn2 v16.16b, v6.8h, #3
+ smull v28.4s, v20.4h, v3.h[0]
+ smlal v28.4s, v21.4h, v4.h[0]
+ smlal v28.4s, v22.4h, v5.h[0]
+ smull2 v29.4s, v20.8h, v3.h[0]
+ smlal2 v29.4s, v21.8h, v4.h[0]
+ smlal2 v29.4s, v22.8h, v5.h[0]
+ shrn v26.4h, v26.4s, #12
+ shrn2 v26.8h, v27.4s, #12
+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
+ uqrshrn v26.8b, v26.8h, #3
+ shrn v28.4h, v28.4s, #12
+ shrn2 v28.8h, v29.4s, #12
+ add v28.8h, v28.8h, v6.8h
+ uqrshrn2 v26.16b, v28.8h, #3
// Y0/Y1
- st1 {v16.16b}, [x11], #16
-
- uzp1 v0.8h, v0.8h, v20.8h
- uzp1 v1.8h, v1.8h, v21.8h
- uzp1 v2.8h, v2.8h, v22.8h
// U
// Vector subscript *2 as we loaded into S but are only using H
- smull v6.4s, v0.4h, v3.h[2]
- smull2 v7.4s, v0.8h, v3.h[2]
- smlal v6.4s, v1.4h, v4.h[2]
- smlal2 v7.4s, v1.8h, v4.h[2]
- smlal v6.4s, v2.4h, v5.h[2]
- smlal2 v7.4s, v2.8h, v5.h[2]
- shrn v6.4h, v6.4s, #14
- shrn2 v6.8h, v7.4s, #14
- sqrshrn v6.8b, v6.8h, #1
- add v6.8b, v6.8b, v18.8b // +128
- st1 {v6.8b}, [x12], #8
+ smull v24.4s, v0.4h, v3.h[2]
+ smlal v24.4s, v1.4h, v4.h[2]
+ smlal v24.4s, v2.4h, v5.h[2]
+ smull2 v25.4s, v0.8h, v3.h[2]
+ smlal2 v25.4s, v1.8h, v4.h[2]
+ smlal2 v25.4s, v2.8h, v5.h[2]
// V
- smull v6.4s, v0.4h, v3.h[4]
- smull2 v7.4s, v0.8h, v3.h[4]
- smlal v6.4s, v1.4h, v4.h[4]
- smlal2 v7.4s, v1.8h, v4.h[4]
- smlal v6.4s, v2.4h, v5.h[4]
- smlal2 v7.4s, v2.8h, v5.h[4]
- shrn v6.4h, v6.4s, #14
- shrn2 v6.8h, v7.4s, #14
- sqrshrn v6.8b, v6.8h, #1
- add v6.8b, v6.8b, v18.8b // +128
- st1 {v6.8b}, [x13], #8
+ smull v30.4s, v0.4h, v3.h[4]
+ smlal v30.4s, v1.4h, v4.h[4]
+ smlal v30.4s, v2.4h, v5.h[4]
+ smull2 v31.4s, v0.8h, v3.h[4]
+ smlal2 v31.4s, v1.8h, v4.h[4]
+ smlal2 v31.4s, v2.8h, v5.h[4]
+
+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
+
+ shrn v24.4h, v24.4s, #14
+ shrn2 v24.8h, v25.4s, #14
+ sqrshrn v24.8b, v24.8h, #1
+ add v24.8b, v24.8b, v7.8b // +128
+ shrn v30.4h, v30.4s, #14
+ shrn2 v30.8h, v31.4s, #14
+ sqrshrn v30.8b, v30.8h, #1
+ add v30.8b, v30.8b, v7.8b // +128
subs w9, w9, #16
- b.gt 0b
- // Odd line - Y only
+ st1 {v26.16b}, [x11], #16
+ st1 {v24.8b}, [x12], #8
+ st1 {v30.8b}, [x13], #8
+
+ b.gt 10b
+
+// -------------------- Even line tail - YUV
+// If width % 16 == 0 then simply runs once with preloaded RGB
+// If other then deals with preload & then does remaining tail
+
+13:
+ // Body is simple copy of main loop body minus preload
+
+ uxtl v16.8h, v0.8b
+ uxtl v17.8h, v1.8b
+ uxtl v18.8h, v2.8b
+
+ uxtl2 v20.8h, v0.16b
+ uxtl2 v21.8h, v1.16b
+ uxtl2 v22.8h, v2.16b
+
+ bic v0.8h, #0xff, LSL #8
+ bic v1.8h, #0xff, LSL #8
+ bic v2.8h, #0xff, LSL #8
+
+ // Y0
+ smull v26.4s, v16.4h, v3.h[0]
+ smlal v26.4s, v17.4h, v4.h[0]
+ smlal v26.4s, v18.4h, v5.h[0]
+ smull2 v27.4s, v16.8h, v3.h[0]
+ smlal2 v27.4s, v17.8h, v4.h[0]
+ smlal2 v27.4s, v18.8h, v5.h[0]
+ // Y1
+ smull v28.4s, v20.4h, v3.h[0]
+ smlal v28.4s, v21.4h, v4.h[0]
+ smlal v28.4s, v22.4h, v5.h[0]
+ smull2 v29.4s, v20.8h, v3.h[0]
+ smlal2 v29.4s, v21.8h, v4.h[0]
+ smlal2 v29.4s, v22.8h, v5.h[0]
+ shrn v26.4h, v26.4s, #12
+ shrn2 v26.8h, v27.4s, #12
+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
+ uqrshrn v26.8b, v26.8h, #3
+ shrn v28.4h, v28.4s, #12
+ shrn2 v28.8h, v29.4s, #12
+ add v28.8h, v28.8h, v6.8h
+ uqrshrn2 v26.16b, v28.8h, #3
+ // Y0/Y1
+
+ // U
+ // Vector subscript *2 as we loaded into S but are only using H
+ smull v24.4s, v0.4h, v3.h[2]
+ smlal v24.4s, v1.4h, v4.h[2]
+ smlal v24.4s, v2.4h, v5.h[2]
+ smull2 v25.4s, v0.8h, v3.h[2]
+ smlal2 v25.4s, v1.8h, v4.h[2]
+ smlal2 v25.4s, v2.8h, v5.h[2]
+ // V
+ smull v30.4s, v0.4h, v3.h[4]
+ smlal v30.4s, v1.4h, v4.h[4]
+ smlal v30.4s, v2.4h, v5.h[4]
+ smull2 v31.4s, v0.8h, v3.h[4]
+ smlal2 v31.4s, v1.8h, v4.h[4]
+ smlal2 v31.4s, v2.8h, v5.h[4]
+
+ cmp w9, #-16
+
+ shrn v24.4h, v24.4s, #14
+ shrn2 v24.8h, v25.4s, #14
+ sqrshrn v24.8b, v24.8h, #1
+ add v24.8b, v24.8b, v7.8b // +128
+ shrn v30.4h, v30.4s, #14
+ shrn2 v30.8h, v31.4s, #14
+ sqrshrn v30.8b, v30.8h, #1
+ add v30.8b, v30.8b, v7.8b // +128
+
+ // Here:
+ // w9 == 0 width % 16 == 0, tail done
+ // w9 > -16 1st tail done (16 pels), remainder still to go
+ // w9 == -16 shouldn't happen
+ // w9 > -32 2nd tail done
+ // w9 <= -32 shouldn't happen
+
+ b.lt 2f
+ st1 {v26.16b}, [x11], #16
+ st1 {v24.8b}, [x12], #8
+ st1 {v30.8b}, [x13], #8
+ cbz w9, 3f
+
+12:
+ sub w9, w9, #16
+
+ tbz w9, #3, 1f
+ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24
+1: tbz w9, #2, 1f
+ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3
+1: tbz w9, #1, 1f
+ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3
+1: tbz w9, #0, 13b
+ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3
+ b 13b
+
+2:
+ tbz w9, #3, 1f
+ st1 {v26.8b}, [x11], #8
+ st1 {v24.s}[0], [x12], #4
+ st1 {v30.s}[0], [x13], #4
+1: tbz w9, #2, 1f
+ st1 {v26.s}[2], [x11], #4
+ st1 {v24.h}[2], [x12], #2
+ st1 {v30.h}[2], [x13], #2
+1: tbz w9, #1, 1f
+ st1 {v26.h}[6], [x11], #2
+ st1 {v24.b}[6], [x12], #1
+ st1 {v30.b}[6], [x13], #1
+1: tbz w9, #0, 1f
+ st1 {v26.b}[14], [x11]
+ st1 {v24.b}[7], [x12]
+ st1 {v30.b}[7], [x13]
+1:
+3:
+
+// -------------------- Odd line body - Y only
+
+ subs w5, w5, #1
+ b.eq 90f
+
+ subs w9, w4, #0
add x0, x0, w14, SXTX
add x1, x1, w6, SXTX
mov x10, x0
mov x11, x1
- mov w9, w4
+ b.lt 12f
-0:
ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
+ subs w9, w9, #16
+ b.le 13f
+
+10:
+ uxtl v16.8h, v0.8b
+ uxtl v17.8h, v1.8b
+ uxtl v18.8h, v2.8b
uxtl2 v20.8h, v0.16b
uxtl2 v21.8h, v1.16b
uxtl2 v22.8h, v2.16b
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- uxtl v2.8h, v2.8b
+ // Testing shows it is faster to stack the smull/smlal ops together
+ // rather than interleave them between channels and indeed even the
+ // shift/add sections seem happier not interleaved
+
// Y0
- smull v6.4s, v0.4h, v3.h[0]
- smull2 v7.4s, v0.8h, v3.h[0]
- smlal v6.4s, v1.4h, v4.h[0]
- smlal2 v7.4s, v1.8h, v4.h[0]
- smlal v6.4s, v2.4h, v5.h[0]
- smlal2 v7.4s, v2.8h, v5.h[0]
- shrn v6.4h, v6.4s, #12
- shrn2 v6.8h, v7.4s, #12
- add v6.8h, v6.8h, v17.8h
- uqrshrn v16.8b, v6.8h, #3
+ smull v26.4s, v16.4h, v3.h[0]
+ smlal v26.4s, v17.4h, v4.h[0]
+ smlal v26.4s, v18.4h, v5.h[0]
+ smull2 v27.4s, v16.8h, v3.h[0]
+ smlal2 v27.4s, v17.8h, v4.h[0]
+ smlal2 v27.4s, v18.8h, v5.h[0]
// Y1
- smull v6.4s, v20.4h, v3.h[0]
- smull2 v7.4s, v20.8h, v3.h[0]
- smlal v6.4s, v21.4h, v4.h[0]
- smlal2 v7.4s, v21.8h, v4.h[0]
- smlal v6.4s, v22.4h, v5.h[0]
- smlal2 v7.4s, v22.8h, v5.h[0]
- shrn v6.4h, v6.4s, #12
- shrn2 v6.8h, v7.4s, #12
- add v6.8h, v6.8h, v17.8h
- uqrshrn2 v16.16b, v6.8h, #3
+ smull v28.4s, v20.4h, v3.h[0]
+ smlal v28.4s, v21.4h, v4.h[0]
+ smlal v28.4s, v22.4h, v5.h[0]
+ smull2 v29.4s, v20.8h, v3.h[0]
+ smlal2 v29.4s, v21.8h, v4.h[0]
+ smlal2 v29.4s, v22.8h, v5.h[0]
+
+ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
+
+ shrn v26.4h, v26.4s, #12
+ shrn2 v26.8h, v27.4s, #12
+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
+ uqrshrn v26.8b, v26.8h, #3
+ shrn v28.4h, v28.4s, #12
+ shrn2 v28.8h, v29.4s, #12
+ add v28.8h, v28.8h, v6.8h
+ uqrshrn2 v26.16b, v28.8h, #3
// Y0/Y1
- st1 {v16.16b}, [x11], #16
subs w9, w9, #16
- b.gt 0b
+
+ st1 {v26.16b}, [x11], #16
+
+ b.gt 10b
+
+// -------------------- Odd line tail - Y
+// If width % 16 == 0 then simply runs once with preloaded RGB
+// If other then deals with preload & then does remaining tail
+
+13:
+ // Body is simple copy of main loop body minus preload
+
+ uxtl v16.8h, v0.8b
+ uxtl v17.8h, v1.8b
+ uxtl v18.8h, v2.8b
+
+ uxtl2 v20.8h, v0.16b
+ uxtl2 v21.8h, v1.16b
+ uxtl2 v22.8h, v2.16b
+
+ // Y0
+ smull v26.4s, v16.4h, v3.h[0]
+ smlal v26.4s, v17.4h, v4.h[0]
+ smlal v26.4s, v18.4h, v5.h[0]
+ smull2 v27.4s, v16.8h, v3.h[0]
+ smlal2 v27.4s, v17.8h, v4.h[0]
+ smlal2 v27.4s, v18.8h, v5.h[0]
+ // Y1
+ smull v28.4s, v20.4h, v3.h[0]
+ smlal v28.4s, v21.4h, v4.h[0]
+ smlal v28.4s, v22.4h, v5.h[0]
+ smull2 v29.4s, v20.8h, v3.h[0]
+ smlal2 v29.4s, v21.8h, v4.h[0]
+ smlal2 v29.4s, v22.8h, v5.h[0]
+
+ cmp w9, #-16
+
+ shrn v26.4h, v26.4s, #12
+ shrn2 v26.8h, v27.4s, #12
+ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
+ uqrshrn v26.8b, v26.8h, #3
+ shrn v28.4h, v28.4s, #12
+ shrn2 v28.8h, v29.4s, #12
+ add v28.8h, v28.8h, v6.8h
+ uqrshrn2 v26.16b, v28.8h, #3
+ // Y0/Y1
+
+ // Here:
+ // w9 == 0 width % 16 == 0, tail done
+ // w9 > -16 1st tail done (16 pels), remainder still to go
+ // w9 == -16 shouldn't happen
+ // w9 > -32 2nd tail done
+ // w9 <= -32 shouldn't happen
+
+ b.lt 2f
+ st1 {v26.16b}, [x11], #16
+ cbz w9, 3f
+
+12:
+ sub w9, w9, #16
+
+ tbz w9, #3, 1f
+ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24
+1: tbz w9, #2, 1f
+ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3
+1: tbz w9, #1, 1f
+ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3
+ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3
+1: tbz w9, #0, 13b
+ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3
+ b 13b
+
+2:
+ tbz w9, #3, 1f
+ st1 {v26.8b}, [x11], #8
+1: tbz w9, #2, 1f
+ st1 {v26.s}[2], [x11], #4
+1: tbz w9, #1, 1f
+ st1 {v26.h}[6], [x11], #2
+1: tbz w9, #0, 1f
+ st1 {v26.b}[14], [x11]
+1:
+3:
+
+// ------------------- Loop to start
add x0, x0, w14, SXTX
add x1, x1, w6, SXTX
add x2, x2, w7, SXTX
add x3, x3, w7, SXTX
- subs w5, w5, #2
- b.gt 1b
-
+ subs w5, w5, #1
+ b.gt 11b
+90:
ret
endfunc
From cf020c89ac47620c4a5390d0333e9ea70fbfa7b8 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 26 Apr 2023 15:36:07 +0000
Subject: [PATCH 127/161] rgb2rgb: Use asm unconditionally
(cherry picked from commit 7c216c0804836b31c0ea093bb1dde5ab387724b1)
---
libswscale/aarch64/rgb2rgb.c | 37 ++----------------------------------
1 file changed, 2 insertions(+), 35 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index f10c4ef2de..6a0e2dcc09 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -37,46 +37,13 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
-// RGB to YUV asm fns process 16 pixels at once so ensure that the output
-// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so
-// don't test for that
-// Fall back to C if we cannot use asm
-
-static inline int chkw(const int width, const int lumStride, const int chromStride)
-{
-// const int aw = FFALIGN(width, 16);
-// return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
- return 1;
-}
-
-static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
- uint8_t *vdst, int width, int height, int lumStride,
- int chromStride, int srcStride, int32_t *rgb2yuv)
-{
- if (chkw(width, lumStride, chromStride))
- ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
- else
- ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
-}
-
-static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
- uint8_t *vdst, int width, int height, int lumStride,
- int chromStride, int srcStride, int32_t *bgr2yuv)
-{
- if (chkw(width, lumStride, chromStride))
- ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
- else
- ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
-}
-
-
av_cold void rgb2rgb_init_aarch64(void)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
interleaveBytes = ff_interleave_bytes_neon;
- ff_rgb24toyv12 = rgb24toyv12_check;
- ff_bgr24toyv12 = bgr24toyv12_check;
+ ff_rgb24toyv12 = ff_rgb24toyv12_aarch64;
+ ff_bgr24toyv12 = ff_bgr24toyv12_aarch64;
}
}
From 1895fdcaf403f403736ab52d1cb69dce7c964b66 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 27 Apr 2023 13:01:43 +0000
Subject: [PATCH 128/161] tests/swscale: Add options for width and height on
the command line
(cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271)
---
libswscale/tests/swscale.c | 84 ++++++++++++++++++++++++++------------
1 file changed, 59 insertions(+), 25 deletions(-)
diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
index 6c38041ddb..4cf41d9f64 100644
--- a/libswscale/tests/swscale.c
+++ b/libswscale/tests/swscale.c
@@ -355,56 +355,71 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4],
return 0;
}
-#define W 96
-#define H 96
-
int main(int argc, char **argv)
{
+ unsigned int W = 96;
+ unsigned int H = 96;
+ unsigned int W2;
+ unsigned int H2;
+ unsigned int S;
enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
- uint8_t *rgb_data = av_malloc(W * H * 4);
- const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
- int rgb_stride[4] = { 4 * W, 0, 0, 0 };
- uint8_t *data = av_malloc(4 * W * H);
- const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
- int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
int res = -1;
int i;
FILE *fp = NULL;
-
- if (!rgb_data || !data)
- return -1;
+ uint8_t *rgb_data;
+ uint8_t * rgb_src[4] = { NULL };
+ int rgb_stride[4] = { 0 };
+ uint8_t *data;
+ uint8_t * src[4] = { NULL };
+ int stride[4] = { 0 };
for (i = 1; i < argc; i += 2) {
+ const char * const arg2 = argv[i+1];
+
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
- fp = fopen(argv[i + 1], "r");
+ fp = fopen(arg2, "r");
if (!fp) {
- fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
+ fprintf(stderr, "could not open '%s'\n", arg2);
goto error;
}
} else if (!strcmp(argv[i], "-cpuflags")) {
unsigned flags = av_get_cpu_flags();
- int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
+ int ret = av_parse_cpu_caps(&flags, arg2);
if (ret < 0) {
- fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
+ fprintf(stderr, "invalid cpu flags %s\n", arg2);
return ret;
}
av_force_cpu_flags(flags);
} else if (!strcmp(argv[i], "-src")) {
- srcFormat = av_get_pix_fmt(argv[i + 1]);
+ srcFormat = av_get_pix_fmt(arg2);
if (srcFormat == AV_PIX_FMT_NONE) {
- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
+ fprintf(stderr, "invalid pixel format %s\n", arg2);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
- dstFormat = av_get_pix_fmt(argv[i + 1]);
+ dstFormat = av_get_pix_fmt(arg2);
if (dstFormat == AV_PIX_FMT_NONE) {
- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
+ fprintf(stderr, "invalid pixel format %s\n", arg2);
+ return -1;
+ }
+ } else if (!strcmp(argv[i], "-w")) {
+ char * p = NULL;
+ W = strtoul(arg2, &p, 0);
+ if (!W || *p) {
+ fprintf(stderr, "bad width %s\n", arg2);
+ return -1;
+ }
+ } else if (!strcmp(argv[i], "-h")) {
+ char * p = NULL;
+ H = strtoul(arg2, &p, 0);
+ if (!H || *p) {
+ fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p);
return -1;
}
} else {
@@ -414,15 +429,34 @@ bad_option:
}
}
- sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
+ S = (W + 15) & ~15;
+ rgb_data = av_mallocz(S * H * 4);
+ rgb_src[0] = rgb_data;
+ rgb_stride[0] = 4 * S;
+ data = av_mallocz(4 * S * H);
+ src[0] = data;
+ src[1] = data + S * H;
+ src[2] = data + S * H * 2;
+ src[3] = data + S * H * 3;
+ stride[0] = S;
+ stride[1] = S;
+ stride[2] = S;
+ stride[3] = S;
+ H2 = H < 96 ? 8 : H / 12;
+ W2 = W < 96 ? 8 : W / 12;
+
+ if (!rgb_data || !data)
+ return -1;
+
+ sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
av_lfg_init(&rand, 1);
for (y = 0; y < H; y++)
for (x = 0; x < W * 4; x++)
- rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
- res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
+ rgb_data[ x + y * 4 * S] = av_lfg_get(&rand);
+ res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride);
if (res < 0 || res != H) {
res = -1;
goto error;
@@ -431,10 +465,10 @@ bad_option:
av_free(rgb_data);
if(fp) {
- res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
+ res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
} else {
- selfTest(src, stride, W, H, srcFormat, dstFormat);
+ selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat);
res = 0;
}
error:
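The test now rounds each allocation stride up to a multiple of 16 (S = (W + 15) & ~15), presumably so the 16-pixels-at-a-time NEON rows never write past an odd-sized row. A tiny worked example of that rounding; the helper name is made up:

#include <assert.h>

static unsigned int align16(unsigned int w)
{
    return (w + 15) & ~15u;     /* same rounding as S in the test above */
}

int main(void)
{
    assert(align16(96)  == 96);
    assert(align16(97)  == 112);
    assert(align16(100) == 112);
    return 0;
}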
From 94e48653a6bd1b8438887b486927e87b56651455 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 26 Apr 2023 16:31:23 +0000
Subject: [PATCH 129/161] tests/swscale: Add a timing option
-t <n> Where n is the number of times to loop the scale op.
Often useful to do it 10 times or so for better resolution
(cherry picked from commit 50cd60a23a66254f911376602d07b30fcafbde96)
---
libswscale/tests/swscale.c | 32 ++++++++++++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
index 4cf41d9f64..12776ffec7 100644
--- a/libswscale/tests/swscale.c
+++ b/libswscale/tests/swscale.c
@@ -23,6 +23,7 @@
#include <string.h>
#include <inttypes.h>
#include <stdarg.h>
+#include <time.h>
#undef HAVE_AV_CONFIG_H
#include "libavutil/cpu.h"
@@ -78,6 +79,15 @@ struct Results {
uint32_t crc;
};
+static int time_rep = 0;
+
+static uint64_t utime(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
+}
+
// test by ref -> src -> dst -> out & compare out against ref
// ref & out are YV12
static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
goto end;
}
- printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
+ printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d",
desc_src->name, srcW, srcH,
desc_dst->name, dstW, dstH,
flags);
@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+ if (time_rep != 0)
+ {
+ const uint64_t now = utime();
+ uint64_t done;
+ for (i = 1; i != time_rep; ++i) {
+ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+ }
+ done = utime();
+ printf(" T=%7"PRId64"us ", done-now);
+ }
+
for (i = 0; i < 4 && dstStride[i]; i++)
crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
dstStride[i] * dstH);
@@ -419,7 +440,14 @@ int main(int argc, char **argv)
char * p = NULL;
H = strtoul(arg2, &p, 0);
if (!H || *p) {
- fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p);
+ fprintf(stderr, "bad height '%s'\n", arg2);
+ return -1;
+ }
+ } else if (!strcmp(argv[i], "-t")) {
+ char * p = NULL;
+ time_rep = (int)strtol(arg2, &p, 0);
+ if (*p) {
+ fprintf(stderr, "bad time repetitions '%s'\n", arg2);
return -1;
}
} else {
From 406806d0b9d9cb113deb0d083a28cbccabab6825 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 20 Apr 2023 13:40:36 +0000
Subject: [PATCH 130/161] swscale: RGB->YUV420 fix C template to allow odd
widths
(cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8)
---
libswscale/rgb2rgb_template.c | 44 +++++++++++++++++++++++++++++++++++
libswscale/swscale_unscaled.c | 11 ++++-----
2 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 703de90690..e711589e1e 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -679,6 +679,19 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
+ if ((width & 1) != 0) {
+ unsigned int b = src[6 * i + 0];
+ unsigned int g = src[6 * i + 1];
+ unsigned int r = src[6 * i + 2];
+
+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
+ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
+
+ udst[i] = U;
+ vdst[i] = V;
+ ydst[2 * i] = Y;
+ }
ydst += lumStride;
src += srcStride;
@@ -701,6 +714,15 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
+ if ((width & 1) != 0) {
+ unsigned int b = src[6 * i + 0];
+ unsigned int g = src[6 * i + 1];
+ unsigned int r = src[6 * i + 2];
+
+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+
+ ydst[2 * i] = Y;
+ }
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
@@ -767,6 +789,19 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
+ if ((width & 1) != 0) {
+ unsigned int b = src[8 * i + 2];
+ unsigned int g = src[8 * i + 1];
+ unsigned int r = src[8 * i + 0];
+
+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
+ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
+
+ udst[i] = U;
+ vdst[i] = V;
+ ydst[2 * i] = Y;
+ }
ydst += lumStride;
src += srcStride;
@@ -789,6 +824,15 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
+ if ((width & 1) != 0) {
+ unsigned int b = src[8 * i + 2];
+ unsigned int g = src[8 * i + 1];
+ unsigned int r = src[8 * i + 0];
+
+ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+
+ ydst[2 * i] = Y;
+ }
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 053c06adf5..52469b2e4a 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -2062,7 +2062,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
const enum AVPixelFormat dstFormat = c->dstFormat;
const int flags = c->flags;
const int dstH = c->dstH;
- const int dstW = c->dstW;
int needsDither;
needsDither = isAnyRGB(dstFormat) &&
@@ -2120,12 +2119,12 @@ void ff_get_unscaled_swscale(SwsContext *c)
/* bgr24toYV12 */
if (srcFormat == AV_PIX_FMT_BGR24 &&
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
- !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ !(flags & SWS_ACCURATE_RND))
c->convert_unscaled = bgr24ToYv12Wrapper;
/* rgb24toYV12 */
if (srcFormat == AV_PIX_FMT_RGB24 &&
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
- !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ !(flags & SWS_ACCURATE_RND))
c->convert_unscaled = rgb24ToYv12Wrapper;
/* bgrxtoYV12 */
@@ -2136,17 +2135,17 @@ void ff_get_unscaled_swscale(SwsContext *c)
/* rgbx24toYV12 */
if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
(srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
- !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ !(flags & SWS_ACCURATE_RND))
c->convert_unscaled = rgbxToYv12Wrapper;
/* xbgrtoYV12 */
if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
(srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
- !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ !(flags & SWS_ACCURATE_RND))
c->convert_unscaled = xbgrToYv12Wrapper;
/* xrgb24toYV12 */
if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
(srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
- !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+ !(flags & SWS_ACCURATE_RND))
c->convert_unscaled = xrgbToYv12Wrapper;
/* RGB/BGR -> RGB/BGR (no dither needed forms) */
From 68c6482d9473ce774e87cac2455a8c7b3e2d99b4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 4 May 2023 14:26:14 +0000
Subject: [PATCH 131/161] rtpenc: Add code to send H264 new extradata in
sidedata
Fixes an issue with the Pi V4L2 H264 encoder, which cannot create
extradata at init time.
(cherry picked from commit 4f852b4b093f841b64b4934a6f1720e98e4e0f2c)
---
libavformat/rtpenc.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index a8d296a154..f67dc2a15a 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -19,6 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "avc.h"
#include "avformat.h"
#include "mpegts.h"
#include "internal.h"
@@ -585,8 +586,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0);
break;
case AV_CODEC_ID_H264:
+ {
+ uint8_t *side_data;
+ int side_data_size = 0;
+
+ side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
+ &side_data_size);
+
+ if (side_data_size != 0) {
+ int ps_size = side_data_size;
+ uint8_t * ps_buf = NULL;
+
+ ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size);
+ av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size);
+ ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size);
+ av_free(ps_buf);
+ }
ff_rtp_send_h264_hevc(s1, pkt->data, size);
break;
+ }
case AV_CODEC_ID_H261:
ff_rtp_send_h261(s1, pkt->data, size);
break;
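For reference, a rough sketch of the producer side this pairs with: an encoder that only learns its parameter sets after startup can attach them to a packet as new-extradata side data, which the change above then converts to Annex B and sends ahead of the frame. The helper and its arguments are illustrative, and the size-argument type of the side-data API varies between FFmpeg major versions:

#include <string.h>
#include <libavcodec/avcodec.h>

static int attach_new_extradata(AVPacket *pkt,
                                const uint8_t *extradata, int extradata_size)
{
    uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
                                          extradata_size);
    if (!sd)
        return AVERROR(ENOMEM);
    memcpy(sd, extradata, extradata_size);  /* SPS/PPS produced late by the encoder */
    return 0;
}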
From 5240cc7fc3abed8af5f178c5461ca9fe11a7d5e4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 5 Jun 2023 08:34:38 +0000
Subject: [PATCH 132/161] rgb2rgb: Fix luma narrow+saturation instruction
(cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a)
---
libswscale/aarch64/rgb2rgb_neon.S | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 978ab443ea..476ca723a0 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -203,11 +203,11 @@ function ff_bgr24toyv12_aarch64, export=1
shrn v26.4h, v26.4s, #12
shrn2 v26.8h, v27.4s, #12
add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- uqrshrn v26.8b, v26.8h, #3
+ sqrshrun v26.8b, v26.8h, #3
shrn v28.4h, v28.4s, #12
shrn2 v28.8h, v29.4s, #12
add v28.8h, v28.8h, v6.8h
- uqrshrn2 v26.16b, v28.8h, #3
+ sqrshrun2 v26.16b, v28.8h, #3
// Y0/Y1
// U
@@ -282,11 +282,11 @@ function ff_bgr24toyv12_aarch64, export=1
shrn v26.4h, v26.4s, #12
shrn2 v26.8h, v27.4s, #12
add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- uqrshrn v26.8b, v26.8h, #3
+ sqrshrun v26.8b, v26.8h, #3
shrn v28.4h, v28.4s, #12
shrn2 v28.8h, v29.4s, #12
add v28.8h, v28.8h, v6.8h
- uqrshrn2 v26.16b, v28.8h, #3
+ sqrshrun2 v26.16b, v28.8h, #3
// Y0/Y1
// U
@@ -416,11 +416,11 @@ function ff_bgr24toyv12_aarch64, export=1
shrn v26.4h, v26.4s, #12
shrn2 v26.8h, v27.4s, #12
add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- uqrshrn v26.8b, v26.8h, #3
+ sqrshrun v26.8b, v26.8h, #3
shrn v28.4h, v28.4s, #12
shrn2 v28.8h, v29.4s, #12
add v28.8h, v28.8h, v6.8h
- uqrshrn2 v26.16b, v28.8h, #3
+ sqrshrun2 v26.16b, v28.8h, #3
// Y0/Y1
subs w9, w9, #16
@@ -464,11 +464,11 @@ function ff_bgr24toyv12_aarch64, export=1
shrn v26.4h, v26.4s, #12
shrn2 v26.8h, v27.4s, #12
add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- uqrshrn v26.8b, v26.8h, #3
+ sqrshrun v26.8b, v26.8h, #3
shrn v28.4h, v28.4s, #12
shrn2 v28.8h, v29.4s, #12
add v28.8h, v28.8h, v6.8h
- uqrshrn2 v26.16b, v28.8h, #3
+ sqrshrun2 v26.16b, v28.8h, #3
// Y0/Y1
// Here:
From 9474d9d227f2af488d5d2bd614c5c707479ca3c3 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sun, 4 Jun 2023 13:37:59 +0000
Subject: [PATCH 133/161] v4l2_m2m_dec: Tweak pending count to use dts &
reorder size
(cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a)
---
libavcodec/v4l2_m2m.h | 1 +
libavcodec/v4l2_m2m_dec.c | 53 +++++++++++++++++++++++++++++++--------
2 files changed, 43 insertions(+), 11 deletions(-)
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
index ded1478a49..a506e69d67 100644
--- a/libavcodec/v4l2_m2m.h
+++ b/libavcodec/v4l2_m2m.h
@@ -115,6 +115,7 @@ typedef struct V4L2m2mContext {
/* req pkt */
int req_pkt;
+ int reorder_size;
/* Ext data sent */
int extdata_sent;
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index d124c7b1fc..13af62e819 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -121,13 +121,18 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len)
}
#endif
-static int64_t pts_stats_guess(const pts_stats_t * const stats)
+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
+{
+ return stats->last_interval;
+}
+
+static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess)
{
if (stats->last_count <= 1)
return stats->last_pts;
if (stats->last_pts == AV_NOPTS_VALUE ||
- stats->last_interval == 0 ||
- stats->last_count >= STATS_LAST_COUNT_MAX)
+ fail_bad_guess && (stats->last_interval == 0 ||
+ stats->last_count >= STATS_LAST_COUNT_MAX))
return AV_NOPTS_VALUE;
return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
}
@@ -345,7 +350,7 @@ set_best_effort_pts(AVCodecContext *const avctx,
{
pts_stats_add(ps, frame->pts);
- frame->best_effort_timestamp = pts_stats_guess(ps);
+ frame->best_effort_timestamp = pts_stats_guess(ps, 1);
// If we can't guess from just PTS - try DTS
if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
frame->best_effort_timestamp = frame->pkt_dts;
@@ -380,15 +385,25 @@ xlat_init(xlat_track_t * const x)
}
static int
-xlat_pending(const xlat_track_t * const x)
+xlat_pending(const V4L2m2mContext * const s)
{
+ const xlat_track_t *const x = &s->xlat;
unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
int i;
- const int64_t now = x->last_pts;
+ const int64_t now = pts_stats_guess(&s->pts_stat, 0);
+ int64_t first_dts = AV_NOPTS_VALUE;
+ int no_dts_count = 0;
+ unsigned int interval = pts_stats_interval(&s->pts_stat);
for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
const V4L2m2mTrackEl * const t = x->track_els + n;
+ if (first_dts == AV_NOPTS_VALUE)
+ if (t->dts == AV_NOPTS_VALUE)
+ ++no_dts_count;
+ else
+ first_dts = t->dts;
+
// Discard only set on never-set or flushed entries
// So if we get here we've never successfully decoded a frame so allow
// more frames into the buffer before stalling
@@ -408,6 +423,18 @@ xlat_pending(const xlat_track_t * const x)
break;
}
+ if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) {
+ const int iframes = (first_dts - now) / (int)interval;
+ const int t = iframes - s->reorder_size + no_dts_count;
+
+// av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n",
+// x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count);
+
+ if (iframes > 0 && iframes < 64 && t < i) {
+ return t;
+ }
+ }
+
return i;
}
@@ -585,12 +612,12 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
{
V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
- int src_rv = NQ_OK;
+ int src_rv = -1;
int dst_rv = 1; // Non-zero (done), non-negative (error) number
unsigned int i = 0;
do {
- const int pending = xlat_pending(&s->xlat);
+ const int pending = xlat_pending(s);
const int prefer_dq = (pending > 4);
const int last_src_rv = src_rv;
@@ -966,8 +993,10 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx)
}
static void
-parse_extradata(AVCodecContext *avctx)
+parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
{
+ s->reorder_size = 0;
+
if (!avctx->extradata || !avctx->extradata_size)
return;
@@ -996,6 +1025,7 @@ parse_extradata(AVCodecContext *avctx)
avctx->profile = ff_h264_get_profile(sps);
avctx->level = sps->level_idc;
}
+ s->reorder_size = sps->num_reorder_frames;
}
ff_h264_ps_uninit(&ps);
break;
@@ -1025,6 +1055,7 @@ parse_extradata(AVCodecContext *avctx)
if (sps) {
avctx->profile = sps->ptl.general_ptl.profile_idc;
avctx->level = sps->ptl.general_ptl.level_idc;
+ s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering;
}
}
ff_hevc_ps_uninit(&ps);
@@ -1057,12 +1088,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
avctx->ticks_per_frame = 2;
}
- parse_extradata(avctx);
-
ret = ff_v4l2_m2m_create_context(priv, &s);
if (ret < 0)
return ret;
+ parse_extradata(avctx, s);
+
xlat_init(&s->xlat);
pts_stats_init(&s->pts_stat, avctx, "decoder");
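To make the new pending estimate concrete, a small sketch with invented numbers (none of these come from a real stream). Walking the track backwards gives the dts of the newest tracked packet; combined with the guessed pts of the next frame due out, the observed pts interval and the SPS reorder depth, the count becomes iframes - reorder_size + no_dts_count, as in xlat_pending() above:

#include <stdio.h>

int main(void)
{
    const long long now       = 1000;   /* pts_stats_guess(): next frame due out */
    const long long first_dts = 1200;   /* dts of the newest tracked packet      */
    const int interval        = 40;     /* observed pts interval, e.g. 25 fps    */
    const int reorder_size    = 2;      /* num_reorder_frames from the SPS       */
    const int no_dts_count    = 0;      /* tracked packets seen with no dts      */

    const int iframes = (int)((first_dts - now) / interval);    /* 5 queued      */
    const int pending = iframes - reorder_size + no_dts_count;  /* 3 decodable   */

    printf("pending ~= %d\n", pending);
    return 0;
}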
From 2145b9c9177f0fe9569ce39e2d4eb629caf8bd47 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 7 Jun 2023 11:14:52 +0000
Subject: [PATCH 134/161] v4l2_m2m: Add encode size check
Previously an out-of-bounds size would only fail whilst trying to copy the
buffer, with an unhelpful message. This produces a better error at init
time.
(cherry picked from commit 0b61c4617e26f043d28d44c8767f7b9fd4882f97)
---
libavcodec/v4l2_m2m.c | 43 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index f802687b1b..28d9ed4988 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -109,6 +109,44 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
return AVERROR(EINVAL);
}
+static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
+{
+ struct v4l2_format fmt = {.type = s->output.type};
+ int rv;
+ uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt);
+ unsigned int w;
+ unsigned int h;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
+ fmt.fmt.pix_mp.pixelformat = pixfmt;
+ fmt.fmt.pix_mp.width = avctx->width;
+ fmt.fmt.pix_mp.height = avctx->height;
+ }
+ else {
+ fmt.fmt.pix.pixelformat = pixfmt;
+ fmt.fmt.pix.width = avctx->width;
+ fmt.fmt.pix.height = avctx->height;
+ }
+
+ rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt);
+
+ if (rv != 0) {
+ rv = AVERROR(errno);
+ av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv));
+ return rv;
+ }
+
+ w = ff_v4l2_get_format_width(&fmt);
+ h = ff_v4l2_get_format_height(&fmt);
+
+ if (w < avctx->width || h < avctx->height) {
+ av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, avctx->width, avctx->height, w, h);
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
static int v4l2_probe_driver(V4L2m2mContext *s)
{
void *log_ctx = s->avctx;
@@ -128,6 +166,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s)
goto done;
}
+ // If being given frames (encode) check that V4L2 can cope with the size
+ if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO &&
+ (ret = check_size(s->avctx, s)) != 0)
+ goto done;
+
ret = ff_v4l2_context_get_format(&s->capture, 1);
if (ret) {
av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
From 805985ea191c98885a74dbf994b1ca11551cd81e Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 9 Jun 2023 10:28:12 +0000
Subject: [PATCH 135/161] vf_bwdif: Add attributes to ask for vectorization
(cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7)
---
libavfilter/vf_bwdif.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 65c617ebb3..09e68523bb 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -74,10 +74,10 @@ typedef struct ThreadData {
int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \
int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \
int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
- \
+ {/*\
if (!diff) { \
dst[0] = d; \
- } else {
+ } else {*/
#define SPAT_CHECK() \
int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \
@@ -89,15 +89,16 @@ typedef struct ThreadData {
diff = FFMAX3(diff, min, -max);
#define FILTER_LINE() \
+ int i1, i2; \
SPAT_CHECK() \
- if (FFABS(c - e) > temporal_diff0) { \
- interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \
+ /*if (FFABS(c - e) > temporal_diff0)*/ { \
+ i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \
- coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \
+ coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \
+ coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
- } else { \
- interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
- }
+ } /*else*/ { \
+ i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+ }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\
#define FILTER_EDGE() \
if (spat) { \
@@ -111,7 +112,7 @@ typedef struct ThreadData {
else if (interpol < d - diff) \
interpol = d - diff; \
\
- dst[0] = av_clip(interpol, 0, clip_max); \
+ dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \
} \
\
dst++; \
@@ -122,7 +123,7 @@ typedef struct ThreadData {
next2++; \
}
-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max)
{
uint8_t *dst = dst1;
@@ -132,7 +133,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
FILTER_INTRA()
}
-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max)
@@ -150,7 +151,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
+static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat)
{
@@ -167,7 +168,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
-static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs,
+static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max)
{
uint16_t *dst = dst1;
@@ -177,7 +178,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre
FILTER_INTRA()
}
-static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max)
@@ -195,7 +196,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1
FILTER2()
}
-static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat)
{
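The pattern applied above, reduced to a toy function: a GCC-specific optimize attribute asking for auto-vectorisation of just this function, plus restrict-qualified pointers so the compiler may assume the buffers do not alias. The saxpy function is a made-up stand-in, not part of the filter:

static void __attribute__((optimize("tree-vectorize")))
saxpy(float *restrict dst, const float *restrict src, float a, int n)
{
    /* With no aliasing between dst and src the compiler is free to use
     * SIMD loads/stores for this loop. */
    for (int i = 0; i < n; i++)
        dst[i] += a * src[i];
}

int main(void)
{
    float acc[8] = { 0 }, in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    saxpy(acc, in, 0.5f, 8);
    return (int)acc[7];                 /* 4 */
}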
From f4012f09da1c57a0aa5db01f9096992d0c385f7b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 13 Jun 2023 13:07:55 +0000
Subject: [PATCH 136/161] v4l2m2m_dec: Fix h264 reorder size if no sps
initially
(cherry picked from commit 8832f7924bf47cbca0de251d7b406917f958ebf4)
---
libavcodec/v4l2_m2m_dec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 13af62e819..11c83b2d66 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -1024,8 +1024,8 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
if (sps) {
avctx->profile = ff_h264_get_profile(sps);
avctx->level = sps->level_idc;
+ s->reorder_size = sps->num_reorder_frames;
}
- s->reorder_size = sps->num_reorder_frames;
}
ff_h264_ps_uninit(&ps);
break;
From fd31937e4befa2368d48e234d66fb962246bf777 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 30 Jun 2023 18:03:29 +0000
Subject: [PATCH 137/161] sand_fns: Add missing uxtw for neon stride
---
libavutil/aarch64/rpi_sand_neon.S | 2 ++
1 file changed, 2 insertions(+)
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
index 2f07d9674c..19411cf3f1 100644
--- a/libavutil/aarch64/rpi_sand_neon.S
+++ b/libavutil/aarch64/rpi_sand_neon.S
@@ -469,6 +469,7 @@ endfunc
function ff_rpi_sand30_lines_to_planar_y16, export=1
lsl w4, w4, #7
sub w4, w4, #64
+ uxtw x4, w4
sub w1, w1, w7, lsl #1
uxtw x6, w6
add x8, x2, x6, lsl #7
@@ -634,6 +635,7 @@ endfunc
function ff_rpi_sand30_lines_to_planar_y8, export=1
lsl w4, w4, #7
sub w4, w4, #64
+ uxtw x4, w4
sub w1, w1, w7
uxtw x6, w6
add x8, x2, x6, lsl #7
From f6a19a36ffe0dbe0a6e2e450dafec6711db19057 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 30 Jun 2023 18:12:16 +0000
Subject: [PATCH 138/161] sand_fns: Rework aarch64 neon
sand30_lines_to_planar_c16
The previous version could overflow its write buffer on small buffers,
which sometimes crashed WPP_F_ericsson_MAIN10_2.
This version is probably faster too.
---
libavutil/aarch64/rpi_sand_neon.S | 329 ++++++++++++++----------------
1 file changed, 151 insertions(+), 178 deletions(-)
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
index 19411cf3f1..af7e2a88c4 100644
--- a/libavutil/aarch64/rpi_sand_neon.S
+++ b/libavutil/aarch64/rpi_sand_neon.S
@@ -248,199 +248,172 @@ incomplete_block_loop_end_c8:
ret
endfunc
-//void ff_rpi_sand30_lines_to_planar_c16(
-// uint8_t * dst_u, // [x0]
-// unsigned int dst_stride_u, // [w1] == _w*2
-// uint8_t * dst_v, // [x2]
-// unsigned int dst_stride_v, // [w3] == _w*2
-// const uint8_t * src, // [x4]
-// unsigned int stride1, // [w5] == 128
-// unsigned int stride2, // [w6]
-// unsigned int _x, // [w7] == 0
-// unsigned int y, // [sp, #0] == 0
-// unsigned int _w, // [sp, #8] -> w3
-// unsigned int h); // [sp, #16] -> w7
-
-.macro rpi_sand30_lines_to_planar_c16_block_half
- ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
-
- xtn v4.4h, v0.4s
- ushr v0.4s, v0.4s, #10
- xtn v5.4h, v0.4s
- ushr v0.4s, v0.4s, #10
- xtn v6.4h, v0.4s
- xtn2 v4.8h, v1.4s
- ushr v1.4s, v1.4s, #10
- xtn2 v5.8h, v1.4s
- ushr v1.4s, v1.4s, #10
- xtn2 v6.8h, v1.4s
- and v4.16b, v4.16b, v16.16b
- and v5.16b, v5.16b, v16.16b
- and v6.16b, v6.16b, v16.16b
- st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
-
- xtn v4.4h, v2.4s
- ushr v2.4s, v2.4s, #10
- xtn v5.4h, v2.4s
- ushr v2.4s, v2.4s, #10
- xtn v6.4h, v2.4s
- xtn2 v4.8h, v3.4s
- ushr v3.4s, v3.4s, #10
- xtn2 v5.8h, v3.4s
- ushr v3.4s, v3.4s, #10
- xtn2 v6.8h, v3.4s
- and v4.16b, v4.16b, v16.16b
- and v5.16b, v5.16b, v16.16b
- and v6.16b, v6.16b, v16.16b
- st3 { v4.8h, v5.8h, v6.8h }, [sp]
- sub sp, sp, #48
-.endm
-
-function ff_rpi_sand30_lines_to_planar_c16, export=1
- stp x19, x20, [sp, #-48]!
- stp x21, x22, [sp, #16]
- stp x23, x24, [sp, #32]
-
- ldr w3, [sp, #48+8] // w3 = width
- ldr w7, [sp, #48+16] // w7 = height
-
- // reserve space on the stack for intermediate results
- sub sp, sp, #256
+// Unzip chroma
+//
+// On entry:
+// a0 = V0, U2, ...
+// a1 = U0, V1, ...
+// a2 = U1, V2, ...
+// b0 = V8, U10, ...
+// b1 = U8, V9, ...
+// b2 = U9, V10, ...
+//
+// On exit:
+// d0 = U0, U3, ...
+// ...
+// a0 = V0, V3, ..
+// ...
+//
+// Reg order for USAND is a1, a0, a2 (i.e. swap natural order of 1st 2 dest regs)
- // number of 128byte blocks per row, w8 = width / 48
- mov w9, #48
- udiv w8, w3, w9
+.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2
+ uzp1 \d0\().8h, \a1\().8h, \b1\().8h
+ uzp1 \d1\().8h, \a2\().8h, \b2\().8h
+ uzp2 \d2\().8h, \a0\().8h, \b0\().8h
- // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
- mul w9, w8, w9
- sub w9, w3, w9
+ uzp1 \a0\().8h, \a0\().8h, \b0\().8h
+ uzp2 \a1\().8h, \a1\().8h, \b1\().8h
+ uzp2 \a2\().8h, \a2\().8h, \b2\().8h
+.endm
- // row offset, the beginning of the next row to process
- eor w10, w10, w10
+// SAND30 -> 10bit
+.macro USAND10 d0, d1, d2, a0, a1
+ shrn \d2\().4h, \a0\().4s, #14
+ xtn \d0\().4h, \a0\().4s
+ shrn \d1\().4h, \a0\().4s, #10
- // offset to the beginning of the next block, w11 = stride2 * 128 - 128
- lsl w11, w6, #7
- sub w11, w11, #128
+ shrn2 \d2\().8h, \a1\().4s, #14
+ xtn2 \d0\().8h, \a1\().4s
+ shrn2 \d1\().8h, \a1\().4s, #10
- // decrease the height by one and in case of remaining pixels increase the block count by one
- sub w7, w7, #1
- cmp w9, #0
- cset w19, ne // w19 == 1 iff reamining pixels != 0
- add w8, w8, w19
+ ushr \d2\().8h, \d2\().8h, #6
+ bic \d0\().8h, #0xfc, lsl #8
+ bic \d1\().8h, #0xfc, lsl #8
+.endm
- // bytes we have to move dst back by at the end of every row
- mov w21, #48*2
- mul w21, w21, w8
- sub w21, w1, w21
+// void ff_rpi_sand30_lines_to_planar_c16(
+// uint8_t * dst_u, // [x0]
+// unsigned int dst_stride_u, // [w1]
+// uint8_t * dst_v, // [x2]
+// unsigned int dst_stride_v, // [w3]
+// const uint8_t * src, // [x4]
+// unsigned int stride1, // [w5] 128
+// unsigned int stride2, // [w6]
+// unsigned int _x, // [w7] 0
+// unsigned int y, // [sp, #0]
+// unsigned int _w, // [sp, #8] w9
+// unsigned int h); // [sp, #16] w10
- mov w20, #0 // w20 = flag, last row processed
+function ff_rpi_sand30_lines_to_planar_c16, export=1
+ ldr w7, [sp, #0] // y
+ ldr w8, [sp, #8] // _w
+ ldr w10, [sp, #16] // h
+ lsl w6, w6, #7 // Fixup stride2
+ sub w6, w6, #64
+ uxtw x6, w6
+ sub w1, w1, w8, LSL #1 // Fixup chroma strides
+ sub w3, w3, w8, LSL #1
+ lsl w7, w7, #7 // Add y to src
+ add x4, x4, w7, UXTW
+10:
+ mov w13, #0
+ mov x5, x4
+ mov w9, w8
+1:
+ ld1 {v0.4s-v3.4s}, [x5], #64
+ ld1 {v4.4s-v7.4s}, [x5], x6
- mov x12, #0x03ff03ff03ff03ff
- dup v16.2d, x12
+ USAND10 v17, v16, v18, v0, v1
+ USAND10 v20, v19, v21, v2, v3
+ UZPH_C v0, v1, v2, v16, v17, v18, v19, v20, v21
+ USAND10 v23, v22, v24, v4, v5
+ USAND10 v26, v25, v27, v6, v7
+ UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27
- // iterate through rows, row counter = w12 = 0
- eor w12, w12, w12
-row_loop_c16:
- cmp w12, w7
- bge row_loop_c16_fin
+ subs w9, w9, #48
+ blt 2f
- // address of row data = src + row_offset
- mov x13, x4
- add x13, x13, x10
+ st3 {v0.8h-v2.8h}, [x0], #48
+ st3 {v4.8h-v6.8h}, [x0], #48
+ st3 {v16.8h-v18.8h}, [x2], #48
+ st3 {v22.8h-v24.8h}, [x2], #48
- eor w14, w14, w14
-block_loop_c16:
- cmp w14, w8
- bge block_loop_c16_fin
-
- rpi_sand30_lines_to_planar_c16_block_half
-
- ld2 { v0.8h, v1.8h }, [sp], #32
- ld2 { v2.8h, v3.8h }, [sp], #32
- ld2 { v4.8h, v5.8h }, [sp]
- sub sp, sp, #64
-
- st1 { v0.8h }, [x0], #16
- st1 { v2.8h }, [x0], #16
- st1 { v4.8h }, [x0], #16
- st1 { v1.8h }, [x2], #16
- st1 { v3.8h }, [x2], #16
- st1 { v5.8h }, [x2], #16
-
- rpi_sand30_lines_to_planar_c16_block_half
-
- ld2 { v0.8h, v1.8h }, [sp], #32
- ld2 { v2.8h, v3.8h }, [sp], #32
- ld2 { v4.8h, v5.8h }, [sp]
- sub sp, sp, #64
-
- st1 { v0.8h }, [x0], #16
- st1 { v2.8h }, [x0], #16
- st1 { v4.8h }, [x0], #16
- st1 { v1.8h }, [x2], #16
- st1 { v3.8h }, [x2], #16
- st1 { v5.8h }, [x2], #16
-
- add x13, x13, x11 // offset to next block
- add w14, w14, #1
- b block_loop_c16
-block_loop_c16_fin:
+ bne 1b
+11:
+ subs w10, w10, #1
+ add x4, x4, #128
+ add x0, x0, w1, UXTW
+ add x2, x2, w3, UXTW
+ bne 10b
+99:
+ ret
- add w10, w10, #128
- add w12, w12, #1
- add x0, x0, w21, sxtw // move dst pointers back by x21
- add x2, x2, w21, sxtw
- b row_loop_c16
-row_loop_c16_fin:
-
- cmp w20, #1
- beq row_loop_c16_fin2
- mov w20, #1
- sub w8, w8, w19 // decrease block count by w19
- add w7, w7, #1 // increase height
- b row_loop_c16
-
-row_loop_c16_fin2:
- sub x0, x0, w21, sxtw // readd x21 in case of the last row
- sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
-
- // last incomplete block to be finished
- // read operations are fine, stride2 is more than large enough even if rem_pix is 0
- rpi_sand30_lines_to_planar_c16_block_half
- ld2 { v0.8h, v1.8h }, [sp], #32
- ld2 { v2.8h, v3.8h }, [sp], #32
- ld2 { v4.8h, v5.8h }, [sp], #32
- rpi_sand30_lines_to_planar_c16_block_half
- ld2 { v0.8h, v1.8h }, [sp], #32
- ld2 { v2.8h, v3.8h }, [sp], #32
- ld2 { v4.8h, v5.8h }, [sp]
- sub sp, sp, #160
-
- mov x4, sp
- eor w20, w20, w20
-rem_pix_c16_loop:
- cmp w20, w9
- bge rem_pix_c16_fin
-
- ldr w22, [x4], #4
- str w22, [x0], #2
- lsr w22, w22, #16
- str w22, [x2], #2
-
- add w20, w20, #1
- b rem_pix_c16_loop
-rem_pix_c16_fin:
-
- add sp, sp, #256
-
- ldp x23, x24, [sp, #32]
- ldp x21, x22, [sp, #16]
- ldp x19, x20, [sp], #48
- ret
+// Partial final write
+2:
+ cmp w9, #24-48
+ blt 1f
+ st3 {v0.8h - v2.8h}, [x0], #48
+ st3 {v16.8h - v18.8h}, [x2], #48
+ beq 11b
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+ sub w9, w9, #24
+ mov v2.16b, v6.16b
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+1:
+ cmp w9, #12-48
+ blt 1f
+ st3 {v0.4h - v2.4h}, [x0], #24
+ st3 {v16.4h - v18.4h}, [x2], #24
+ beq 11b
+ mov v0.2d[0], v0.2d[1]
+ sub w9, w9, #12
+ mov v1.2d[0], v1.2d[1]
+ mov v2.2d[0], v2.2d[1]
+ mov v16.2d[0], v16.2d[1]
+ mov v17.2d[0], v17.2d[1]
+ mov v18.2d[0], v18.2d[1]
+1:
+ cmp w9, #6-48
+ blt 1f
+ st3 {v0.h - v2.h}[0], [x0], #6
+ st3 {v0.h - v2.h}[1], [x0], #6
+ st3 {v16.h - v18.h}[0], [x2], #6
+ st3 {v16.h - v18.h}[1], [x2], #6
+ beq 11b
+ mov v0.s[0], v0.s[1]
+ sub w9, w9, #6
+ mov v1.s[0], v1.s[1]
+ mov v2.s[0], v2.s[1]
+ mov v16.s[0], v16.s[1]
+ mov v17.s[0], v17.s[1]
+ mov v18.s[0], v18.s[1]
+1:
+ cmp w9, #3-48
+ blt 1f
+ st3 {v0.h - v2.h}[0], [x0], #6
+ st3 {v16.h - v18.h}[0], [x2], #6
+ beq 11b
+ mov v0.h[0], v0.h[1]
+ sub w9, w9, #3
+ mov v1.h[0], v1.h[1]
+ mov v16.h[0], v16.h[1]
+ mov v17.h[0], v17.h[1]
+1:
+ cmp w9, #2-48
+ blt 1f
+ st2 {v0.h - v1.h}[0], [x0], #4
+ st2 {v16.h - v17.h}[0], [x2], #4
+ b 11b
+1:
+ st1 {v0.h}[0], [x0], #2
+ st1 {v16.h}[0], [x2], #2
+ b 11b
endfunc
-
//void ff_rpi_sand30_lines_to_planar_p010(
// uint8_t * dest,
// unsigned int dst_stride,
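For reference, the SAND30 packing these routines unpack stores three 10-bit samples in each
little-endian 32-bit word (the top two bits are unused); in the chroma plane the samples
alternate U/V, which is what the UZPH_C step above separates. A minimal scalar sketch of the
per-word unpack, matching what the USAND10 macro extracts and what pack30() in the checkasm
test added later in this series assumes (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Unpack one SAND30 word into three 10-bit samples:
     * bits 0..9 -> s[0], bits 10..19 -> s[1], bits 20..29 -> s[2]. */
    static inline void sand30_unpack_word(uint32_t w, uint16_t s[3])
    {
        s[0] = (uint16_t)(w         & 0x3ff);
        s[1] = (uint16_t)((w >> 10) & 0x3ff);
        s[2] = (uint16_t)((w >> 20) & 0x3ff);
    }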
From 68356e594ff32e18e419a476889d958dc24af4b2 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 30 Jun 2023 19:41:06 +0000
Subject: [PATCH 139/161] sand_fns: Minor optimisations to aarch64 neon
---
libavutil/aarch64/rpi_sand_neon.S | 140 ++++++------------------------
1 file changed, 28 insertions(+), 112 deletions(-)
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
index af7e2a88c4..11658de0c8 100644
--- a/libavutil/aarch64/rpi_sand_neon.S
+++ b/libavutil/aarch64/rpi_sand_neon.S
@@ -279,18 +279,37 @@ endfunc
// SAND30 -> 10bit
.macro USAND10 d0, d1, d2, a0, a1
shrn \d2\().4h, \a0\().4s, #14
- xtn \d0\().4h, \a0\().4s
shrn \d1\().4h, \a0\().4s, #10
shrn2 \d2\().8h, \a1\().4s, #14
- xtn2 \d0\().8h, \a1\().4s
shrn2 \d1\().8h, \a1\().4s, #10
+ uzp1 \d0\().8h, \a0\().8h, \a1\().8h
ushr \d2\().8h, \d2\().8h, #6
bic \d0\().8h, #0xfc, lsl #8
bic \d1\().8h, #0xfc, lsl #8
.endm
+// SAND30 -> 8bit
+.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2
+ shrn \d1\().4h, \a0\().4s, #12
+ shrn2 \d1\().8h, \a1\().4s, #12
+ uzp1 \d0\().8h, \a0\().8h, \a1\().8h
+ uzp2 \d2\().8h, \a0\().8h, \a1\().8h
+
+ shrn \t1\().4h, \a2\().4s, #12
+ shrn2 \t1\().8h, \a3\().4s, #12
+ uzp1 \t0\().8h, \a2\().8h, \a3\().8h
+ uzp2 \t2\().8h, \a2\().8h, \a3\().8h
+
+ shrn \d0\().8b, \d0\().8h, #2
+ shrn2 \d0\().16b, \t0\().8h, #2
+ shrn \d2\().8b, \d2\().8h, #6
+ shrn2 \d2\().16b, \t2\().8h, #6
+ uzp1 \d1\().16b, \d1\().16b, \t1\().16b
+.endm
+
+
// void ff_rpi_sand30_lines_to_planar_c16(
// uint8_t * dst_u, // [x0]
// unsigned int dst_stride_u, // [w1]
@@ -322,6 +341,7 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
1:
ld1 {v0.4s-v3.4s}, [x5], #64
ld1 {v4.4s-v7.4s}, [x5], x6
+ subs w9, w9, #48
USAND10 v17, v16, v18, v0, v1
USAND10 v20, v19, v21, v2, v3
@@ -330,7 +350,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
USAND10 v26, v25, v27, v6, v7
UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27
- subs w9, w9, #48
blt 2f
st3 {v0.8h-v2.8h}, [x0], #48
@@ -457,61 +476,10 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1
subs w5, w5, #96
- // v0, v1
-
- shrn v18.4h, v0.4s, #14
- xtn v16.4h, v0.4s
- shrn v17.4h, v0.4s, #10
-
- shrn2 v18.8h, v1.4s, #14
- xtn2 v16.8h, v1.4s
- shrn2 v17.8h, v1.4s, #10
-
- ushr v18.8h, v18.8h, #6
- bic v16.8h, #0xfc, lsl #8
- bic v17.8h, #0xfc, lsl #8
-
- // v2, v3
-
- shrn v21.4h, v2.4s, #14
- xtn v19.4h, v2.4s
- shrn v20.4h, v2.4s, #10
-
- shrn2 v21.8h, v3.4s, #14
- xtn2 v19.8h, v3.4s
- shrn2 v20.8h, v3.4s, #10
-
- ushr v21.8h, v21.8h, #6
- bic v19.8h, #0xfc, lsl #8
- bic v20.8h, #0xfc, lsl #8
-
- // v4, v5
-
- shrn v24.4h, v4.4s, #14
- xtn v22.4h, v4.4s
- shrn v23.4h, v4.4s, #10
-
- shrn2 v24.8h, v5.4s, #14
- xtn2 v22.8h, v5.4s
- shrn2 v23.8h, v5.4s, #10
-
- ushr v24.8h, v24.8h, #6
- bic v22.8h, #0xfc, lsl #8
- bic v23.8h, #0xfc, lsl #8
-
- // v6, v7
-
- shrn v27.4h, v6.4s, #14
- xtn v25.4h, v6.4s
- shrn v26.4h, v6.4s, #10
-
- shrn2 v27.8h, v7.4s, #14
- xtn2 v25.8h, v7.4s
- shrn2 v26.8h, v7.4s, #10
-
- ushr v27.8h, v27.8h, #6
- bic v25.8h, #0xfc, lsl #8
- bic v26.8h, #0xfc, lsl #8
+ USAND10 v16, v17, v18, v0, v1
+ USAND10 v19, v20, v21, v2, v3
+ USAND10 v22, v23, v24, v4, v5
+ USAND10 v25, v26, v27, v6, v7
blt 2f
@@ -624,60 +592,8 @@ function ff_rpi_sand30_lines_to_planar_y8, export=1
subs w5, w5, #96
// v0, v1
-
- shrn v18.4h, v0.4s, #16
- xtn v16.4h, v0.4s
- shrn v17.4h, v0.4s, #12
-
- shrn2 v18.8h, v1.4s, #16
- xtn2 v16.8h, v1.4s
- shrn2 v17.8h, v1.4s, #12
-
- shrn v18.8b, v18.8h, #6
- shrn v16.8b, v16.8h, #2
- xtn v17.8b, v17.8h
-
- // v2, v3
-
- shrn v21.4h, v2.4s, #16
- xtn v19.4h, v2.4s
- shrn v20.4h, v2.4s, #12
-
- shrn2 v21.8h, v3.4s, #16
- xtn2 v19.8h, v3.4s
- shrn2 v20.8h, v3.4s, #12
-
- shrn2 v18.16b, v21.8h, #6
- shrn2 v16.16b, v19.8h, #2
- xtn2 v17.16b, v20.8h
-
- // v4, v5
-
- shrn v24.4h, v4.4s, #16
- xtn v22.4h, v4.4s
- shrn v23.4h, v4.4s, #12
-
- shrn2 v24.8h, v5.4s, #16
- xtn2 v22.8h, v5.4s
- shrn2 v23.8h, v5.4s, #12
-
- shrn v21.8b, v24.8h, #6
- shrn v19.8b, v22.8h, #2
- xtn v20.8b, v23.8h
-
- // v6, v7
-
- shrn v27.4h, v6.4s, #16
- xtn v25.4h, v6.4s
- shrn v26.4h, v6.4s, #12
-
- shrn2 v27.8h, v7.4s, #16
- xtn2 v25.8h, v7.4s
- shrn2 v26.8h, v7.4s, #12
-
- shrn2 v21.16b, v27.8h, #6
- shrn2 v19.16b, v25.8h, #2
- xtn2 v20.16b, v26.8h
+ USAND8 v16, v17, v18, v0, v1, v2, v3, v22, v23, v24
+ USAND8 v19, v20, v21, v4, v5, v6, v7, v22, v23, v24
blt 2f
From 3abb0dcc453aba0a069bc1a8f26ba77913c5ef2b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sat, 1 Jul 2023 18:43:32 +0000
Subject: [PATCH 140/161] sand_fns: Add test for neon to sand30 fns so they can
be tested by checkasm
---
libavutil/rpi_sand_fns.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
index b6071e2928..0626bb06cb 100644
--- a/libavutil/rpi_sand_fns.c
+++ b/libavutil/rpi_sand_fns.c
@@ -35,10 +35,12 @@ Authors: John Cox
#include "frame.h"
#if ARCH_ARM && HAVE_NEON
-#include "arm/rpi_sand_neon.h"
+#include "libavutil/arm/cpu.h"
+#include "libavutil/arm/rpi_sand_neon.h"
#define HAVE_SAND_ASM 1
#elif ARCH_AARCH64 && HAVE_NEON
-#include "aarch64/rpi_sand_neon.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/aarch64/rpi_sand_neon.h"
#define HAVE_SAND_ASM 1
#else
#define HAVE_SAND_ASM 0
@@ -97,7 +99,7 @@ void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
#if HAVE_SAND_ASM
- if (_x == 0) {
+ if (_x == 0 && have_neon(av_get_cpu_flags())) {
ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
return;
}
@@ -163,7 +165,7 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_
const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
#if HAVE_SAND_ASM
- if (_x == 0) {
+ if (_x == 0 && have_neon(av_get_cpu_flags())) {
ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
src, stride1, stride2, _x, y, _w, h);
return;
From fb72aa34ec2c42fc595bb1a6c32b599da870fa2b Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sat, 1 Jul 2023 18:43:57 +0000
Subject: [PATCH 141/161] checkasm: Add tests for rpi_sand sand30 fns
Something of a kludge for function selection as, at the moment, the
rpi_sand fns don't have a jump table that we could use for selection.
---
tests/checkasm/Makefile | 3 +-
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/rpi_sand.c | 118 ++++++++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 125 insertions(+), 1 deletion(-)
create mode 100644 tests/checkasm/rpi_sand.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..66291baf33 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -59,8 +59,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS)
AVUTILOBJS += av_tx.o
AVUTILOBJS += fixed_dsp.o
AVUTILOBJS += float_dsp.o
+AVUTILOBJS-$(CONFIG_SAND) += rpi_sand.o
-CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS)
+CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes)
CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o
CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..57e0091b80 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -210,6 +210,9 @@ static const struct {
{ "fixed_dsp", checkasm_check_fixed_dsp },
{ "float_dsp", checkasm_check_float_dsp },
{ "av_tx", checkasm_check_av_tx },
+ #if CONFIG_SAND
+ { "rpi_sand", checkasm_check_rpi_sand },
+ #endif
#endif
{ NULL }
};
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..f4a0d20358 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -73,6 +73,7 @@ void checkasm_check_motion(void);
void checkasm_check_nlmeans(void);
void checkasm_check_opusdsp(void);
void checkasm_check_pixblockdsp(void);
+void checkasm_check_rpi_sand(void);
void checkasm_check_sbrdsp(void);
void checkasm_check_synth_filter(void);
void checkasm_check_sw_gbrp(void);
diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c
new file mode 100644
index 0000000000..0888714c4c
--- /dev/null
+++ b/tests/checkasm/rpi_sand.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2023 John Cox
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavutil/common.h"
+#include "libavutil/rpi_sand_fns.h"
+
+#if ARCH_ARM
+#include "libavutil/arm/cpu.h"
+#include "libavutil/arm/rpi_sand_neon.h"
+#elif ARCH_AARCH64
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/aarch64/rpi_sand_neon.h"
+#endif
+
+static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c)
+{
+ return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20);
+}
+
+void checkasm_check_rpi_sand(void)
+{
+ const unsigned int w = 1280;
+ const unsigned int h = 66;
+ const unsigned int stride1 = 128;
+ const unsigned int stride2 = h*3/2;
+ const unsigned int ssize = ((w+95)/96)*128*h*3/2;
+ const unsigned int ysize = ((w + 32) * (h + 32) * 2);
+
+ uint8_t * sbuf0 = malloc(ssize);
+ uint8_t * sbuf1 = malloc(ssize);
+ uint8_t * ybuf0 = malloc(ysize);
+ uint8_t * ybuf1 = malloc(ysize);
+ uint8_t * vbuf0 = malloc(ysize);
+ uint8_t * vbuf1 = malloc(ysize);
+ uint8_t * yframe0 = (w + 32) * 16 + ybuf0;
+ uint8_t * yframe1 = (w + 32) * 16 + ybuf1;
+ uint8_t * vframe0 = (w + 32) * 16 + vbuf0;
+ uint8_t * vframe1 = (w + 32) * 16 + vbuf1;
+ unsigned int i;
+
+ for (i = 0; i != ssize; i += 4)
+ *(uint32_t*)(sbuf0 + i) = rnd();
+ memcpy(sbuf1, sbuf0, ssize);
+
+ if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) {
+ declare_func(void, uint8_t * dst, const unsigned int dst_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+ memset(ybuf0, 0xbb, ysize);
+ memset(ybuf1, 0xbb, ysize);
+
+ call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h);
+ call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h);
+
+ if (memcmp(sbuf0, sbuf1, ssize)
+ || memcmp(ybuf0, ybuf1, ysize))
+ fail();
+
+ bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h);
+ }
+
+ if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) {
+ declare_func(void, uint8_t * u_dst, const unsigned int u_stride,
+ uint8_t * v_dst, const unsigned int v_stride,
+ const uint8_t * src,
+ unsigned int stride1, unsigned int stride2,
+ unsigned int _x, unsigned int y,
+ unsigned int _w, unsigned int h);
+
+ memset(ybuf0, 0xbb, ysize);
+ memset(ybuf1, 0xbb, ysize);
+ memset(vbuf0, 0xbb, ysize);
+ memset(vbuf1, 0xbb, ysize);
+
+ call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2);
+ call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2);
+
+ if (memcmp(sbuf0, sbuf1, ssize)
+ || memcmp(ybuf0, ybuf1, ysize)
+ || memcmp(vbuf0, vbuf1, ysize))
+ fail();
+
+ bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2);
+ }
+
+
+ report("sand30");
+
+ free(sbuf0);
+ free(sbuf1);
+ free(ybuf0);
+ free(ybuf1);
+ free(vbuf0);
+ free(vbuf1);
+}
+
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index a4e95541f5..6fda6d227e 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -27,6 +27,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \
fate-checkasm-motion \
fate-checkasm-opusdsp \
fate-checkasm-pixblockdsp \
+ fate-checkasm-rpi_sand \
fate-checkasm-sbrdsp \
fate-checkasm-synth_filter \
fate-checkasm-sw_gbrp \
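The "kludge" mentioned in the commit message is that checkasm has nothing to select on: the
av_rpi_sand* wrappers choose the NEON path internally rather than through a per-context
function-pointer table, so the test passes either the NEON symbol or the generic C wrapper
directly to check_func(). A hypothetical dispatch table of the shape checkasm normally relies
on might look roughly like this (sketch only; no such struct exists in the tree):

    #include <stdint.h>

    /* Hypothetical: a function-pointer table that would let checkasm swap
     * implementations in the usual way. rpi_sand_fns.c currently selects
     * NEON inside each av_rpi_* wrapper instead. */
    typedef struct RpiSandDSPContext {
        void (*sand30_to_planar_y16)(uint8_t *dst, unsigned int dst_stride,
                                     const uint8_t *src,
                                     unsigned int stride1, unsigned int stride2,
                                     unsigned int x, unsigned int y,
                                     unsigned int w, unsigned int h);
        /* ...one pointer per sand conversion function... */
    } RpiSandDSPContext;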
From d798c7b90dd63bca6f9878b1fb30ec1d8f0b9a5e Mon Sep 17 00:00:00 2001
From: James Darnley <jdarnley@obe.tv>
Date: Mon, 20 Feb 2023 20:55:08 +0100
Subject: [PATCH 142/161] avfilter/bwdif: move filter_line init to a dedicated
function
(cherry picked from commit b503b5a0cf80f38ecf4737c012b621b7e94f242a)
---
libavfilter/bwdif.h | 3 ++-
libavfilter/vf_bwdif.c | 13 +++++++++----
libavfilter/x86/vf_bwdif_init.c | 4 +---
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index 889ff772ed..5749345f78 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -37,6 +37,7 @@ typedef struct BWDIFContext {
int parity, int clip_max, int spat);
} BWDIFContext;
-void ff_bwdif_init_x86(BWDIFContext *bwdif);
+void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 09e68523bb..539fabbd46 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -341,7 +341,14 @@ static int config_props(AVFilterLink *link)
yadif->csp = av_pix_fmt_desc_get(link->format);
yadif->filter = filter;
- if (yadif->csp->comp[0].depth > 8) {
+ ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth);
+
+ return 0;
+}
+
+av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
+{
+ if (bit_depth > 8) {
s->filter_intra = filter_intra_16bit;
s->filter_line = filter_line_c_16bit;
s->filter_edge = filter_edge_16bit;
@@ -352,10 +359,8 @@ static int config_props(AVFilterLink *link)
}
#if ARCH_X86
- ff_bwdif_init_x86(s);
+ ff_bwdif_init_x86(s, bit_depth);
#endif
-
- return 0;
}
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
index e24e5cd9b1..ba7bc40c3d 100644
--- a/libavfilter/x86/vf_bwdif_init.c
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -42,11 +42,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif)
+av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
{
- YADIFContext *yadif = &bwdif->yadif;
int cpu_flags = av_get_cpu_flags();
- int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth;
if (bit_depth <= 8) {
if (EXTERNAL_SSE2(cpu_flags))
From 0eb9c627c07931cf93c4932e07e0df6c0ce860fd Mon Sep 17 00:00:00 2001
From: James Darnley <jdarnley@obe.tv>
Date: Mon, 20 Feb 2023 20:55:08 +0100
Subject: [PATCH 143/161] checkasm: add test for bwdif
(cherry picked from commit 087faf8cac51e5e20a5f41b36b8d4c2705a10039)
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 ++
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vf_bwdif.c | 84 +++++++++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 90 insertions(+)
create mode 100644 tests/checkasm/vf_bwdif.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 66291baf33..2c80d8e661 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
# libavfilter tests
AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
+AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o
AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 57e0091b80..4f983d7fbc 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -179,6 +179,9 @@ static const struct {
#if CONFIG_BLEND_FILTER
{ "vf_blend", checkasm_check_blend },
#endif
+ #if CONFIG_BWDIF_FILTER
+ { "vf_bwdif", checkasm_check_vf_bwdif },
+ #endif
#if CONFIG_COLORSPACE_FILTER
{ "vf_colorspace", checkasm_check_colorspace },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index f4a0d20358..d69bc43999 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -83,6 +83,7 @@ void checkasm_check_utvideodsp(void);
void checkasm_check_v210dec(void);
void checkasm_check_v210enc(void);
void checkasm_check_vc1dsp(void);
+void checkasm_check_vf_bwdif(void);
void checkasm_check_vf_eq(void);
void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
new file mode 100644
index 0000000000..46224bb575
--- /dev/null
+++ b/tests/checkasm/vf_bwdif.c
@@ -0,0 +1,84 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/internal.h"
+#include "libavfilter/bwdif.h"
+
+#define WIDTH 256
+
+#define randomize_buffers(buf0, buf1, mask, count) \
+ for (size_t i = 0; i < count; i++) \
+ buf0[i] = buf1[i] = rnd() & mask
+
+#define BODY(type, depth) \
+ do { \
+ type prev0[9*WIDTH], prev1[9*WIDTH]; \
+ type next0[9*WIDTH], next1[9*WIDTH]; \
+ type cur0[9*WIDTH], cur1[9*WIDTH]; \
+ type dst0[WIDTH], dst1[WIDTH]; \
+ const int stride = WIDTH; \
+ const int mask = (1<<depth)-1; \
+ \
+ declare_func(void, void *dst, void *prev, void *cur, void *next, \
+ int w, int prefs, int mrefs, int prefs2, int mrefs2, \
+ int prefs3, int mrefs3, int prefs4, int mrefs4, \
+ int parity, int clip_max); \
+ \
+ randomize_buffers(prev0, prev1, mask, 9*WIDTH); \
+ randomize_buffers(next0, next1, mask, 9*WIDTH); \
+ randomize_buffers( cur0, cur1, mask, 9*WIDTH); \
+ \
+ call_ref(dst0, prev0 + 4*WIDTH, cur0 + 4*WIDTH, next0 + 4*WIDTH, \
+ WIDTH, stride, -stride, 2*stride, -2*stride, \
+ 3*stride, -3*stride, 4*stride, -4*stride, \
+ 0, mask); \
+ call_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, \
+ WIDTH, stride, -stride, 2*stride, -2*stride, \
+ 3*stride, -3*stride, 4*stride, -4*stride, \
+ 0, mask); \
+ \
+ if (memcmp(dst0, dst1, sizeof dst0) \
+ || memcmp(prev0, prev1, sizeof prev0) \
+ || memcmp(next0, next1, sizeof next0) \
+ || memcmp( cur0, cur1, sizeof cur0)) \
+ fail(); \
+ bench_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, \
+ WIDTH, stride, -stride, 2*stride, -2*stride, \
+ 3*stride, -3*stride, 4*stride, -4*stride, \
+ 0, mask); \
+ } while (0)
+
+void checkasm_check_vf_bwdif(void)
+{
+ BWDIFContext ctx_8, ctx_10;
+
+ ff_bwdif_init_filter_line(&ctx_8, 8);
+ ff_bwdif_init_filter_line(&ctx_10, 10);
+
+ if (check_func(ctx_8.filter_line, "bwdif8")) {
+ BODY(uint8_t, 8);
+ report("bwdif8");
+ }
+
+ if (check_func(ctx_10.filter_line, "bwdif10")) {
+ BODY(uint16_t, 10);
+ report("bwdif10");
+ }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 6fda6d227e..1620ab0be0 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -38,6 +38,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \
fate-checkasm-v210enc \
fate-checkasm-vc1dsp \
fate-checkasm-vf_blend \
+ fate-checkasm-vf_bwdif \
fate-checkasm-vf_colorspace \
fate-checkasm-vf_eq \
fate-checkasm-vf_gblur \
From c19ab5f6e7f8dd9dff5704510db98a81a1304f80 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 6 Jul 2023 13:56:18 +0000
Subject: [PATCH 144/161] Revert "vf_bwdif: Add attributes to ask for
vectorization"
This reverts commit 281250290ba5c2dcd8676e9a261050e65c10bcb7.
Will be replaced by hand coded asm as on upstream
---
libavfilter/vf_bwdif.c | 29 ++++++++++++++---------------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 539fabbd46..34e8c5e234 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -74,10 +74,10 @@ typedef struct ThreadData {
int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \
int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \
int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
- {/*\
+ \
if (!diff) { \
dst[0] = d; \
- } else {*/
+ } else {
#define SPAT_CHECK() \
int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \
@@ -89,16 +89,15 @@ typedef struct ThreadData {
diff = FFMAX3(diff, min, -max);
#define FILTER_LINE() \
- int i1, i2; \
SPAT_CHECK() \
- /*if (FFABS(c - e) > temporal_diff0)*/ { \
- i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \
+ if (FFABS(c - e) > temporal_diff0) { \
+ interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \
- coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \
+ coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \
+ coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
- } /*else*/ { \
- i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
- }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\
+ } else { \
+ interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+ }
#define FILTER_EDGE() \
if (spat) { \
@@ -112,7 +111,7 @@ typedef struct ThreadData {
else if (interpol < d - diff) \
interpol = d - diff; \
\
- dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \
+ dst[0] = av_clip(interpol, 0, clip_max); \
} \
\
dst++; \
@@ -123,7 +122,7 @@ typedef struct ThreadData {
next2++; \
}
-static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
+static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max)
{
uint8_t *dst = dst1;
@@ -133,7 +132,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restr
FILTER_INTRA()
}
-static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max)
@@ -151,7 +150,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *rest
FILTER2()
}
-static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat)
{
@@ -168,7 +167,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restri
FILTER2()
}
-static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
+static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max)
{
uint16_t *dst = dst1;
@@ -178,7 +177,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void
FILTER_INTRA()
}
-static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max)
@@ -196,7 +195,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void
FILTER2()
}
-static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat)
{
From 093eddd9ef66a7db9e637f3acfe51d950c87f613 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:39 +0000
Subject: [PATCH 145/161] tests/checkasm: Add test for vf_bwdif filter_intra
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 7caa8d6b91e738ad2c1ea61746b6c062c470f7d3)
---
tests/checkasm/vf_bwdif.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
index 46224bb575..034bbabb4c 100644
--- a/tests/checkasm/vf_bwdif.c
+++ b/tests/checkasm/vf_bwdif.c
@@ -20,6 +20,7 @@
#include "checkasm.h"
#include "libavcodec/internal.h"
#include "libavfilter/bwdif.h"
+#include "libavutil/mem_internal.h"
#define WIDTH 256
@@ -81,4 +82,40 @@ void checkasm_check_vf_bwdif(void)
BODY(uint16_t, 10);
report("bwdif10");
}
+
+ if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
+ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
+ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
+ const int stride = WIDTH;
+ const int mask = (1<<8)-1;
+
+ declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max);
+
+ randomize_buffers( cur0, cur1, mask, 11*WIDTH);
+ memset(dst0, 0xba, WIDTH * 3);
+ memset(dst1, 0xba, WIDTH * 3);
+
+ call_ref(dst0 + stride,
+ cur0 + stride * 4, WIDTH,
+ stride, -stride, stride * 3, -stride * 3,
+ 0, mask);
+ call_new(dst1 + stride,
+ cur0 + stride * 4, WIDTH,
+ stride, -stride, stride * 3, -stride * 3,
+ 0, mask);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ bench_new(dst1 + stride,
+ cur0 + stride * 4, WIDTH,
+ stride, -stride, stride * 3, -stride * 3,
+ 0, mask);
+
+ report("bwdif8.intra");
+ }
}
From 28ef7402381b6fe241f81e21f302a23f8af674bf Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:40 +0000
Subject: [PATCH 146/161] avfilter/vf_bwdif: Add neon for filter_intra
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adds an outline for aarch neon functions
Adds common macros and consts for aarch64 neon
Exports C filter_intra needed for tail fixup of neon code
Adds neon for filter_intra
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 5075cfb4e6a21f6b4da9e62bdb0bad4cb32a4673)
---
libavfilter/aarch64/Makefile | 2 +
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 56 ++++++++
libavfilter/aarch64/vf_bwdif_neon.S | 136 ++++++++++++++++++++
libavfilter/bwdif.h | 4 +
libavfilter/vf_bwdif.c | 8 +-
5 files changed, 203 insertions(+), 3 deletions(-)
create mode 100644 libavfilter/aarch64/vf_bwdif_init_aarch64.c
create mode 100644 libavfilter/aarch64/vf_bwdif_neon.S
diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index b58daa3a3f..b68209bc94 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,3 +1,5 @@
+OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
+NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
new file mode 100644
index 0000000000..3ffaa07ab3
--- /dev/null
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -0,0 +1,56 @@
+/*
+ * bwdif aarch64 NEON optimisations
+ *
+ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavfilter/bwdif.h"
+#include "libavutil/aarch64/cpu.h"
+
+void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max);
+
+
+static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
+ w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+}
+
+void
+ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
+{
+ const int cpu_flags = av_get_cpu_flags();
+
+ if (bit_depth != 8)
+ return;
+
+ if (!have_neon(cpu_flags))
+ return;
+
+ s->filter_intra = filter_intra_helper;
+}
+
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
new file mode 100644
index 0000000000..e288efbe6c
--- /dev/null
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -0,0 +1,136 @@
+/*
+ * bwdif aarch64 NEON optimisations
+ *
+ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+
+// Space taken on the stack by an int (32-bit)
+#ifdef __APPLE__
+.set SP_INT, 4
+#else
+.set SP_INT, 8
+#endif
+
+.macro SQSHRUNN b, s0, s1, s2, s3, n
+ sqshrun \s0\().4h, \s0\().4s, #\n - 8
+ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8
+ sqshrun \s1\().4h, \s2\().4s, #\n - 8
+ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8
+ uzp2 \b\().16b, \s0\().16b, \s1\().16b
+.endm
+
+.macro SMULL4K a0, a1, a2, a3, s0, s1, k
+ smull \a0\().4s, \s0\().4h, \k
+ smull2 \a1\().4s, \s0\().8h, \k
+ smull \a2\().4s, \s1\().4h, \k
+ smull2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMULL4K a0, a1, a2, a3, s0, s1, k
+ umull \a0\().4s, \s0\().4h, \k
+ umull2 \a1\().4s, \s0\().8h, \k
+ umull \a2\().4s, \s1\().4h, \k
+ umull2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
+ umlal \a0\().4s, \s0\().4h, \k
+ umlal2 \a1\().4s, \s0\().8h, \k
+ umlal \a2\().4s, \s1\().4h, \k
+ umlal2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
+ umlsl \a0\().4s, \s0\().4h, \k
+ umlsl2 \a1\().4s, \s0\().8h, \k
+ umlsl \a2\().4s, \s1\().4h, \k
+ umlsl2 \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro LDR_COEFFS d, t0
+ movrel \t0, coeffs, 0
+ ld1 {\d\().8h}, [\t0]
+.endm
+
+// static const uint16_t coef_lf[2] = { 4309, 213 };
+// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
+// static const uint16_t coef_sp[2] = { 5077, 981 };
+
+const coeffs, align=4 // align 4 means align on 2^4 boundry
+ .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0]
+ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5]
+ .hword 5077, 981 // sp[0] = v0.h[6]
+endconst
+
+// ============================================================================
+//
+// void ff_bwdif_filter_intra_neon(
+// void *dst1, // x0
+// void *cur1, // x1
+// int w, // w2
+// int prefs, // w3
+// int mrefs, // w4
+// int prefs3, // w5
+// int mrefs3, // w6
+// int parity, // w7 unused
+// int clip_max) // [sp, #0] unused
+
+function ff_bwdif_filter_intra_neon, export=1
+ cmp w2, #0
+ ble 99f
+
+ LDR_COEFFS v0, x17
+
+// for (x = 0; x < w; x++) {
+10:
+
+// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
+ ldr q31, [x1, w4, sxtw]
+ ldr q30, [x1, w3, sxtw]
+ ldr q29, [x1, w6, sxtw]
+ ldr q28, [x1, w5, sxtw]
+
+ uaddl v20.8h, v31.8b, v30.8b
+ uaddl2 v21.8h, v31.16b, v30.16b
+
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6]
+
+ uaddl v20.8h, v29.8b, v28.8b
+ uaddl2 v21.8h, v29.16b, v28.16b
+
+ UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7]
+
+// dst[0] = av_clip(interpol, 0, clip_max);
+ SQSHRUNN v2, v2, v3, v4, v5, 13
+ str q2, [x0], #16
+
+// dst++;
+// cur++;
+// }
+
+ subs w2, w2, #16
+ add x1, x1, #16
+ bgt 10b
+
+99:
+ ret
+endfunc
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index 5749345f78..ae6f6ce223 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -39,5 +39,9 @@ typedef struct BWDIFContext {
void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
+
+void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max);
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 34e8c5e234..6ec8bbab5d 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -122,8 +122,8 @@ typedef struct ThreadData {
next2++; \
}
-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
- int prefs3, int mrefs3, int parity, int clip_max)
+void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max)
{
uint8_t *dst = dst1;
uint8_t *cur = cur1;
@@ -352,13 +352,15 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
s->filter_line = filter_line_c_16bit;
s->filter_edge = filter_edge_16bit;
} else {
- s->filter_intra = filter_intra;
+ s->filter_intra = ff_bwdif_filter_intra_c;
s->filter_line = filter_line_c;
s->filter_edge = filter_edge;
}
#if ARCH_X86
ff_bwdif_init_x86(s, bit_depth);
+#elif ARCH_AARCH64
+ ff_bwdif_init_aarch64(s, bit_depth);
#endif
}
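For comparison with the NEON above, the per-pixel maths it implements is the spatial-only
interpolation quoted in the asm comments; a de-macroed 8-bit scalar sketch (illustrative,
not a drop-in replacement for ff_bwdif_filter_intra_c):

    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip() */

    static void bwdif_intra_scalar(uint8_t *dst, const uint8_t *cur, int w,
                                   int prefs, int mrefs, int prefs3, int mrefs3,
                                   int clip_max)
    {
        static const int coef_sp[2] = { 5077, 981 };  /* as in the coeffs table above */

        for (int x = 0; x < w; x++) {
            int interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) -
                            coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
            dst[0] = av_clip(interpol, 0, clip_max);
            dst++;
            cur++;
        }
    }

Note that filter_intra_helper only hands the NEON routine 8-bit data (clip_max == 255) in
whole 16-pixel groups; any remaining tail pixels are finished with ff_bwdif_filter_intra_c.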
From 2f8199a41cfd43595352899e722646052b0db2ee Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:41 +0000
Subject: [PATCH 147/161] tests/checkasm: Add test for vf_bwdif filter_edge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 7ed7c00f55a50ac88589f9e17c172d4a4fce0581)
---
tests/checkasm/vf_bwdif.c | 54 +++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
index 034bbabb4c..5fdba09fdc 100644
--- a/tests/checkasm/vf_bwdif.c
+++ b/tests/checkasm/vf_bwdif.c
@@ -83,6 +83,60 @@ void checkasm_check_vf_bwdif(void)
report("bwdif10");
}
+ {
+ LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
+ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
+ const int stride = WIDTH;
+ const int mask = (1<<8)-1;
+ int spat;
+ int parity;
+
+ for (spat = 0; spat != 2; ++spat) {
+ for (parity = 0; parity != 2; ++parity) {
+ if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) {
+
+ declare_func(void, void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
+
+ randomize_buffers(prev0, prev1, mask, 11*WIDTH);
+ randomize_buffers(next0, next1, mask, 11*WIDTH);
+ randomize_buffers( cur0, cur1, mask, 11*WIDTH);
+ memset(dst0, 0xba, WIDTH * 3);
+ memset(dst1, 0xba, WIDTH * 3);
+
+ call_ref(dst0 + stride,
+ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH,
+ stride, -stride, stride * 2, -stride * 2,
+ parity, mask, spat);
+ call_new(dst1 + stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
+ stride, -stride, stride * 2, -stride * 2,
+ parity, mask, spat);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp(prev0, prev1, WIDTH*11)
+ || memcmp(next0, next1, WIDTH*11)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ bench_new(dst1 + stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
+ stride, -stride, stride * 2, -stride * 2,
+ parity, mask, spat);
+ }
+ }
+ }
+
+ report("bwdif8.edge");
+ }
+
if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
From 171d7f201503812617b8e320c83cc33120425923 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:42 +0000
Subject: [PATCH 148/161] avfilter/vf_bwdif: Add neon for filter_edge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adds clip and spatial macros for aarch64 neon
Exports C filter_edge needed for tail fixup of neon code
Adds neon for filter_edge
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 8130df83e0fbd3264fe990fb4e084ecbd452d0b1)
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 +++
libavfilter/aarch64/vf_bwdif_neon.S | 177 ++++++++++++++++++++
libavfilter/bwdif.h | 4 +
libavfilter/vf_bwdif.c | 8 +-
4 files changed, 205 insertions(+), 4 deletions(-)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index 3ffaa07ab3..e75cf2f204 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -24,10 +24,29 @@
#include "libavfilter/bwdif.h"
#include "libavutil/aarch64/cpu.h"
+void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
+
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
+ parity, clip_max, spat);
+
+ if (w0 < w)
+ ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
+ w - w0, prefs, mrefs, prefs2, mrefs2,
+ parity, clip_max, spat);
+}
+
static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max)
{
@@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return;
s->filter_intra = filter_intra_helper;
+ s->filter_edge = filter_edge_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index e288efbe6c..389302b813 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -66,6 +66,79 @@
umlsl2 \a3\().4s, \s1\().8h, \k
.endm
+// int b = m2s1 - m1;
+// int f = p2s1 - p1;
+// int dc = c0s1 - m1;
+// int de = c0s1 - p1;
+// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
+// sp_max = FFMIN(sp_max, FFMAX(-b,-f));
+// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
+// sp_min = FFMIN(sp_min, FFMAX(b,f));
+// diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
+.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
+ uqsub \t0\().16b, \p1\().16b, \c0s1\().16b
+ uqsub \t2\().16b, \m1\().16b, \c0s1\().16b
+ umin \t2\().16b, \t0\().16b, \t2\().16b
+
+ uqsub \t1\().16b, \m1\().16b, \m2s1\().16b
+ uqsub \t3\().16b, \p1\().16b, \p2s1\().16b
+ umax \t3\().16b, \t3\().16b, \t1\().16b
+ umin \t3\().16b, \t3\().16b, \t2\().16b
+
+ uqsub \t0\().16b, \c0s1\().16b, \p1\().16b
+ uqsub \t2\().16b, \c0s1\().16b, \m1\().16b
+ umin \t2\().16b, \t0\().16b, \t2\().16b
+
+ uqsub \t1\().16b, \m2s1\().16b, \m1\().16b
+ uqsub \t0\().16b, \p2s1\().16b, \p1\().16b
+ umax \t0\().16b, \t0\().16b, \t1\().16b
+ umin \t2\().16b, \t2\().16b, \t0\().16b
+
+ cmeq \t1\().16b, \diff\().16b, #0
+ umax \diff\().16b, \diff\().16b, \t3\().16b
+ umax \diff\().16b, \diff\().16b, \t2\().16b
+ bic \diff\().16b, \diff\().16b, \t1\().16b
+.endm
+
+// i0 = s0;
+// if (i0 > d0 + diff0)
+// i0 = d0 + diff0;
+// else if (i0 < d0 - diff0)
+// i0 = d0 - diff0;
+//
+// i0 = s0 is safe
+.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
+ uqadd \t0\().16b, \d0\().16b, \diff\().16b
+ uqsub \t1\().16b, \d0\().16b, \diff\().16b
+ umin \i0\().16b, \s0\().16b, \t0\().16b
+ umax \i0\().16b, \i0\().16b, \t1\().16b
+.endm
+
+// i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
+// DIFF_CLIP
+//
+// i0 = i1 is safe
+.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
+ uabd \t0\().16b, \m1\().16b, \p1\().16b
+ cmhi \t0\().16b, \t0\().16b, \td0\().16b
+ bsl \t0\().16b, \i1\().16b, \i2\().16b
+ DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2
+.endm
+
+.macro PUSH_VREGS
+ stp d8, d9, [sp, #-64]!
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+.endm
+
+.macro POP_VREGS
+ ldp d14, d15, [sp, #48]
+ ldp d12, d13, [sp, #32]
+ ldp d10, d11, [sp, #16]
+ ldp d8, d9, [sp], #64
+.endm
+
.macro LDR_COEFFS d, t0
movrel \t0, coeffs, 0
ld1 {\d\().8h}, [\t0]
@@ -81,6 +154,110 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry
.hword 5077, 981 // sp[0] = v0.h[6]
endconst
+// ============================================================================
+//
+// void ff_bwdif_filter_edge_neon(
+// void *dst1, // x0
+// void *prev1, // x1
+// void *cur1, // x2
+// void *next1, // x3
+// int w, // w4
+// int prefs, // w5
+// int mrefs, // w6
+// int prefs2, // w7
+// int mrefs2, // [sp, #0]
+// int parity, // [sp, #SP_INT]
+// int clip_max, // [sp, #SP_INT*2] unused
+// int spat); // [sp, #SP_INT*3]
+
+function ff_bwdif_filter_edge_neon, export=1
+ // Sanity check w
+ cmp w4, #0
+ ble 99f
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+
+ ldr w8, [sp, #0] // mrefs2
+
+ ldr w17, [sp, #SP_INT] // parity
+ ldr w16, [sp, #SP_INT*3] // spat
+ cmp w17, #0
+ csel x17, x1, x3, ne
+
+// for (x = 0; x < w; x++) {
+
+10:
+// int m1 = cur[mrefs];
+// int d = (prev2[0] + next2[0]) >> 1;
+// int p1 = cur[prefs];
+// int temporal_diff0 = FFABS(prev2[0] - next2[0]);
+// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
+ ldr q31, [x2]
+ ldr q21, [x17]
+ uhadd v16.16b, v31.16b, v21.16b // d0 = v16
+ uabd v17.16b, v31.16b, v21.16b // td0 = v17
+ ldr q24, [x2, w6, sxtw] // m1 = v24
+ ldr q22, [x2, w5, sxtw] // p1 = v22
+
+ ldr q0, [x1, w6, sxtw] // prev[mrefs]
+ ldr q2, [x1, w5, sxtw] // prev[prefs]
+ ldr q1, [x3, w6, sxtw] // next[mrefs]
+ ldr q3, [x3, w5, sxtw] // next[prefs]
+
+ ushr v29.16b, v17.16b, #1
+
+ uabd v31.16b, v0.16b, v24.16b
+ uabd v30.16b, v2.16b, v22.16b
+ uhadd v0.16b, v31.16b, v30.16b // td1 = q0
+
+ uabd v31.16b, v1.16b, v24.16b
+ uabd v30.16b, v3.16b, v22.16b
+ uhadd v1.16b, v31.16b, v30.16b // td2 = q1
+
+ umax v0.16b, v0.16b, v29.16b
+ umax v0.16b, v0.16b, v1.16b // diff = v0
+
+// if (spat) {
+// SPAT_CHECK()
+// }
+// i0 = (m1 + p1) >> 1;
+ cbz w16, 1f
+
+ ldr q31, [x2, w8, sxtw]
+ ldr q18, [x17, w8, sxtw]
+ ldr q30, [x2, w7, sxtw]
+ ldr q19, [x17, w7, sxtw]
+ uhadd v18.16b, v18.16b, v31.16b
+ uhadd v19.16b, v19.16b, v30.16b
+
+ SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28
+
+1:
+ uhadd v2.16b, v22.16b, v24.16b
+
+ // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
+ DIFF_CLIP v2, v2, v16, v0, v31, v30
+
+// dst[0] = av_clip(interpol, 0, clip_max);
+ str q2, [x0], #16
+
+// dst++;
+// cur++;
+// }
+ subs w4, w4, #16
+ add x1, x1, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x17, x17, #16
+ bgt 10b
+
+99:
+ ret
+endfunc
+
// ============================================================================
//
// void ff_bwdif_filter_intra_neon(
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index ae6f6ce223..ae1616d366 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -41,6 +41,10 @@ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
+
void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 6ec8bbab5d..688c2d2572 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -150,9 +150,9 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int parity, int clip_max, int spat)
+void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -354,7 +354,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
} else {
s->filter_intra = ff_bwdif_filter_intra_c;
s->filter_line = filter_line_c;
- s->filter_edge = filter_edge;
+ s->filter_edge = ff_bwdif_filter_edge_c;
}
#if ARCH_X86
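Condensing the SPAT_CHECK comments above into scalar form (reference sketch only; it ignores
the unsigned saturation the uqsub-based NEON code relies on and drops the unused dc/de
intermediates):

    #include "libavutil/common.h"   /* FFMIN, FFMAX, FFMAX3 */

    /* m1/p1 are cur[mrefs]/cur[prefs], c0s1 is (prev2[0] + next2[0]) >> 1 and
     * m2s1/p2s1 the equivalent averages at mrefs2/prefs2, as named in the asm
     * comments. Returns the widened diff, or 0 if diff was 0. */
    static inline int bwdif_spat_check(int diff, int m2s1, int m1, int c0s1,
                                       int p1, int p2s1)
    {
        int b = m2s1 - m1;
        int f = p2s1 - p1;
        int sp_max = FFMIN(FFMIN(p1 - c0s1, m1 - c0s1), FFMAX(-b, -f));
        int sp_min = FFMIN(FFMIN(c0s1 - p1, c0s1 - m1), FFMAX(b, f));
        return diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
    }

The edge function then clips the spatial average (m1 + p1) >> 1 to d0 +/- diff, which is what
the DIFF_CLIP macro does with uqadd/uqsub/umin/umax.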
From abf6588935bce275ba302766bcd8c3bb7a523d3c Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:43 +0000
Subject: [PATCH 149/161] avfilter/vf_bwdif: Add neon for filter_line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Exports C filter_line needed for tail fixup of neon code
Adds neon for filter_line
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 94cb94a2c0910d364a7181fc5cc0e9556b777d0a)
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++
libavfilter/aarch64/vf_bwdif_neon.S | 203 ++++++++++++++++++++
libavfilter/bwdif.h | 5 +
libavfilter/vf_bwdif.c | 10 +-
4 files changed, 234 insertions(+), 5 deletions(-)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index e75cf2f204..21e67884ab 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max);
+
+
+static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
+ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
+ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+}
static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return;
s->filter_intra = filter_intra_helper;
+ s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index 389302b813..f185e94e3c 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -154,6 +154,209 @@ const coeffs, align=4 // align 4 means align on 2^4 boundary
.hword 5077, 981 // sp[0] = v0.h[6]
endconst
+// ===========================================================================
+//
+// void filter_line(
+// void *dst1, // x0
+// void *prev1, // x1
+// void *cur1, // x2
+// void *next1, // x3
+// int w, // w4
+// int prefs, // w5
+// int mrefs, // w6
+// int prefs2, // w7
+// int mrefs2, // [sp, #0]
+// int prefs3, // [sp, #SP_INT]
+// int mrefs3, // [sp, #SP_INT*2]
+// int prefs4, // [sp, #SP_INT*3]
+// int mrefs4, // [sp, #SP_INT*4]
+// int parity, // [sp, #SP_INT*5]
+// int clip_max) // [sp, #SP_INT*6]
+
+function ff_bwdif_filter_line_neon, export=1
+ // Sanity check w
+ cmp w4, #0
+ ble 99f
+
+ // Rearrange regs to be the same as line3 for ease of debug!
+ mov w10, w4 // w10 = loop count
+ mov w9, w6 // w9 = mref
+ mov w12, w7 // w12 = pref2
+ mov w11, w5 // w11 = pref
+ ldr w8, [sp, #0] // w8 = mref2
+ ldr w7, [sp, #SP_INT*2] // w7 = mref3
+ ldr w6, [sp, #SP_INT*4] // w6 = mref4
+ ldr w13, [sp, #SP_INT] // w13 = pref3
+ ldr w14, [sp, #SP_INT*3] // w14 = pref4
+
+ mov x4, x3
+ mov x3, x2
+ mov x2, x1
+
+ LDR_COEFFS v0, x17
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+ ldr w17, [sp, #SP_INT*5] // parity
+ cmp w17, #0
+ csel x17, x2, x4, ne
+
+ PUSH_VREGS
+
+// for (x = 0; x < w; x++) {
+// int diff0, diff2;
+// int d0, d2;
+// int temporal_diff0, temporal_diff2;
+//
+// int i1, i2;
+// int j1, j2;
+// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
+
+10:
+// c0 = prev2[0] + next2[0]; // c0 = v20, v21
+// d0 = c0 >> 1; // d0 = v10
+// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
+ ldr q31, [x3]
+ ldr q21, [x17]
+ uhadd v10.16b, v31.16b, v21.16b
+ uabd v11.16b, v31.16b, v21.16b
+ uaddl v20.8h, v21.8b, v31.8b
+ uaddl2 v21.8h, v21.16b, v31.16b
+
+ ldr q31, [x3, w6, sxtw]
+ ldr q23, [x17, w6, sxtw]
+
+// i1 = coef_hf[0] * c0; // i1 = v2-v5
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
+
+ ldr q30, [x3, w14, sxtw]
+ ldr q25, [x17, w14, sxtw]
+
+// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
+ uaddl v22.8h, v23.8b, v31.8b
+ uaddl2 v23.8h, v23.16b, v31.16b
+
+// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
+ uhadd v12.16b, v25.16b, v30.16b
+ uaddl v24.8h, v25.8b, v30.8b
+ uaddl2 v25.8h, v25.16b, v30.16b
+
+// m3 = cur[mrefs3]; // m3 = v20
+ ldr q20, [x3, w7, sxtw]
+
+// p3 = cur[prefs3]; // p3 = v21
+ ldr q21, [x3, w13, sxtw]
+
+// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
+
+ ldr q29, [x3, w8, sxtw]
+ ldr q23, [x17, w8, sxtw]
+
+// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
+ uaddl v30.8h, v20.8b, v21.8b
+ uaddl2 v31.8h, v20.16b, v21.16b
+
+ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
+
+ ldr q31, [x3, w12, sxtw]
+ ldr q27, [x17, w12, sxtw]
+
+// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
+ uhadd v13.16b, v23.16b, v29.16b
+ uaddl v22.8h, v23.8b, v29.8b
+ uaddl2 v23.8h, v23.16b, v29.16b
+
+// m1 = cur[mrefs]; // m1 = v24
+ ldr q24, [x3, w9, sxtw]
+
+// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
+// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
+// d2 = p2 >> 1; // d2 = v15
+ uabd v14.16b, v31.16b, v27.16b
+ uhadd v15.16b, v31.16b, v27.16b
+ uaddl v26.8h, v27.8b, v31.8b
+ uaddl2 v27.8h, v27.16b, v31.16b
+
+// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
+
+// p1 = cur[prefs]; // p1 = v22
+ ldr q22, [x3, w11, sxtw]
+
+// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
+ uaddl v18.8h, v22.8b, v24.8b
+ uaddl2 v19.8h, v22.16b, v24.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v20.8b, v21.8b
+ uaddl2 v19.8h, v20.16b, v21.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v17, v28, v29, v30, v31, 13
+
+// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
+ uaddl v26.8h, v24.8b, v22.8b
+ uaddl2 v27.8h, v24.16b, v22.16b
+ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
+
+ ldr q31, [x2, w9, sxtw]
+ ldr q29, [x4, w9, sxtw]
+
+ ldr q30, [x2, w11, sxtw]
+ ldr q28, [x4, w11, sxtw]
+
+// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
+ SQSHRUNN v2, v2, v3, v4, v5, 15
+
+// {
+// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+ uabd v30.16b, v22.16b, v30.16b
+ uabd v31.16b, v24.16b, v31.16b
+ uabd v28.16b, v22.16b, v28.16b
+ uabd v29.16b, v24.16b, v29.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
+ ushr v18.16b, v11.16b, #1
+ umax v18.16b, v18.16b, v31.16b
+ umax v18.16b, v18.16b, v29.16b
+
+ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
+ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
+
+ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
+ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
+
+// dst[0] = av_clip_uint8(interpol);
+ str q2, [x0], #16
+// }
+//
+// dst++;
+// cur++;
+// prev++;
+// prev2++;
+// next++;
+// }
+
+ subs w10, w10, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x4, x4, #16
+ add x17, x17, #16
+ bgt 10b
+
+ POP_VREGS
+99:
+ ret
+endfunc
+
// ============================================================================
//
// void ff_bwdif_filter_edge_neon(
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index ae1616d366..cce99953f3 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -48,4 +48,9 @@ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max);
+
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 688c2d2572..2dc47f9614 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -132,10 +132,10 @@ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs
FILTER_INTRA()
}
-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int prefs3, int mrefs3, int prefs4, int mrefs4,
- int parity, int clip_max)
+void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -353,7 +353,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
s->filter_edge = filter_edge_16bit;
} else {
s->filter_intra = ff_bwdif_filter_intra_c;
- s->filter_line = filter_line_c;
+ s->filter_line = ff_bwdif_filter_line_c;
s->filter_edge = ff_bwdif_filter_edge_c;
}
From 7601de6ab2604d1f530e4b8f20f409d1ec2ae6a4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:44 +0000
Subject: [PATCH 150/161] avfilter/vf_bwdif: Add a filter_line3 method for
optimisation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an optional filter_line3 to the available optimisations.
filter_line3 is equivalent to filter_line, memcpy, filter_line
filter_line shares quite a number of loads and some calculations in
common with its next iteration, and testing shows that the aarch64
neon filter_line3 performs 30% better than two filter_lines and a
memcpy.
Adds a test for vf_bwdif filter_line3 to checkasm
Rounds job start lines down to a multiple of 4. This means that if
filter_line3 exists then filter_line will no longer sometimes be called
once at the end of a slice, depending on thread count. The final slice
may do up to 3 extra lines, but filter_edge is faster than filter_line
so it is unlikely to create any noticeable thread load variation.
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 697533e76dbea8cc7fd6a0642bc60050cc05ead8)
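Illustration only, not part of the patch: the rounding done by the
job_start() helper added below can be seen with an arbitrary example of a
1080-line frame split over 4 jobs - every slice starts on a multiple of 4
and the last slice absorbs the odd lines.

    /* Hypothetical demo of the job_start() rounding; the frame height and
     * job count are invented example values. */
    #include <stdio.h>

    static int job_start(const int jobnr, const int nb_jobs, const int h)
    {
        return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3;
    }

    int main(void)
    {
        const int h = 1080, nb_jobs = 4;
        for (int jobnr = 0; jobnr < nb_jobs; jobnr++)
            printf("job %d: lines %d..%d\n", jobnr,
                   job_start(jobnr, nb_jobs, h),
                   job_start(jobnr + 1, nb_jobs, h) - 1);
        return 0;   /* prints 0..267, 268..539, 540..807, 808..1079 */
    }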
---
libavfilter/bwdif.h | 7 ++++
libavfilter/vf_bwdif.c | 44 +++++++++++++++++++--
tests/checkasm/vf_bwdif.c | 81 +++++++++++++++++++++++++++++++++++++++
3 files changed, 129 insertions(+), 3 deletions(-)
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index cce99953f3..496cec72ef 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -35,6 +35,9 @@ typedef struct BWDIFContext {
void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat);
+ void (*filter_line3)(void *dst, int dstride,
+ const void *prev, const void *cur, const void *next, int prefs,
+ int w, int parity, int clip_max);
} BWDIFContext;
void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
@@ -53,4 +56,8 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max);
+void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max);
+
#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 2dc47f9614..9847d38b6a 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -150,6 +150,31 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
+#define NEXT_LINE()\
+ dst += d_stride; \
+ prev += prefs; \
+ cur += prefs; \
+ next += prefs;
+
+void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max)
+{
+ const int prefs = s_stride;
+ uint8_t * dst = dst1;
+ const uint8_t * prev = prev1;
+ const uint8_t * cur = cur1;
+ const uint8_t * next = next1;
+
+ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
+ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
+ NEXT_LINE();
+ memcpy(dst, cur, w);
+ NEXT_LINE();
+ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
+ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
+}
+
void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int parity, int clip_max, int spat)
@@ -212,6 +237,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
FILTER2()
}
+// Round job start line down to multiple of 4 so that if filter_line3 exists
+// and the frame is a multiple of 4 high then filter_line will never be called
+static inline int job_start(const int jobnr, const int nb_jobs, const int h)
+{
+ return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3;
+}
+
static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
BWDIFContext *s = ctx->priv;
@@ -221,8 +253,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1;
int df = (yadif->csp->comp[td->plane].depth + 7) / 8;
int refs = linesize / df;
- int slice_start = (td->h * jobnr ) / nb_jobs;
- int slice_end = (td->h * (jobnr+1)) / nb_jobs;
+ int slice_start = job_start(jobnr, nb_jobs, td->h);
+ int slice_end = job_start(jobnr + 1, nb_jobs, td->h);
int y;
for (y = slice_start; y < slice_end; y++) {
@@ -244,6 +276,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
refs << 1, -(refs << 1),
td->parity ^ td->tff, clip_max,
(y < 2) || ((y + 3) > td->h) ? 0 : 1);
+ } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) {
+ s->filter_line3(dst, td->frame->linesize[td->plane],
+ prev, cur, next, linesize, td->w,
+ td->parity ^ td->tff, clip_max);
+ y += 2;
} else {
s->filter_line(dst, prev, cur, next, td->w,
refs, -refs, refs << 1, -(refs << 1),
@@ -280,7 +317,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
td.plane = i;
ff_filter_execute(ctx, filter_slice, &td, NULL,
- FFMIN(h, ff_filter_get_nb_threads(ctx)));
+ FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx)));
}
if (yadif->current_field == YADIF_FIELD_END) {
yadif->current_field = YADIF_FIELD_NORMAL;
@@ -347,6 +384,7 @@ static int config_props(AVFilterLink *link)
av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
{
+ s->filter_line3 = 0;
if (bit_depth > 8) {
s->filter_intra = filter_intra_16bit;
s->filter_line = filter_line_c_16bit;
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
index 5fdba09fdc..3399cacdf7 100644
--- a/tests/checkasm/vf_bwdif.c
+++ b/tests/checkasm/vf_bwdif.c
@@ -28,6 +28,10 @@
for (size_t i = 0; i < count; i++) \
buf0[i] = buf1[i] = rnd() & mask
+#define randomize_overflow_check(buf0, buf1, mask, count) \
+ for (size_t i = 0; i < count; i++) \
+ buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0;
+
#define BODY(type, depth) \
do { \
type prev0[9*WIDTH], prev1[9*WIDTH]; \
@@ -83,6 +87,83 @@ void checkasm_check_vf_bwdif(void)
report("bwdif10");
}
+ if (!ctx_8.filter_line3)
+ ctx_8.filter_line3 = ff_bwdif_filter_line3_c;
+
+ {
+ LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
+ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
+ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
+ const int stride = WIDTH;
+ const int mask = (1<<8)-1;
+ int parity;
+
+ for (parity = 0; parity != 2; ++parity) {
+ if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) {
+
+ declare_func(void, void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int prefs,
+ int w, int parity, int clip_max);
+
+ randomize_buffers(prev0, prev1, mask, 11*WIDTH);
+ randomize_buffers(next0, next1, mask, 11*WIDTH);
+ randomize_buffers( cur0, cur1, mask, 11*WIDTH);
+
+ call_ref(dst0, stride,
+ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
+ WIDTH, parity, mask);
+ call_new(dst1, stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
+ WIDTH, parity, mask);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp(prev0, prev1, WIDTH*11)
+ || memcmp(next0, next1, WIDTH*11)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ bench_new(dst1, stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
+ WIDTH, parity, mask);
+ }
+ }
+
+ // Use just 0s and ~0s to try to provoke bad cropping or overflow
+ // Parity makes no difference to this test so just test 0
+ if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) {
+
+ declare_func(void, void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int prefs,
+ int w, int parity, int clip_max);
+
+ randomize_overflow_check(prev0, prev1, mask, 11*WIDTH);
+ randomize_overflow_check(next0, next1, mask, 11*WIDTH);
+ randomize_overflow_check( cur0, cur1, mask, 11*WIDTH);
+
+ call_ref(dst0, stride,
+ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
+ WIDTH, 0, mask);
+ call_new(dst1, stride,
+ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
+ WIDTH, 0, mask);
+
+ if (memcmp(dst0, dst1, WIDTH*3)
+ || memcmp(prev0, prev1, WIDTH*11)
+ || memcmp(next0, next1, WIDTH*11)
+ || memcmp( cur0, cur1, WIDTH*11))
+ fail();
+
+ // No point to benching
+ }
+
+ report("bwdif8.line3");
+ }
+
{
LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
From 120058b7abd0db1d222b1e197207de8226fdfd94 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 4 Jul 2023 14:04:45 +0000
Subject: [PATCH 151/161] avfilter/vf_bwdif: Add neon for filter_line3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit f00222e81f7d6a59d977fbb280d67989818e0ad2)
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 28 ++
libavfilter/aarch64/vf_bwdif_neon.S | 272 ++++++++++++++++++++
2 files changed, 300 insertions(+)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index 21e67884ab..f52bc4b9b4 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -36,6 +36,33 @@ void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max);
+void ff_bwdif_filter_line3_neon(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max);
+
+
+static void filter_line3_helper(void * dst1, int d_stride,
+ const void * prev1, const void * cur1, const void * next1, int s_stride,
+ int w, int parity, int clip_max)
+{
+ // Asm works on 16 byte chunks
+    // If w is a multiple of 16 then all is good - if not, and the width
+    // rounded up to the nearest 16 fits within both the src & dst strides,
+    // then allow the asm to write over the padding bytes as that is almost
+    // certainly faster than having to invoke the C version to clean up the
+    // tail.
+ const int w1 = FFALIGN(w, 16);
+ const int w0 = clip_max != 255 ? 0 :
+ d_stride <= w1 && s_stride <= w1 ? w : w & ~15;
+
+ ff_bwdif_filter_line3_neon(dst1, d_stride,
+ prev1, cur1, next1, s_stride,
+ w0, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride,
+ (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride,
+ w - w0, parity, clip_max);
+}
static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
@@ -93,5 +120,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
s->filter_intra = filter_intra_helper;
s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper;
+ s->filter_line3 = filter_line3_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index f185e94e3c..ae9aab20cd 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -154,6 +154,278 @@ const coeffs, align=4 // align 4 means align on 2^4 boundary
.hword 5077, 981 // sp[0] = v0.h[6]
endconst
+// ===========================================================================
+//
+// void ff_bwdif_filter_line3_neon(
+// void * dst1, // x0
+// int d_stride, // w1
+// const void * prev1, // x2
+// const void * cur1, // x3
+// const void * next1, // x4
+// int s_stride, // w5
+// int w, // w6
+// int parity, // w7
+// int clip_max); // [sp, #0] (Ignored)
+
+function ff_bwdif_filter_line3_neon, export=1
+ // Sanity check w
+ cmp w6, #0
+ ble 99f
+
+ LDR_COEFFS v0, x17
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+ cmp w7, #0
+ csel x17, x2, x4, ne
+
+ // We want all the V registers - save all the ones we must
+ PUSH_VREGS
+
+ // Some rearrangement of initial values for nice layout of refs in regs
+ mov w10, w6 // w10 = loop count
+ neg w9, w5 // w9 = mref
+ lsl w8, w9, #1 // w8 = mref2
+ add w7, w9, w9, LSL #1 // w7 = mref3
+ lsl w6, w9, #2 // w6 = mref4
+ mov w11, w5 // w11 = pref
+ lsl w12, w5, #1 // w12 = pref2
+ add w13, w5, w5, LSL #1 // w13 = pref3
+ lsl w14, w5, #2 // w14 = pref4
+ add w15, w5, w5, LSL #2 // w15 = pref5
+ add w16, w14, w12 // w16 = pref6
+
+ lsl w5, w1, #1 // w5 = d_stride * 2
+
+// for (x = 0; x < w; x++) {
+// int diff0, diff2;
+// int d0, d2;
+// int temporal_diff0, temporal_diff2;
+//
+// int i1, i2;
+// int j1, j2;
+// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
+
+10:
+// c0 = prev2[0] + next2[0]; // c0 = v20, v21
+// d0 = c0 >> 1; // d0 = v10
+// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
+ ldr q31, [x3]
+ ldr q21, [x17]
+ uhadd v10.16b, v31.16b, v21.16b
+ uabd v11.16b, v31.16b, v21.16b
+ uaddl v20.8h, v21.8b, v31.8b
+ uaddl2 v21.8h, v21.16b, v31.16b
+
+ ldr q31, [x3, w6, sxtw]
+ ldr q23, [x17, w6, sxtw]
+
+// i1 = coef_hf[0] * c0; // i1 = v2-v5
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
+
+ ldr q30, [x3, w14, sxtw]
+ ldr q25, [x17, w14, sxtw]
+
+// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
+ uaddl v22.8h, v23.8b, v31.8b
+ uaddl2 v23.8h, v23.16b, v31.16b
+
+// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
+ uhadd v12.16b, v25.16b, v30.16b
+ uaddl v24.8h, v25.8b, v30.8b
+ uaddl2 v25.8h, v25.16b, v30.16b
+
+// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21)
+ add v20.8h, v20.8h, v24.8h
+ add v21.8h, v21.8h, v25.8h
+ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5]
+
+// m3 = cur[mrefs3]; // m3 = v20
+ ldr q20, [x3, w7, sxtw]
+
+// p3 = cur[prefs3]; // p3 = v21
+ ldr q21, [x3, w13, sxtw]
+
+// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
+
+ ldr q29, [x3, w8, sxtw]
+ ldr q23, [x17, w8, sxtw]
+
+// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
+ uaddl v30.8h, v20.8b, v21.8b
+ uaddl2 v31.8h, v20.16b, v21.16b
+
+ ldr q28, [x3, w16, sxtw]
+ ldr q25, [x17, w16, sxtw]
+
+ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
+
+// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
+ uhadd v13.16b, v23.16b, v29.16b
+ uaddl v22.8h, v23.8b, v29.8b
+ uaddl2 v23.8h, v23.16b, v29.16b
+
+ ldr q31, [x3, w12, sxtw]
+ ldr q27, [x17, w12, sxtw]
+
+// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25
+ uaddl v24.8h, v25.8b, v28.8b
+ uaddl2 v25.8h, v25.16b, v28.16b
+
+// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25)
+ add v24.8h, v24.8h, v22.8h
+ add v25.8h, v25.8h, v23.8h
+ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4]
+
+// m1 = cur[mrefs]; // m1 = v24
+ ldr q24, [x3, w9, sxtw]
+
+// p5 = cur[prefs5]; // p5 = v25
+ ldr q25, [x3, w15, sxtw]
+
+// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
+// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
+// d2 = p2 >> 1; // d2 = v15
+ uabd v14.16b, v31.16b, v27.16b
+ uhadd v15.16b, v31.16b, v27.16b
+ uaddl v26.8h, v27.8b, v31.8b
+ uaddl2 v27.8h, v27.16b, v31.16b
+
+// j1 += coef_hf[0] * p2; // -
+ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2]
+
+// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
+
+// p1 = cur[prefs]; // p1 = v22
+ ldr q22, [x3, w11, sxtw]
+
+// j1 -= coef_lf[1] * 4 * (m1 + p5); // -
+ uaddl v26.8h, v24.8b, v25.8b
+ uaddl2 v27.8h, v24.16b, v25.16b
+ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1]
+
+// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16
+ uaddl v18.8h, v22.8b, v21.8b
+ uaddl2 v19.8h, v22.16b, v21.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v24.8b, v25.8b
+ uaddl2 v19.8h, v24.16b, v25.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v16, v28, v29, v30, v31, 13
+
+// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
+ uaddl v18.8h, v22.8b, v24.8b
+ uaddl2 v19.8h, v22.16b, v24.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v20.8b, v21.8b
+ uaddl2 v19.8h, v20.16b, v21.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v17, v28, v29, v30, v31, 13
+
+// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
+ uaddl v26.8h, v24.8b, v22.8b
+ uaddl2 v27.8h, v24.16b, v22.16b
+ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
+
+ ldr q31, [x2, w9, sxtw]
+ ldr q29, [x4, w9, sxtw]
+
+// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21
+ uaddl v26.8h, v21.8b, v22.8b
+ uaddl2 v27.8h, v21.16b, v22.16b
+ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0]
+
+ ldr q30, [x2, w11, sxtw]
+ ldr q28, [x4, w11, sxtw]
+
+// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
+ SQSHRUNN v2, v2, v3, v4, v5, 15
+
+// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9*
+ SQSHRUNN v3, v6, v7, v8, v9, 15
+
+// {
+// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+ uabd v30.16b, v22.16b, v30.16b
+ uabd v31.16b, v24.16b, v31.16b
+ uabd v28.16b, v22.16b, v28.16b
+ uabd v29.16b, v24.16b, v29.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+ ldr q27, [x2, w13, sxtw]
+ ldr q26, [x4, w13, sxtw]
+
+// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
+ ushr v18.16b, v11.16b, #1
+ umax v18.16b, v18.16b, v31.16b
+ umax v18.16b, v18.16b, v29.16b
+// } // v28, v30 preserved for next block
+// { // tdiff2 = v14
+// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
+// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
+ uabd v31.16b, v21.16b, v27.16b
+ uabd v29.16b, v21.16b, v26.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19
+ ushr v19.16b, v14.16b, #1
+ umax v19.16b, v19.16b, v31.16b
+ umax v19.16b, v19.16b, v29.16b
+// }
+
+ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
+ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
+
+ // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12
+ SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28
+
+ // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19
+ INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29
+
+// dst[d_stride * 2] = av_clip_uint8(interpol);
+ str q3, [x0, w5, sxtw]
+
+// dst[d_stride] = p1;
+ str q22, [x0, w1, sxtw]
+
+ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
+ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
+
+// dst[0] = av_clip_uint8(interpol);
+ str q2, [x0], #16
+// }
+//
+// dst++;
+// cur++;
+// prev++;
+// prev2++;
+// next++;
+// }
+ subs w10, w10, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x4, x4, #16
+ add x17, x17, #16
+ bgt 10b
+
+ POP_VREGS
+99:
+ ret
+endfunc
+
// ===========================================================================
//
// void filter_line(
From ef057ba172ece9646e1b4dd84a8f9587af28844f Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 24 Jul 2023 16:39:06 +0100
Subject: [PATCH 152/161] weak_link: Fix ref count init
(cherry picked from commit d6de45b15a0c96bfdc96bbc441963a60945e5eba)
---
libavcodec/weak_link.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
index f234a985b9..5a79e89ed7 100644
--- a/libavcodec/weak_link.c
+++ b/libavcodec/weak_link.c
@@ -19,6 +19,7 @@ struct ff_weak_link_master * ff_weak_link_new(void * p)
struct ff_weak_link_master * w = malloc(sizeof(*w));
if (!w)
return NULL;
+ atomic_init(&w->ref_count, 0);
w->ptr = p;
if (pthread_rwlock_init(&w->lock, NULL)) {
free(w);
From 506426ba08bc90e7efa712b549b29d8e1c231344 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Mon, 24 Jul 2023 17:28:06 +0100
Subject: [PATCH 153/161] v4l2_m2m: Check fd before attempting to close (fix
valgrind warn)
(cherry picked from commit befa42878d054d1fba53d5da14406faaae224daf)
---
libavcodec/v4l2_m2m.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index 28d9ed4988..238ceea235 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -340,8 +340,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
ff_v4l2_context_release(&s->output);
dmabufs_ctl_unref(&s->db_ctl);
- close(s->fd);
- s->fd = -1;
+
+ if (s->fd != -1) {
+ close(s->fd);
+ s->fd = -1;
+ }
s->self_ref = NULL;
// This is only called on avctx close so after this point we don't have that
From 4c58b8d414d6b84efad08c6e9920fe1f5c49a682 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 26 Jul 2023 16:29:39 +0000
Subject: [PATCH 154/161] v4l2_req_devscan: Fix udev leak
(cherry picked from commit 53b17ffd8a8890ef483163f3c9b0f96b437303f1)
---
libavcodec/v4l2_req_devscan.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c
index cfa94d55c4..ee8527ba1f 100644
--- a/libavcodec/v4l2_req_devscan.c
+++ b/libavcodec/v4l2_req_devscan.c
@@ -437,12 +437,14 @@ int devscan_build(void * const dc, struct devscan **pscan)
}
udev_enumerate_unref(enumerate);
+ udev_unref(udev);
*pscan = scan;
return 0;
fail:
- udev_unref(udev);
+ if (udev)
+ udev_unref(udev);
devscan_delete(&scan);
return ret;
}
From 175ef7dc323be400a0570c53725a8da4ac1fa213 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Wed, 26 Jul 2023 16:42:27 +0000
Subject: [PATCH 155/161] v4l2_m2m: Fix device_ref leak
(cherry picked from commit bfea15c07b4301cd1208981c8f221e5e3a598b34)
---
libavcodec/v4l2_m2m.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
index 238ceea235..add64b8e63 100644
--- a/libavcodec/v4l2_m2m.c
+++ b/libavcodec/v4l2_m2m.c
@@ -338,6 +338,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
}
ff_v4l2_context_release(&s->output);
+ av_buffer_unref(&s->device_ref);
dmabufs_ctl_unref(&s->db_ctl);
From e324b9d956d7dab58a5790efcfb8e46c95868ff4 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Fri, 28 Jul 2023 16:10:01 +0000
Subject: [PATCH 156/161] v4l2_m2m_dec: Avoid structure init warnings when
struct changes
(cherry picked from commit 8a836af420ed8c8dba90e2fd88691bcaa0668f8a)
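Aside, not part of the patch: the warnings avoided here are presumably the
missing-field/missing-brace initialiser kind that a partial brace pattern
such as {{NULL}} can trigger once the first member of the struct changes
shape, whereas memset() zero-initialises independently of the layout. A
reduced, hypothetical C sketch (the struct below is a stand-in, not the real
H264ParamSets):

    #include <string.h>

    struct param_sets {
        void *sps[32];    /* if this first member changes, {{NULL}} may warn */
        int   count;
    };

    void zero_init_examples(void)
    {
        struct param_sets a = {{NULL}};   /* brace pattern tied to the layout */
        struct param_sets b;
        memset(&b, 0, sizeof(b));         /* layout-independent zero-init */
        (void)a;
        (void)b;
    }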
---
libavcodec/v4l2_m2m_dec.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 11c83b2d66..584e0b8825 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -1004,11 +1004,13 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
#if CONFIG_H264_DECODER
case AV_CODEC_ID_H264:
{
- H264ParamSets ps = {{NULL}};
+ H264ParamSets ps;
int is_avc = 0;
int nal_length_size = 0;
int ret;
+ memset(&ps, 0, sizeof(ps));
+
ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
&ps, &is_avc, &nal_length_size,
avctx->err_recognition, avctx);
@@ -1034,12 +1036,15 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
#if CONFIG_HEVC_DECODER
case AV_CODEC_ID_HEVC:
{
- HEVCParamSets ps = {{NULL}};
- HEVCSEI sei = {{{{0}}}};
+ HEVCParamSets ps;
+ HEVCSEI sei;
int is_nalff = 0;
int nal_length_size = 0;
int ret;
+ memset(&ps, 0, sizeof(ps));
+ memset(&sei, 0, sizeof(sei));
+
ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
&ps, &sei, &is_nalff, &nal_length_size,
avctx->err_recognition, 0, avctx);
From ae9cd86b532ae2a3800981c23a0aeac15a83f918 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Sat, 22 Jul 2023 12:33:50 +0000
Subject: [PATCH 157/161] v4l2_m2m_dec: Avoid calling get_format if no V4L2
decoder device
Move the get_format callback to after the decoder device has been found.
This means that get_format will never be called if there is no chance
that init will succeed, which helps programs (such as VLC) that do
significant processing in that callback to avoid doing it needlessly.
It also means that the list of formats available can actually
represent reality.
(cherry picked from commit 3b27cb41d7df73c054452fa49269988d4df32409)
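For context, and purely as an illustration rather than code from this patch,
a caller-side get_format callback has the shape below; real clients such as
VLC do far heavier work inside it, which is what is now skipped when no V4L2
decoder device can be found. The selection policy here is invented for the
example.

    #include <libavcodec/avcodec.h>
    #include <libavutil/pixfmt.h>

    /* Hypothetical client callback: prefer zero-copy DRM PRIME frames if the
     * decoder offers them, otherwise take the first (most preferred) entry. */
    static enum AVPixelFormat pick_format(AVCodecContext *avctx,
                                          const enum AVPixelFormat *fmts)
    {
        for (const enum AVPixelFormat *p = fmts; *p != AV_PIX_FMT_NONE; p++) {
            if (*p == AV_PIX_FMT_DRM_PRIME)
                return *p;
        }
        return fmts[0];
    }

    /* Installed before avcodec_open2(): avctx->get_format = pick_format; */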
---
libavcodec/v4l2_context.c | 41 +++++++++++++
libavcodec/v4l2_context.h | 13 ++++
libavcodec/v4l2_m2m_dec.c | 122 ++++++++++++++++++++++++++++----------
3 files changed, 145 insertions(+), 31 deletions(-)
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 79a31cf930..978a487ca9 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -1064,6 +1064,47 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout)
return 0;
}
+// Return 0 terminated list of drm fourcc video formats for this context
+// NULL if none found or error
+// Returned list is malloced so must be freed
+uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN)
+{
+ unsigned int i;
+ unsigned int n = 0;
+ unsigned int size = 0;
+ uint32_t * e = NULL;
+    if (pN)
+        *pN = 0;
+
+ for (i = 0; i < 1024; ++i) {
+ struct v4l2_fmtdesc fdesc = {
+ .index = i,
+ .type = ctx->type
+ };
+
+ if (ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc))
+ return e;
+
+ if (n + 1 >= size) {
+ unsigned int newsize = (size == 0) ? 16 : size * 2;
+ uint32_t * t = av_realloc(e, newsize * sizeof(*t));
+ if (!t)
+ return e;
+ e = t;
+ size = newsize;
+ }
+
+ e[n] = fdesc.pixelformat;
+ e[++n] = 0;
+ if (pN)
+ *pN = n;
+ }
+
+ // If we've looped 1024 times we are clearly confused
+    if (pN)
+        *pN = 0;
+ av_free(e);
+ return NULL;
+}
+
int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
{
struct v4l2_format_update fmt = { 0 };
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
index 5afed3e6ec..f4240f7ddd 100644
--- a/libavcodec/v4l2_context.h
+++ b/libavcodec/v4l2_context.h
@@ -151,6 +151,19 @@ int ff_v4l2_context_set_format(V4L2Context* ctx);
*/
int ff_v4l2_context_get_format(V4L2Context* ctx, int probe);
+/**
+ * Get the list of drm fourcc pixel formats for this context
+ *
+ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context
+ * description for required variables.
+ * @param[in] pN A pointer to receive the number of formats
+ * found. May be NULL if not wanted.
+ * @return Pointer to malloced list of zero terminated formats,
+ * NULL if none or error. As list is malloced it must be
+ * freed.
+ */
+uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN);
+
/**
* Releases a V4L2Context.
*
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 584e0b8825..c4f38cc24e 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -873,10 +873,9 @@ check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
};
static int
-check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s, const uint32_t fcc)
{
unsigned int i;
- const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
const uint32_t w = avctx->coded_width;
const uint32_t h = avctx->coded_height;
@@ -1073,12 +1072,91 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
}
}
+static int
+choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s)
+{
+ const V4L2m2mPriv * const priv = avctx->priv_data;
+ unsigned int fmts_n;
+ uint32_t *fmts = ff_v4l2_context_enum_drm_formats(&s->capture, &fmts_n);
+ enum AVPixelFormat *fmts2 = NULL;
+ enum AVPixelFormat t;
+ enum AVPixelFormat gf_pix_fmt;
+ unsigned int i;
+ unsigned int n = 0;
+ unsigned int pref_n = 1;
+ int rv = AVERROR(ENOENT);
+
+ if (!fmts)
+ return AVERROR(ENOENT);
+
+ if ((fmts2 = av_malloc(sizeof(*fmts2) * (fmts_n + 2))) == NULL) {
+ rv = AVERROR(ENOMEM);
+ goto error;
+ }
+
+    // Filter for formats that are supported by ffmpeg and
+    // can accommodate the stream size
+ fmts2[n++] = AV_PIX_FMT_DRM_PRIME;
+ for (i = 0; i != fmts_n; ++i) {
+ const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO);
+ if (f == AV_PIX_FMT_NONE)
+ continue;
+
+ if (check_size(avctx, s, fmts[i]) != 0)
+ continue;
+
+ if (f == priv->pix_fmt)
+ pref_n = n;
+ fmts2[n++] = f;
+ }
+ fmts2[n] = AV_PIX_FMT_NONE;
+
+ if (n < 2) {
+ av_log(avctx, AV_LOG_DEBUG, "%s: No usable formats found\n", __func__);
+ goto error;
+ }
+
+ // Put preferred s/w format at the end - ff_get_format will put it in sw_pix_fmt
+ t = fmts2[n - 1];
+ fmts2[n - 1] = fmts2[pref_n];
+ fmts2[pref_n] = t;
+
+    gf_pix_fmt = ff_get_format(avctx, fmts2);
+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
+ avctx->coded_width, avctx->coded_height,
+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
+
+ if (gf_pix_fmt == AV_PIX_FMT_NONE)
+ goto error;
+
+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ s->capture.av_pix_fmt = avctx->sw_pix_fmt;
+ s->output_drm = 1;
+ }
+ else {
+ avctx->pix_fmt = gf_pix_fmt;
+ s->capture.av_pix_fmt = gf_pix_fmt;
+ s->output_drm = 0;
+ }
+
+ // Get format converts capture.av_pix_fmt back into a V4L2 format in the context
+ if ((rv = ff_v4l2_context_get_format(&s->capture, 0)) != 0)
+ goto error;
+ rv = ff_v4l2_context_set_format(&s->capture);
+
+error:
+ av_free(fmts2);
+ av_free(fmts);
+ return rv;
+}
+
static av_cold int v4l2_decode_init(AVCodecContext *avctx)
{
V4L2Context *capture, *output;
V4L2m2mContext *s;
V4L2m2mPriv *priv = avctx->priv_data;
- int gf_pix_fmt;
int ret;
av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
@@ -1122,28 +1200,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
capture->av_pix_fmt = avctx->pix_fmt;
capture->min_buf_size = 0;
- /* the client requests the codec to generate DRM frames:
- * - data[0] will therefore point to the returned AVDRMFrameDescriptor
- * check the ff_v4l2_buffer_to_avframe conversion function.
- * - the DRM frame format is passed in the DRM frame descriptor layer.
- * check the v4l2_get_drm_frame function.
- */
-
- avctx->sw_pix_fmt = avctx->pix_fmt;
- gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
- av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
- avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
- avctx->coded_width, avctx->coded_height,
- gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
-
- if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
- s->output_drm = 1;
- }
- else {
- capture->av_pix_fmt = gf_pix_fmt;
- s->output_drm = 0;
- }
+ capture->av_pix_fmt = AV_PIX_FMT_NONE;
+ s->output_drm = 0;
s->db_ctl = NULL;
if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) {
@@ -1185,19 +1243,21 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
return ret;
}
- if ((ret = v4l2_prepare_decoder(s)) < 0)
- return ret;
-
if ((ret = get_quirks(avctx, s)) != 0)
return ret;
- if ((ret = check_size(avctx, s)) != 0)
- return ret;
-
if ((ret = check_profile(avctx, s)) != 0) {
av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile);
return ret;
}
+
+ // Size check done as part of format filtering
+ if ((ret = choose_capture_format(avctx, s)) != 0)
+ return ret;
+
+ if ((ret = v4l2_prepare_decoder(s)) < 0)
+ return ret;
+
return 0;
}
From 24221a3f821043f1ce11dcd1682b986e44ae4513 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Aug 2023 06:26:35 +0000
Subject: [PATCH 158/161] v4l2_req_dmabufs: Fix crash on free if dmabuf
imported
Thanks to Ratchanan Srirattanamet for finding this
---
libavcodec/v4l2_req_dmabufs.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
index acc0366e76..017c3892a5 100644
--- a/libavcodec/v4l2_req_dmabufs.c
+++ b/libavcodec/v4l2_req_dmabufs.c
@@ -232,7 +232,8 @@ void dmabuf_free(struct dmabuf_h * dh)
request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
#endif
- dh->fns->buf_free(dh);
+ if (dh->fns != NULL && dh->fns->buf_free)
+ dh->fns->buf_free(dh);
if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
munmap(dh->mapptr, dh->size);
From 05ceac3f3a7b5a04cc50c769a489e819895ca345 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Aug 2023 06:34:47 +0000
Subject: [PATCH 159/161] aarch64/rgb2rgb_neon: Fix bgr24->yuv matrix read to
flip correct way
---
libswscale/aarch64/rgb2rgb_neon.S | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 476ca723a0..077d1dd593 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -92,15 +92,12 @@ endfunc
function ff_rgb24toyv12_aarch64, export=1
ldr x15, [sp, #8]
- ld1 {v3.s}[2], [x15], #4
- ld1 {v3.s}[1], [x15], #4
- ld1 {v3.s}[0], [x15], #4
- ld1 {v4.s}[2], [x15], #4
- ld1 {v4.s}[1], [x15], #4
- ld1 {v4.s}[0], [x15], #4
- ld1 {v5.s}[2], [x15], #4
- ld1 {v5.s}[1], [x15], #4
- ld1 {v5.s}[0], [x15]
+ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12
+ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12
+ ld3 {v3.s, v4.s, v5.s}[2], [x15]
+ mov v6.16b, v3.16b
+ mov v3.16b, v5.16b
+ mov v5.16b, v6.16b
b 99f
endfunc
From 2860e13304a1b051f06110d2002b3371266d9e7f Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Aug 2023 06:36:51 +0000
Subject: [PATCH 160/161] aarch64/rgb2rgb_neon: Add macros to make common code
explicit
---
libswscale/aarch64/rgb2rgb_neon.S | 276 ++++++++++--------------------
1 file changed, 95 insertions(+), 181 deletions(-)
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 077d1dd593..0956800b41 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -78,6 +78,67 @@ function ff_interleave_bytes_neon, export=1
ret
endfunc
+// Expand rgb2 into r0+r1/g0+g1/b0+b1
+.macro XRGB3Y r0, g0, b0, r1, g1, b1, r2, g2, b2
+ uxtl \r0\().8h, \r2\().8b
+ uxtl \g0\().8h, \g2\().8b
+ uxtl \b0\().8h, \b2\().8b
+
+ uxtl2 \r1\().8h, \r2\().16b
+ uxtl2 \g1\().8h, \g2\().16b
+ uxtl2 \b1\().8h, \b2\().16b
+.endm
+
+// Expand rgb2 into r0+r1/g0+g1/b0+b1
+// and pick every other el to put back into rgb2 for chroma
+.macro XRGB3YC r0, g0, b0, r1, g1, b1, r2, g2, b2
+ XRGB3Y \r0, \g0, \b0, \r1, \g1, \b1, \r2, \g2, \b2
+
+ bic \r2\().8h, #0xff, LSL #8
+ bic \g2\().8h, #0xff, LSL #8
+ bic \b2\().8h, #0xff, LSL #8
+.endm
+
+.macro SMLAL3 d0, d1, s0, s1, s2, c0, c1, c2
+ smull \d0\().4s, \s0\().4h, \c0
+ smlal \d0\().4s, \s1\().4h, \c1
+ smlal \d0\().4s, \s2\().4h, \c2
+ smull2 \d1\().4s, \s0\().8h, \c0
+ smlal2 \d1\().4s, \s1\().8h, \c1
+ smlal2 \d1\().4s, \s2\().8h, \c2
+.endm
+
+// d0 may be s0
+// s0, s2 corrupted
+.macro SHRN_Y d0, s0, s1, s2, s3, k128h
+ shrn \s0\().4h, \s0\().4s, #12
+ shrn2 \s0\().8h, \s1\().4s, #12
+ add \s0\().8h, \s0\().8h, \k128h\().8h // +128 (>> 3 = 16)
+ sqrshrun \d0\().8b, \s0\().8h, #3
+ shrn \s2\().4h, \s2\().4s, #12
+ shrn2 \s2\().8h, \s3\().4s, #12
+ add \s2\().8h, \s2\().8h, \k128h\().8h
+        sqrshrun2  \d0\().16b, \s2\().8h, #3
+.endm
+
+.macro SHRN_C d0, s0, s1, k128b
+ shrn \s0\().4h, \s0\().4s, #14
+ shrn2 \s0\().8h, \s1\().4s, #14
+ sqrshrn \s0\().8b, \s0\().8h, #1
+ add \d0\().8b, \s0\().8b, \k128b\().8b // +128
+.endm
+
+.macro STB2V s0, n, a
+ st1 {\s0\().b}[(\n+0)], [\a], #1
+ st1 {\s0\().b}[(\n+1)], [\a], #1
+.endm
+
+.macro STB4V s0, n, a
+ STB2V \s0, (\n+0), \a
+ STB2V \s0, (\n+2), \a
+.endm
+
+
// void ff_rgb24toyv12_aarch64(
// const uint8_t *src, // x0
// uint8_t *ydst, // x1
@@ -111,7 +172,7 @@ endfunc
// int lumStride, // w6
// int chromStride, // w7
// int srcStr, // [sp, #0]
-// int32_t *rgb2yuv); // [sp, #8]
+// int32_t *rgb2yuv); // [sp, #8] (including Mac)
// regs
// v0-2 Src bytes - reused as chroma src
@@ -130,13 +191,12 @@ endfunc
// v30 V out
// v31 V tmp
-// Assumes Little Endian in tail stores & conversion matrix
-
function ff_bgr24toyv12_aarch64, export=1
ldr x15, [sp, #8]
ld3 {v3.s, v4.s, v5.s}[0], [x15], #12
ld3 {v3.s, v4.s, v5.s}[1], [x15], #12
ld3 {v3.s, v4.s, v5.s}[2], [x15]
+
99:
ldr w14, [sp, #0]
movi v7.8b, #128
@@ -167,73 +227,29 @@ function ff_bgr24toyv12_aarch64, export=1
b.le 13f
10:
- uxtl v16.8h, v0.8b
- uxtl v17.8h, v1.8b
- uxtl v18.8h, v2.8b
-
- uxtl2 v20.8h, v0.16b
- uxtl2 v21.8h, v1.16b
- uxtl2 v22.8h, v2.16b
-
- bic v0.8h, #0xff, LSL #8
- bic v1.8h, #0xff, LSL #8
- bic v2.8h, #0xff, LSL #8
+ XRGB3YC v16, v17, v18, v20, v21, v22, v0, v1, v2
// Testing shows it is faster to stack the smull/smlal ops together
// rather than interleave them between channels and indeed even the
// shift/add sections seem happier not interleaved
// Y0
- smull v26.4s, v16.4h, v3.h[0]
- smlal v26.4s, v17.4h, v4.h[0]
- smlal v26.4s, v18.4h, v5.h[0]
- smull2 v27.4s, v16.8h, v3.h[0]
- smlal2 v27.4s, v17.8h, v4.h[0]
- smlal2 v27.4s, v18.8h, v5.h[0]
+ SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
// Y1
- smull v28.4s, v20.4h, v3.h[0]
- smlal v28.4s, v21.4h, v4.h[0]
- smlal v28.4s, v22.4h, v5.h[0]
- smull2 v29.4s, v20.8h, v3.h[0]
- smlal2 v29.4s, v21.8h, v4.h[0]
- smlal2 v29.4s, v22.8h, v5.h[0]
- shrn v26.4h, v26.4s, #12
- shrn2 v26.8h, v27.4s, #12
- add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- sqrshrun v26.8b, v26.8h, #3
- shrn v28.4h, v28.4s, #12
- shrn2 v28.8h, v29.4s, #12
- add v28.8h, v28.8h, v6.8h
- sqrshrun2 v26.16b, v28.8h, #3
- // Y0/Y1
+ SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
+ SHRN_Y v26, v26, v27, v28, v29, v6
// U
// Vector subscript *2 as we loaded into S but are only using H
- smull v24.4s, v0.4h, v3.h[2]
- smlal v24.4s, v1.4h, v4.h[2]
- smlal v24.4s, v2.4h, v5.h[2]
- smull2 v25.4s, v0.8h, v3.h[2]
- smlal2 v25.4s, v1.8h, v4.h[2]
- smlal2 v25.4s, v2.8h, v5.h[2]
+ SMLAL3 v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2]
// V
- smull v30.4s, v0.4h, v3.h[4]
- smlal v30.4s, v1.4h, v4.h[4]
- smlal v30.4s, v2.4h, v5.h[4]
- smull2 v31.4s, v0.8h, v3.h[4]
- smlal2 v31.4s, v1.8h, v4.h[4]
- smlal2 v31.4s, v2.8h, v5.h[4]
+ SMLAL3 v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4]
ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
- shrn v24.4h, v24.4s, #14
- shrn2 v24.8h, v25.4s, #14
- sqrshrn v24.8b, v24.8h, #1
- add v24.8b, v24.8b, v7.8b // +128
- shrn v30.4h, v30.4s, #14
- shrn2 v30.8h, v31.4s, #14
- sqrshrn v30.8b, v30.8h, #1
- add v30.8b, v30.8b, v7.8b // +128
+ SHRN_C v24, v24, v25, v7
+ SHRN_C v30, v30, v31, v7
subs w9, w9, #16
@@ -250,69 +266,21 @@ function ff_bgr24toyv12_aarch64, export=1
13:
// Body is simple copy of main loop body minus preload
- uxtl v16.8h, v0.8b
- uxtl v17.8h, v1.8b
- uxtl v18.8h, v2.8b
-
- uxtl2 v20.8h, v0.16b
- uxtl2 v21.8h, v1.16b
- uxtl2 v22.8h, v2.16b
-
- bic v0.8h, #0xff, LSL #8
- bic v1.8h, #0xff, LSL #8
- bic v2.8h, #0xff, LSL #8
-
+ XRGB3YC v16, v17, v18, v20, v21, v22, v0, v1, v2
// Y0
- smull v26.4s, v16.4h, v3.h[0]
- smlal v26.4s, v17.4h, v4.h[0]
- smlal v26.4s, v18.4h, v5.h[0]
- smull2 v27.4s, v16.8h, v3.h[0]
- smlal2 v27.4s, v17.8h, v4.h[0]
- smlal2 v27.4s, v18.8h, v5.h[0]
+ SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
// Y1
- smull v28.4s, v20.4h, v3.h[0]
- smlal v28.4s, v21.4h, v4.h[0]
- smlal v28.4s, v22.4h, v5.h[0]
- smull2 v29.4s, v20.8h, v3.h[0]
- smlal2 v29.4s, v21.8h, v4.h[0]
- smlal2 v29.4s, v22.8h, v5.h[0]
- shrn v26.4h, v26.4s, #12
- shrn2 v26.8h, v27.4s, #12
- add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- sqrshrun v26.8b, v26.8h, #3
- shrn v28.4h, v28.4s, #12
- shrn2 v28.8h, v29.4s, #12
- add v28.8h, v28.8h, v6.8h
- sqrshrun2 v26.16b, v28.8h, #3
- // Y0/Y1
-
+ SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
+ SHRN_Y v26, v26, v27, v28, v29, v6
// U
- // Vector subscript *2 as we loaded into S but are only using H
- smull v24.4s, v0.4h, v3.h[2]
- smlal v24.4s, v1.4h, v4.h[2]
- smlal v24.4s, v2.4h, v5.h[2]
- smull2 v25.4s, v0.8h, v3.h[2]
- smlal2 v25.4s, v1.8h, v4.h[2]
- smlal2 v25.4s, v2.8h, v5.h[2]
-
+ SMLAL3 v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2]
// V
- smull v30.4s, v0.4h, v3.h[4]
- smlal v30.4s, v1.4h, v4.h[4]
- smlal v30.4s, v2.4h, v5.h[4]
- smull2 v31.4s, v0.8h, v3.h[4]
- smlal2 v31.4s, v1.8h, v4.h[4]
- smlal2 v31.4s, v2.8h, v5.h[4]
+ SMLAL3 v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4]
cmp w9, #-16
- shrn v24.4h, v24.4s, #14
- shrn2 v24.8h, v25.4s, #14
- sqrshrn v24.8b, v24.8h, #1
- add v24.8b, v24.8b, v7.8b // +128
- shrn v30.4h, v30.4s, #14
- shrn2 v30.8h, v31.4s, #14
- sqrshrn v30.8b, v30.8h, #1
- add v30.8b, v30.8b, v7.8b // +128
+ SHRN_C v24, v24, v25, v7
+ SHRN_C v30, v30, v31, v7
// Here:
// w9 == 0 width % 16 == 0, tail done
@@ -347,14 +315,14 @@ function ff_bgr24toyv12_aarch64, export=1
2:
tbz w9, #3, 1f
st1 {v26.8b}, [x11], #8
- st1 {v24.s}[0], [x12], #4
- st1 {v30.s}[0], [x13], #4
+ STB4V v24, 0, x12
+ STB4V v30, 0, x13
1: tbz w9, #2, 1f
- st1 {v26.s}[2], [x11], #4
- st1 {v24.h}[2], [x12], #2
- st1 {v30.h}[2], [x13], #2
+        STB4V      v26, 8, x11
+ STB2V v24, 4, x12
+ STB2V v30, 4, x13
1: tbz w9, #1, 1f
- st1 {v26.h}[6], [x11], #2
+ STB2V v26, 12, x11
st1 {v24.b}[6], [x12], #1
st1 {v30.b}[6], [x13], #1
1: tbz w9, #0, 1f
@@ -381,44 +349,15 @@ function ff_bgr24toyv12_aarch64, export=1
b.le 13f
10:
- uxtl v16.8h, v0.8b
- uxtl v17.8h, v1.8b
- uxtl v18.8h, v2.8b
-
- uxtl2 v20.8h, v0.16b
- uxtl2 v21.8h, v1.16b
- uxtl2 v22.8h, v2.16b
-
- // Testing shows it is faster to stack the smull/smlal ops together
- // rather than interleave them between channels and indeed even the
- // shift/add sections seem happier not interleaved
-
+ XRGB3Y v16, v17, v18, v20, v21, v22, v0, v1, v2
// Y0
- smull v26.4s, v16.4h, v3.h[0]
- smlal v26.4s, v17.4h, v4.h[0]
- smlal v26.4s, v18.4h, v5.h[0]
- smull2 v27.4s, v16.8h, v3.h[0]
- smlal2 v27.4s, v17.8h, v4.h[0]
- smlal2 v27.4s, v18.8h, v5.h[0]
+ SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
// Y1
- smull v28.4s, v20.4h, v3.h[0]
- smlal v28.4s, v21.4h, v4.h[0]
- smlal v28.4s, v22.4h, v5.h[0]
- smull2 v29.4s, v20.8h, v3.h[0]
- smlal2 v29.4s, v21.8h, v4.h[0]
- smlal2 v29.4s, v22.8h, v5.h[0]
+ SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
- shrn v26.4h, v26.4s, #12
- shrn2 v26.8h, v27.4s, #12
- add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- sqrshrun v26.8b, v26.8h, #3
- shrn v28.4h, v28.4s, #12
- shrn2 v28.8h, v29.4s, #12
- add v28.8h, v28.8h, v6.8h
- sqrshrun2 v26.16b, v28.8h, #3
- // Y0/Y1
+ SHRN_Y v26, v26, v27, v28, v29, v6
subs w9, w9, #16
@@ -433,40 +372,15 @@ function ff_bgr24toyv12_aarch64, export=1
13:
// Body is simple copy of main loop body minus preload
- uxtl v16.8h, v0.8b
- uxtl v17.8h, v1.8b
- uxtl v18.8h, v2.8b
-
- uxtl2 v20.8h, v0.16b
- uxtl2 v21.8h, v1.16b
- uxtl2 v22.8h, v2.16b
-
+ XRGB3Y v16, v17, v18, v20, v21, v22, v0, v1, v2
// Y0
- smull v26.4s, v16.4h, v3.h[0]
- smlal v26.4s, v17.4h, v4.h[0]
- smlal v26.4s, v18.4h, v5.h[0]
- smull2 v27.4s, v16.8h, v3.h[0]
- smlal2 v27.4s, v17.8h, v4.h[0]
- smlal2 v27.4s, v18.8h, v5.h[0]
+ SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
// Y1
- smull v28.4s, v20.4h, v3.h[0]
- smlal v28.4s, v21.4h, v4.h[0]
- smlal v28.4s, v22.4h, v5.h[0]
- smull2 v29.4s, v20.8h, v3.h[0]
- smlal2 v29.4s, v21.8h, v4.h[0]
- smlal2 v29.4s, v22.8h, v5.h[0]
+ SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
cmp w9, #-16
- shrn v26.4h, v26.4s, #12
- shrn2 v26.8h, v27.4s, #12
- add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
- sqrshrun v26.8b, v26.8h, #3
- shrn v28.4h, v28.4s, #12
- shrn2 v28.8h, v29.4s, #12
- add v28.8h, v28.8h, v6.8h
- sqrshrun2 v26.16b, v28.8h, #3
- // Y0/Y1
+ SHRN_Y v26, v26, v27, v28, v29, v6
// Here:
// w9 == 0 width % 16 == 0, tail done
@@ -500,9 +414,9 @@ function ff_bgr24toyv12_aarch64, export=1
tbz w9, #3, 1f
st1 {v26.8b}, [x11], #8
1: tbz w9, #2, 1f
- st1 {v26.s}[2], [x11], #4
+ STB4V v26, 8, x11
1: tbz w9, #1, 1f
- st1 {v26.h}[6], [x11], #2
+ STB2V v26, 12, x11
1: tbz w9, #0, 1f
st1 {v26.b}[14], [x11]
1:
From d2aa73557f69d03094919873215a8aa9405f00d5 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Thu, 10 Aug 2023 08:11:21 +0000
Subject: [PATCH 161/161] v4l2_req_media: Fix dmabuf fd leak in MMAP mode
---
libavcodec/v4l2_req_media.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
index 1a9944774a..0394bb2b23 100644
--- a/libavcodec/v4l2_req_media.c
+++ b/libavcodec/v4l2_req_media.c
@@ -1205,8 +1205,10 @@ qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be,
.plane = i,
.flags = O_RDWR, // *** Arguably O_RDONLY would be fine
};
- if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) {
be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);
+ close(xbuf.fd); // dmabuf_import dups the fd so close this one
+ }
}
else {
be->dh[i] = dmabuf_import_mmap(