From d15b25304e3f784c4935406fe8e81597f045427f Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 11:27:39 +0100 Subject: [PATCH 001/157] Add sand pix fmts & conversion fns --- configure | 3 + libavutil/Makefile | 3 + libavutil/arm/Makefile | 1 + libavutil/arm/rpi_sand_neon.S | 768 ++++++++++++++++++++++++++++++++++ libavutil/arm/rpi_sand_neon.h | 99 +++++ libavutil/pixdesc.c | 44 ++ libavutil/pixfmt.h | 6 + libavutil/rpi_sand_fn_pw.h | 227 ++++++++++ libavutil/rpi_sand_fns.c | 353 ++++++++++++++++ libavutil/rpi_sand_fns.h | 183 ++++++++ 10 files changed, 1687 insertions(+) create mode 100644 libavutil/arm/rpi_sand_neon.S create mode 100644 libavutil/arm/rpi_sand_neon.h create mode 100644 libavutil/rpi_sand_fn_pw.h create mode 100644 libavutil/rpi_sand_fns.c create mode 100644 libavutil/rpi_sand_fns.h diff --git a/configure b/configure index 3cd3bdfb44..5a5ada2071 100755 --- a/configure +++ b/configure @@ -344,6 +344,7 @@ External library support: --enable-libvpl enable Intel oneVPL code via libvpl if libmfx is not used [no] --enable-libnpp enable Nvidia Performance Primitives-based code [no] --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] + --enable-sand enable sand video formats [rpi] --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] --disable-nvenc disable Nvidia video encoding code [autodetect] --enable-omx enable OpenMAX IL code [no] @@ -1930,6 +1931,7 @@ FEATURE_LIST=" omx_rpi runtime_cpudetect safe_bitstream_reader + sand shared small static @@ -2495,6 +2497,7 @@ CONFIG_EXTRA=" rtpdec rtpenc_chain rv34dsp + sand scene_sad sinewin snappy diff --git a/libavutil/Makefile b/libavutil/Makefile index dc9012f9a8..e33f5db099 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -73,6 +73,7 @@ HEADERS = adler32.h \ rational.h \ replaygain.h \ ripemd.h \ + rpi_sand_fns.h \ samplefmt.h \ sha.h \ sha512.h \ @@ -192,6 +193,7 @@ OBJS-$(CONFIG_MACOS_KPERF) += macos_kperf.o OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o OBJS-$(CONFIG_QSV) += hwcontext_qsv.o +OBJS-$(CONFIG_SAND) += rpi_sand_fns.o OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o @@ -212,6 +214,7 @@ SKIPHEADERS-$(CONFIG_D3D11VA) += hwcontext_d3d11va.h SKIPHEADERS-$(CONFIG_DXVA2) += hwcontext_dxva2.h SKIPHEADERS-$(CONFIG_QSV) += hwcontext_qsv.h SKIPHEADERS-$(CONFIG_OPENCL) += hwcontext_opencl.h +SKIPHEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h SKIPHEADERS-$(CONFIG_VAAPI) += hwcontext_vaapi.h SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.h SKIPHEADERS-$(CONFIG_VDPAU) += hwcontext_vdpau.h diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile index 5da44b0542..b74b7c4e2f 100644 --- a/libavutil/arm/Makefile +++ b/libavutil/arm/Makefile @@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ NEON-OBJS += arm/float_dsp_init_neon.o \ arm/float_dsp_neon.o \ + arm/rpi_sand_neon.o \ diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S new file mode 100644 index 0000000000..80890fe985 --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.S @@ -0,0 +1,768 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox +*/ + +#include "libavutil/arm/asm.S" + + +@ General notes: +@ Having done some timing on this in sand8->y8 (Pi4) +@ vst1 (680fps) is a bit faster than vstm (660fps) +@ vldm (680fps) is noticably faster than vld1 (480fps) +@ (or it might be that a mix is what is required) +@ +@ At least on a Pi4 it is no more expensive to have a single auto-inc register +@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted +@ the latter was better) +@ +@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless +@ the memory is uncached. +@ As these are Sand -> planar we can assume that src is going to be aligned but +@ it is possible that dest isn't (converting to .yuv or other packed format). +@ Luckily vst1 is faster than vstm :-) so all is well +@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 +@ .8 stores would let us do non-word aligned stores into uncached but it +@ probably isn't worth it. 
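+@
+@ For reference, the sand layout addressed by the functions in this file can
+@ be sketched in C as below (a hypothetical helper that mirrors
+@ av_rpi_sand_frame_off_y() in rpi_sand_fns.h; stride1 is the stripe width in
+@ bytes (128 here) and stride2 the stripe height in lines):
+@
+@   static inline unsigned int sand_off_y(unsigned int x, unsigned int y,
+@                                         unsigned int stride1, unsigned int stride2)
+@   {
+@       const unsigned int x1 = x & (stride1 - 1);   // byte offset within the stripe
+@       const unsigned int x2 = x & ~(stride1 - 1);  // start of the stripe holding x
+@       return x1 + y * stride1 + x2 * stride2;      // byte offset from the plane base
+@   }
+@
+@ The lines_to_planar functions below are entered on a stripe boundary
+@ (x1 == 0), so within a stripe they step src by stride1 per line and within a
+@ line they step by stride1 * stride2 to reach the same line of the next stripe.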
+ + + + +@ void ff_rpi_sand128b_stripe_to_8_10( +@ uint8_t * dest, // [r0] +@ const uint8_t * src1, // [r1] +@ const uint8_t * src2, // [r2] +@ unsigned int lines); // [r3] + +.macro stripe2_to_8, bit_depth + vpush {q4-q7} +1: + vldm r1!, {q0-q7} + subs r3, #1 + vldm r2!, {q8-q15} + vqrshrn.u16 d0, q0, #\bit_depth - 8 + vqrshrn.u16 d1, q1, #\bit_depth - 8 + vqrshrn.u16 d2, q2, #\bit_depth - 8 + vqrshrn.u16 d3, q3, #\bit_depth - 8 + vqrshrn.u16 d4, q4, #\bit_depth - 8 + vqrshrn.u16 d5, q5, #\bit_depth - 8 + vqrshrn.u16 d6, q6, #\bit_depth - 8 + vqrshrn.u16 d7, q7, #\bit_depth - 8 + vqrshrn.u16 d8, q8, #\bit_depth - 8 + vqrshrn.u16 d9, q9, #\bit_depth - 8 + vqrshrn.u16 d10, q10, #\bit_depth - 8 + vqrshrn.u16 d11, q11, #\bit_depth - 8 + vqrshrn.u16 d12, q12, #\bit_depth - 8 + vqrshrn.u16 d13, q13, #\bit_depth - 8 + vqrshrn.u16 d14, q14, #\bit_depth - 8 + vqrshrn.u16 d15, q15, #\bit_depth - 8 + vstm r0!, {q0-q7} + bne 1b + vpop {q4-q7} + bx lr +.endm + +function ff_rpi_sand128b_stripe_to_8_10, export=1 + stripe2_to_8 10 +endfunc + +@ void ff_rpi_sand8_lines_to_planar_y8( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. However it does respect the dest size for writing + +function ff_rpi_sand8_lines_to_planar_y8, export=1 + push {r4-r8, lr} @ +24 L + ldr r3, [sp, #24] + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + lsl r3, #7 + sub r1, r6 + add r8, r2, r7, lsl #7 + ldr r7, [sp, #40] + +10: + mov r2, r8 + add r4, r0, #24 + mov r5, r6 + mov lr, #0 +1: + vldm r2, {q8-q15} + add r2, r3 + subs r5, #128 + blt 2f + vst1.8 {d16, d17, d18, d19}, [r0]! + vst1.8 {d20, d21, d22, d23}, [r0]! + vst1.8 {d24, d25, d26, d27}, [r0]! + vst1.8 {d28, d29, d30, d31}, [r0]! + bne 1b +11: + subs r7, #1 + add r0, r1 + add r8, #128 + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #64-128 + blt 1f + vst1.8 {d16, d17, d18, d19}, [r0]! + vst1.8 {d20, d21, d22, d23}, [r0]! + beq 11b + vmov q8, q12 + vmov q9, q13 + sub r5, #64 + vmov q10, q14 + vmov q11, q15 +1: + cmp r5, #32-128 + blt 1f + vst1.8 {d16, d17, d18, d19}, [r0]! + beq 11b + vmov q8, q10 + sub r5, #32 + vmov q9, q11 +1: + cmp r5, #16-128 + blt 1f + vst1.8 {d16, d17}, [r0]! + beq 11b + sub r5, #16 + vmov q8, q9 +1: + cmp r5, #8-128 + blt 1f + vst1.8 {d16}, [r0]! + beq 11b + sub r5, #8 + vmov d16, d17 +1: + cmp r5, #4-128 + blt 1f + vst1.32 {d16[0]}, [r0]! + beq 11b + sub r5, #4 + vshr.u64 d16, #32 +1: + cmp r5, #2-128 + blt 1f + vst1.16 {d16[0]}, [r0]! + beq 11b + vst1.8 {d16[2]}, [r0]! + b 11b +1: + vst1.8 {d16[0]}, [r0]! + b 11b +endfunc + +@ void ff_rpi_sand8_lines_to_planar_c8( +@ uint8_t * dst_u, // [r0] +@ unsigned int dst_stride_u, // [r1] +@ uint8_t * dst_v, // [r2] +@ unsigned int dst_stride_v, // [r3] +@ const uint8_t * src, // [sp, #0] -> r4, r5 +@ unsigned int stride1, // [sp, #4] 128 +@ unsigned int stride2, // [sp, #8] -> r8 +@ unsigned int _x, // [sp, #12] 0 +@ unsigned int y, // [sp, #16] (r7 in prefix) +@ unsigned int _w, // [sp, #20] -> r12, r6 +@ unsigned int h); // [sp, #24] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. 
However it does respect the dest size for writing + +function ff_rpi_sand8_lines_to_planar_c8, export=1 + push {r4-r8, lr} @ +24 + + ldr r5, [sp, #24] + ldr r8, [sp, #32] + ldr r7, [sp, #40] + ldr r6, [sp, #44] + lsl r8, #7 + add r5, r5, r7, lsl #7 + sub r1, r1, r6 + sub r3, r3, r6 + ldr r7, [sp, #48] + vpush {q4-q7} + +10: + mov r4, r5 + mov r12, r6 +1: + subs r12, #64 + vldm r4, {q0-q7} + add r4, r8 + it gt + vldmgt r4, {q8-q15} + add r4, r8 + + vuzp.8 q0, q1 + vuzp.8 q2, q3 + vuzp.8 q4, q5 + vuzp.8 q6, q7 + + vuzp.8 q8, q9 + vuzp.8 q10, q11 + vuzp.8 q12, q13 + vuzp.8 q14, q15 + subs r12, #64 + + @ Rearrange regs so we can use vst1 with 4 regs + vswp q1, q2 + vswp q5, q6 + vswp q9, q10 + vswp q13, q14 + blt 2f + + vst1.8 {d0, d1, d2, d3 }, [r0]! + vst1.8 {d8, d9, d10, d11}, [r0]! + vst1.8 {d16, d17, d18, d19}, [r0]! + vst1.8 {d24, d25, d26, d27}, [r0]! + + vst1.8 {d4, d5, d6, d7 }, [r2]! + vst1.8 {d12, d13, d14, d15}, [r2]! + vst1.8 {d20, d21, d22, d23}, [r2]! + vst1.8 {d28, d29, d30, d31}, [r2]! + bne 1b +11: + subs r7, #1 + add r5, #128 + add r0, r1 + add r2, r3 + bne 10b + vpop {q4-q7} + pop {r4-r8,pc} + +2: + cmp r12, #64-128 + blt 1f + vst1.8 {d0, d1, d2, d3 }, [r0]! + vst1.8 {d8, d9, d10, d11}, [r0]! + vst1.8 {d4, d5, d6, d7 }, [r2]! + vst1.8 {d12, d13, d14, d15}, [r2]! + beq 11b + sub r12, #64 + vmov q0, q8 + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 + vmov q4, q12 + vmov q5, q13 + vmov q6, q14 + vmov q7, q15 +1: + cmp r12, #32-128 + blt 1f + vst1.8 {d0, d1, d2, d3 }, [r0]! + vst1.8 {d4, d5, d6, d7 }, [r2]! + beq 11b + sub r12, #32 + vmov q0, q4 + vmov q1, q5 + vmov q2, q6 + vmov q3, q7 +1: + cmp r12, #16-128 + blt 1f + vst1.8 {d0, d1 }, [r0]! + vst1.8 {d4, d5 }, [r2]! + beq 11b + sub r12, #16 + vmov q0, q1 + vmov q2, q3 +1: + cmp r12, #8-128 + blt 1f + vst1.8 {d0}, [r0]! + vst1.8 {d4}, [r2]! + beq 11b + sub r12, #8 + vmov d0, d1 + vmov d4, d5 +1: + cmp r12, #4-128 + blt 1f + vst1.32 {d0[0]}, [r0]! + vst1.32 {d4[0]}, [r2]! + beq 11b + sub r12, #4 + vmov s0, s1 + vmov s8, s9 +1: + cmp r12, #2-128 + blt 1f + vst1.16 {d0[0]}, [r0]! + vst1.16 {d4[0]}, [r2]! + beq 11b + vst1.8 {d0[2]}, [r0]! + vst1.8 {d4[2]}, [r2]! + b 11b +1: + vst1.8 {d0[0]}, [r0]! + vst1.8 {d4[0]}, [r2]! + b 11b +endfunc + + + +@ void ff_rpi_sand30_lines_to_planar_y16( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. However it does respect the dest size for writing + +function ff_rpi_sand30_lines_to_planar_y16, export=1 + push {r4-r8, lr} @ +24 + ldr r3, [sp, #24] + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 + vmov.u16 q15, #0x3ff + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 + add r8, r2, r7, lsl #7 + ldr r7, [sp, #40] + +10: + mov r2, r8 + add r4, r0, #24 + mov r5, r6 + mov lr, #0 +1: + vldm r2!, {q10-q13} + add lr, #64 + + vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! 
+ ands lr, #127 + vshrn.u32 d2, q10, #10 + vmovn.u32 d0, q10 + vmovn.u32 d4, q14 + + vshr.u32 q14, q11, #20 + it eq + addeq r2, r3 + vshrn.u32 d3, q11, #10 + vmovn.u32 d1, q11 + vmovn.u32 d5, q14 + + subs r5, #48 + vand q0, q15 + vand q1, q15 + vand q2, q15 + + vshr.u32 q14, q12, #20 + vshrn.u32 d18, q12, #10 + vmovn.u32 d16, q12 + vmovn.u32 d20, q14 + + vshr.u32 q14, q13, #20 + vshrn.u32 d19, q13, #10 + vmovn.u32 d17, q13 + vmovn.u32 d21, q14 + + vand q8, q15 + vand q9, q15 + vand q10, q15 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4], r12 + vst3.16 {d16, d18, d20}, [r0], r12 + vst3.16 {d17, d19, d21}, [r4], r12 + + bne 1b + +11: + subs r7, #1 + add r0, r1 + add r8, #128 + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #24-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4] + beq 11b + vmov q0, q8 + sub r5, #24 + vmov q1, q9 + vmov q2, q10 +1: + cmp r5, #12-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0]! + beq 11b + vmov d0, d1 + sub r5, #12 + vmov d2, d3 + vmov d4, d5 +1: + cmp r5, #6-48 + add r4, r0, #6 @ avoid [r0]! on sequential instructions + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0] + vst3.16 {d0[1], d2[1], d4[1]}, [r4] + add r0, #12 + beq 11b + vmov s0, s1 + sub r5, #6 + vmov s4, s5 + vmov s8, s9 +1: + cmp r5, #3-48 + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0]! + beq 11b + sub r5, #3 + vshr.u32 d0, #16 + vshr.u32 d2, #16 +1: + cmp r5, #2-48 + blt 1f + vst2.16 {d0[0], d2[0]}, [r0]! + b 11b +1: + vst1.16 {d0[0]}, [r0]! + b 11b + +endfunc + + +@ void ff_rpi_sand30_lines_to_planar_c16( +@ uint8_t * dst_u, // [r0] +@ unsigned int dst_stride_u, // [r1] +@ uint8_t * dst_v, // [r2] +@ unsigned int dst_stride_v, // [r3] +@ const uint8_t * src, // [sp, #0] -> r4, r5 +@ unsigned int stride1, // [sp, #4] 128 +@ unsigned int stride2, // [sp, #8] -> r8 +@ unsigned int _x, // [sp, #12] 0 +@ unsigned int y, // [sp, #16] (r7 in prefix) +@ unsigned int _w, // [sp, #20] -> r6, r9 +@ unsigned int h); // [sp, #24] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. However it does respect the dest size for writing + +function ff_rpi_sand30_lines_to_planar_c16, export=1 + push {r4-r10, lr} @ +32 + ldr r5, [sp, #32] + ldr r8, [sp, #40] + ldr r7, [sp, #48] + ldr r9, [sp, #52] + mov r12, #48 + vmov.u16 q15, #0x3ff + sub r8, #1 + lsl r8, #7 + add r5, r5, r7, lsl #7 + sub r1, r1, r9, lsl #1 + sub r3, r3, r9, lsl #1 + ldr r7, [sp, #56] +10: + mov lr, #0 + mov r4, r5 + mov r6, r9 +1: + vldm r4!, {q0-q3} + add lr, #64 + + @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 + vshr.u32 q14, q0, #20 + vshrn.u32 d16, q0, #10 + vmovn.u32 d18, q0 + ands lr, #127 + vmovn.u32 d20, q14 + + vshr.u32 q14, q1, #20 + vshrn.u32 d17, q1, #10 + vmovn.u32 d19, q1 + vmovn.u32 d21, q14 + + vshr.u32 q14, q2, #20 + vshrn.u32 d22, q2, #10 + vmovn.u32 d24, q2 + vmovn.u32 d26, q14 + + vshr.u32 q14, q3, #20 + vshrn.u32 d23, q3, #10 + vmovn.u32 d25, q3 + add r10, r0, #24 + vmovn.u32 d27, q14 + + it eq + addeq r4, r8 + vuzp.16 q8, q11 + vuzp.16 q9, q12 + vuzp.16 q10, q13 + + @ q8 V0, V3,.. -> q0 + @ q9 U0, U3... + @ q10 U1, U4... + @ q11 U2, U5,.. + @ q12 V1, V4,.. -> q1 + @ q13 V2, V5,.. 
-> q2 + + subs r6, #24 + vand q11, q15 + vand q9, q15 + vand q10, q15 + vand q0, q8, q15 + vand q1, q12, q15 + vand q2, q13, q15 + + blt 2f + + vst3.16 {d18, d20, d22}, [r0], r12 + vst3.16 {d19, d21, d23}, [r10] + add r10, r2, #24 + vst3.16 {d0, d2, d4}, [r2], r12 + vst3.16 {d1, d3, d5}, [r10] + + bne 1b + +11: + subs r7, #1 + add r5, #128 + add r0, r1 + add r2, r3 + bne 10b + + pop {r4-r10, pc} + +@ Partial final write +2: + cmp r6, #-12 + blt 1f + vst3.16 {d18, d20, d22}, [r0]! + vst3.16 {d0, d2, d4}, [r2]! + beq 11b + vmov d18, d19 + vmov d20, d21 + vmov d22, d23 + sub r6, #12 + vmov d0, d1 + vmov d2, d3 + vmov d4, d5 +1: + cmp r6, #-18 + @ Rezip here as it makes the remaining tail handling easier + vzip.16 d0, d18 + vzip.16 d2, d20 + vzip.16 d4, d22 + blt 1f + vst3.16 {d0[1], d2[1], d4[1]}, [r0]! + vst3.16 {d0[0], d2[0], d4[0]}, [r2]! + vst3.16 {d0[3], d2[3], d4[3]}, [r0]! + vst3.16 {d0[2], d2[2], d4[2]}, [r2]! + beq 11b + vmov d0, d18 + vmov d2, d20 + sub r6, #6 + vmov d4, d22 +1: + cmp r6, #-21 + blt 1f + vst3.16 {d0[1], d2[1], d4[1]}, [r0]! + vst3.16 {d0[0], d2[0], d4[0]}, [r2]! + beq 11b + vmov s4, s5 + sub r6, #3 + vmov s0, s1 +1: + cmp r6, #-22 + blt 1f + vst2.16 {d0[1], d2[1]}, [r0]! + vst2.16 {d0[0], d2[0]}, [r2]! + b 11b +1: + vst1.16 {d0[1]}, [r0]! + vst1.16 {d0[0]}, [r2]! + b 11b + +endfunc + +@ void ff_rpi_sand30_lines_to_planar_p010( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. However it does respect the dest size for writing + +function ff_rpi_sand30_lines_to_planar_p010, export=1 + push {r4-r8, lr} @ +24 + ldr r3, [sp, #24] + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 + vmov.u16 q15, #0xffc0 + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 + add r8, r2, r7, lsl #7 + ldr r7, [sp, #40] + +10: + mov r2, r8 + add r4, r0, #24 + mov r5, r6 + mov lr, #0 +1: + vldm r2!, {q10-q13} + add lr, #64 + + vshl.u32 q14, q10, #6 + ands lr, #127 + vshrn.u32 d4, q10, #14 + vshrn.u32 d2, q10, #4 + vmovn.u32 d0, q14 + + vshl.u32 q14, q11, #6 + it eq + addeq r2, r3 + vshrn.u32 d5, q11, #14 + vshrn.u32 d3, q11, #4 + vmovn.u32 d1, q14 + + subs r5, #48 + vand q2, q15 + vand q1, q15 + vand q0, q15 + + vshl.u32 q14, q12, #6 + vshrn.u32 d20, q12, #14 + vshrn.u32 d18, q12, #4 + vmovn.u32 d16, q14 + + vshl.u32 q14, q13, #6 + vshrn.u32 d21, q13, #14 + vshrn.u32 d19, q13, #4 + vmovn.u32 d17, q14 + + vand q10, q15 + vand q9, q15 + vand q8, q15 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4], r12 + vst3.16 {d16, d18, d20}, [r0], r12 + vst3.16 {d17, d19, d21}, [r4], r12 + + bne 1b + +11: + subs r7, #1 + add r0, r1 + add r8, #128 + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #24-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4] + beq 11b + vmov q0, q8 + sub r5, #24 + vmov q1, q9 + vmov q2, q10 +1: + cmp r5, #12-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0]! + beq 11b + vmov d0, d1 + sub r5, #12 + vmov d2, d3 + vmov d4, d5 +1: + cmp r5, #6-48 + add r4, r0, #6 @ avoid [r0]! 
on sequential instructions + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0] + vst3.16 {d0[1], d2[1], d4[1]}, [r4] + add r0, #12 + beq 11b + vmov s0, s1 + sub r5, #6 + vmov s4, s5 + vmov s8, s9 +1: + cmp r5, #3-48 + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0]! + beq 11b + sub r5, #3 + vshr.u32 d0, #16 + vshr.u32 d2, #16 +1: + cmp r5, #2-48 + blt 1f + vst2.16 {d0[0], d2[0]}, [r0]! + b 11b +1: + vst1.16 {d0[0]}, [r0]! + b 11b + +endfunc + + + diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h new file mode 100644 index 0000000000..447f367bea --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.h @@ -0,0 +1,99 @@ +/* +Copyright (c) 2020 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +#ifndef AVUTIL_ARM_SAND_NEON_H +#define AVUTIL_ARM_SAND_NEON_H + +void ff_rpi_sand128b_stripe_to_8_10( + uint8_t * dest, // [r0] + const uint8_t * src1, // [r1] + const uint8_t * src2, // [r2] + unsigned int lines); // [r3] + +void ff_rpi_sand8_lines_to_planar_y8( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + +void ff_rpi_sand8_lines_to_planar_c8( + uint8_t * dst_u, // [r0] + unsigned int dst_stride_u, // [r1] + uint8_t * dst_v, // [r2] + unsigned int dst_stride_v, // [r3] + const uint8_t * src, // [sp, #0] -> r4, r5 + unsigned int stride1, // [sp, #4] 128 + unsigned int stride2, // [sp, #8] -> r8 + unsigned int _x, // [sp, #12] 0 + unsigned int y, // [sp, #16] (r7 in prefix) + unsigned int _w, // [sp, #20] -> r12, r6 + unsigned int h); // [sp, #24] -> r7 + +void ff_rpi_sand30_lines_to_planar_y16( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + +void ff_rpi_sand30_lines_to_planar_c16( + uint8_t * dst_u, // [r0] + unsigned int dst_stride_u, // [r1] + uint8_t * dst_v, // [r2] + unsigned int dst_stride_v, // [r3] + const uint8_t * src, // [sp, #0] -> r4, r5 + unsigned int stride1, // [sp, #4] 128 + unsigned int stride2, // [sp, #8] -> r8 + unsigned int _x, // [sp, #12] 0 + unsigned int y, // [sp, #16] (r7 in prefix) + unsigned int _w, // [sp, #20] -> r6, r9 + unsigned int h); // [sp, #24] -> r7 + +void ff_rpi_sand30_lines_to_planar_p010( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + +#endif // AVUTIL_ARM_SAND_NEON_H + diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c index 62a2ae08d9..cb73521ea7 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c @@ -2717,6 +2717,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_FLOAT | AV_PIX_FMT_FLAG_ALPHA, }, + [AV_PIX_FMT_SAND128] = { + .name = "sand128", + .nb_components = 3, + .log2_chroma_w = 1, + .log2_chroma_h = 1, + .comp = { + { 0, 1, 0, 0, 8 }, /* Y */ + { 1, 2, 0, 0, 8 }, /* U */ + { 1, 2, 1, 0, 8 }, /* V */ + }, + .flags = 0, + }, + [AV_PIX_FMT_SAND64_10] = { + .name = "sand64_10", + .nb_components = 3, + .log2_chroma_w = 1, + .log2_chroma_h = 1, + .comp = { + { 0, 2, 0, 0, 10 }, /* Y */ + { 1, 4, 0, 0, 10 }, /* U */ + { 1, 4, 2, 0, 10 }, /* V */ + }, + .flags = 0, + }, + [AV_PIX_FMT_SAND64_16] = { + .name = "sand64_16", + .nb_components = 3, + .log2_chroma_w = 1, + .log2_chroma_h = 1, + .comp = { + { 0, 2, 0, 0, 16 }, /* Y */ + { 1, 4, 0, 0, 16 }, /* U */ + { 1, 4, 2, 0, 16 }, /* V */ + }, + .flags = 0, + }, + [AV_PIX_FMT_RPI4_8] = { + .name = "rpi4_8", + .flags = 
AV_PIX_FMT_FLAG_HWACCEL, + }, + [AV_PIX_FMT_RPI4_10] = { + .name = "rpi4_10", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, }; static const char * const color_range_names[] = { diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h index 37c2c79e01..22f70007c3 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -377,6 +377,12 @@ enum AVPixelFormat { AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian +// RPI - not on ifdef so can be got at by calling progs + AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_RPI4_8, + AV_PIX_FMT_RPI4_10, AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h new file mode 100644 index 0000000000..0324f6826d --- /dev/null +++ b/libavutil/rpi_sand_fn_pw.h @@ -0,0 +1,227 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +// * Included twice from rpi_sand_fn with different PW + +#define STRCAT(x,y) x##y + +#if PW == 1 +#define pixel uint8_t +#define FUNC(f) STRCAT(f, 8) +#elif PW == 2 +#define pixel uint16_t +#define FUNC(f) STRCAT(f, 16) +#else +#error Unexpected PW +#endif + +// Fetches a single patch - offscreen fixup not done here +// w <= stride1 +// unclipped +void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x = _x; + const unsigned int w = _w; + const unsigned int mask = stride1 - 1; + +#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) + if (_x == 0) { + ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, + src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if ((x & ~mask) == ((x + w) & ~mask)) { + // All in one sand stripe + const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { + memcpy(dst, p, w); + } + } + else + { + // Two+ stripe + const unsigned int sstride = stride1 * stride2; + const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + const uint8_t * p2 = p1 + sstride - (x & mask); + const unsigned int w1 = stride1 - (x & mask); + const unsigned int w3 = (x + w) & mask; + const unsigned int w2 = w - (w1 + w3); + + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { + unsigned int j; + const uint8_t * p = p2; + uint8_t * d = dst; + memcpy(d, p1, w1); + d += w1; + for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { + memcpy(d, p, stride1); + } + memcpy(d, p, w3); + } + } +} + +// x & w in bytes but not of interleave (i.e. 
offset = x*2 for U&V) + +void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x = _x * 2; + const unsigned int w = _w * 2; + const unsigned int mask = stride1 - 1; + +#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) + if (_x == 0) { + ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if ((x & ~mask) == ((x + w) & ~mask)) { + // All in one sand stripe + const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { + pixel * du = (pixel *)dst_u; + pixel * dv = (pixel *)dst_v; + const pixel * p = (const pixel *)p1; + for (unsigned int k = 0; k < w; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + } + } + else + { + // Two+ stripe + const unsigned int sstride = stride1 * stride2; + const unsigned int sstride_p = (sstride - stride1) / PW; + + const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + const uint8_t * p2 = p1 + sstride - (x & mask); + const unsigned int w1 = stride1 - (x & mask); + const unsigned int w3 = (x + w) & mask; + const unsigned int w2 = w - (w1 + w3); + + for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { + unsigned int j; + const pixel * p = (const pixel *)p1; + pixel * du = (pixel *)dst_u; + pixel * dv = (pixel *)dst_v; + for (unsigned int k = 0; k < w1; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { + for (unsigned int k = 0; k < stride1; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + } + for (unsigned int k = 0; k < w3; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + } + } +} + +void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, + unsigned int stride1, unsigned int stride2, + const uint8_t * src_u, const unsigned int src_stride_u, + const uint8_t * src_v, const unsigned int src_stride_v, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x = _x * 2; + const unsigned int w = _w * 2; + const unsigned int mask = stride1 - 1; + if ((x & ~mask) == ((x + w) & ~mask)) { + // All in one sand stripe + uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; + for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { + const pixel * su = (const pixel *)src_u; + const pixel * sv = (const pixel *)src_v; + pixel * p = (pixel *)p1; + for (unsigned int k = 0; k < w; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + } + } + else + { + // Two+ stripe + const unsigned int sstride = stride1 * stride2; + const unsigned int sstride_p = (sstride - stride1) / PW; + + const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; + const uint8_t * p2 = p1 + sstride - (x & mask); + const unsigned int w1 = stride1 - (x & mask); + const unsigned int w3 = (x + w) & mask; + const unsigned int w2 = w - (w1 + w3); + + for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { + unsigned int j; + const pixel * su = (const pixel *)src_u; + const pixel * sv = (const pixel *)src_v; + pixel * p = 
(pixel *)p1; + for (unsigned int k = 0; k < w1; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { + for (unsigned int k = 0; k < stride1; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + } + for (unsigned int k = 0; k < w3; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + } + } +} + + +#undef pixel +#undef STRCAT +#undef FUNC + diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 index 0000000000..ed0261b02f --- /dev/null +++ b/libavutil/rpi_sand_fns.c @@ -0,0 +1,353 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +#include "config.h" +#include +#include +#include "rpi_sand_fns.h" +#include "avassert.h" +#include "frame.h" + +#if ARCH_ARM && HAVE_NEON +#include "arm/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#else +#define HAVE_SAND_ASM 0 +#endif + +#define PW 1 +#include "rpi_sand_fn_pw.h" +#undef PW + +#define PW 2 +#include "rpi_sand_fn_pw.h" +#undef PW + +#if 1 +// Simple round +static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +{ + const unsigned int rnd = (1 << shr) >> 1; + const uint16_t * src = (const uint16_t *)_src; + + for (; n != 0; --n) { + *dst++ = (*src++ + rnd) >> shr; + } +} +#else +// Dithered variation +static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +{ + unsigned int rnd = (1 << shr) >> 1; + const unsigned int mask = ((1 << shr) - 1); + const uint16_t * src = (const uint16_t *)_src; + + for (; n != 0; --n) { + rnd = *src++ + (rnd & mask); + *dst++ = rnd >> shr; + } +} +#endif + +// Fetches a single patch - offscreen fixup not done here +// w <= stride1 +// unclipped +// _x & _w in pixels, strides in bytes +void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word + const unsigned int xskip0 = _x - (x0 >> 2) * 3; + const unsigned int x1 = ((_x + _w) / 3) * 4; + const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; + const unsigned int mask = stride1 - 1; + const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if (x0 == x1) { + // ******************* + // Partial single word xfer + return; + } + + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) + { + unsigned int x = x0; + const uint32_t * p = (const uint32_t *)p0; + uint16_t * d = (uint16_t *)dst; + + if (xskip0 != 0) { + const uint32_t p3 = *p++; + + if (xskip0 == 1) + *d++ = (p3 >> 10) & 0x3ff; + *d++ = (p3 >> 20) & 0x3ff; + + if (((x += 4) & mask) == 0) + p += slice_inc; + } + + while (x != x1) { + const uint32_t p3 = *p++; + *d++ = p3 & 0x3ff; + *d++ = (p3 >> 10) & 0x3ff; + *d++ = (p3 >> 20) & 0x3ff; + + if (((x += 4) & mask) == 0) + p += slice_inc; + } + + if (xrem1 != 0) { + const uint32_t p3 = *p; + + *d++ = p3 & 0x3ff; + if (xrem1 == 2) + *d++ = (p3 >> 10) & 0x3ff; + } + } +} + + +void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word + const unsigned int xskip0 = _x - (x0 >> 3) * 3; + const unsigned int x1 = ((_x + _w) / 3) * 8; + const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; + const unsigned int mask = stride1 - 1; + const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM + if (_x == 0) { + 
ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if (x0 == x1) { + // ******************* + // Partial single word xfer + return; + } + + for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) + { + unsigned int x = x0; + const uint32_t * p = (const uint32_t *)p0; + uint16_t * du = (uint16_t *)dst_u; + uint16_t * dv = (uint16_t *)dst_v; + + if (xskip0 != 0) { + const uint32_t p3a = *p++; + const uint32_t p3b = *p++; + + if (xskip0 == 1) + { + *du++ = (p3a >> 20) & 0x3ff; + *dv++ = (p3b >> 0) & 0x3ff; + } + *du++ = (p3b >> 10) & 0x3ff; + *dv++ = (p3b >> 20) & 0x3ff; + + if (((x += 8) & mask) == 0) + p += slice_inc; + } + + while (x != x1) { + const uint32_t p3a = *p++; + const uint32_t p3b = *p++; + + *du++ = p3a & 0x3ff; + *dv++ = (p3a >> 10) & 0x3ff; + *du++ = (p3a >> 20) & 0x3ff; + *dv++ = p3b & 0x3ff; + *du++ = (p3b >> 10) & 0x3ff; + *dv++ = (p3b >> 20) & 0x3ff; + + if (((x += 8) & mask) == 0) + p += slice_inc; + } + + if (xrem1 != 0) { + const uint32_t p3a = *p++; + const uint32_t p3b = *p++; + + *du++ = p3a & 0x3ff; + *dv++ = (p3a >> 10) & 0x3ff; + if (xrem1 == 2) + { + *du++ = (p3a >> 20) & 0x3ff; + *dv++ = p3b & 0x3ff; + } + } + } +} + + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, + const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, + unsigned int w, unsigned int h, const unsigned int shr) +{ + const unsigned int n = dst_stride1 / 2; + unsigned int j; + + // This is true for our current layouts + av_assert0(dst_stride1 == src_stride1); + + // As we have the same stride1 for src & dest and src is wider than dest + // then if we loop on src we can always write contiguously to dest + // We make no effort to copy an exact width - round up to nearest src stripe + // as we will always have storage in dest for that + +#if ARCH_ARM && HAVE_NEON + if (shr == 3 && src_stride1 == 128) { + for (j = 0; j + n < w; j += dst_stride1) { + uint8_t * d = dst + j * dst_stride2; + const uint8_t * s1 = src + j * 2 * src_stride2; + const uint8_t * s2 = s1 + src_stride1 * src_stride2; + + ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); + } + } + else +#endif + { + for (j = 0; j + n < w; j += dst_stride1) { + uint8_t * d = dst + j * dst_stride2; + const uint8_t * s1 = src + j * 2 * src_stride2; + const uint8_t * s2 = s1 + src_stride1 * src_stride2; + + for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { + cpy16_to_8(d, s1, n, shr); + cpy16_to_8(d + n, s2, n, shr); + } + } + } + + // Fix up a trailing dest half stripe + if (j < w) { + uint8_t * d = dst + j * dst_stride2; + const uint8_t * s1 = src + j * 2 * src_stride2; + + for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { + cpy16_to_8(d, s1, n, shr); + } + } +} + +int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) +{ + const int w = av_frame_cropped_width(src); + const int h = av_frame_cropped_height(src); + const int x = src->crop_left; + const int y = src->crop_top; + + // We will crop as part of the conversion + dst->crop_top = 0; + dst->crop_left = 0; + dst->crop_bottom = 0; + dst->crop_right = 0; + + switch (src->format){ + case AV_PIX_FMT_SAND128: + case AV_PIX_FMT_RPI4_8: + switch (dst->format){ + case AV_PIX_FMT_YUV420P: + av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], + src->data[0], + 
av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y, w, h); + av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; + default: + return -1; + } + break; + case AV_PIX_FMT_SAND64_10: + switch (dst->format){ + case AV_PIX_FMT_YUV420P10: + av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x*2, y, w*2, h); + av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y/2, w, h/2); + break; + default: + return -1; + } + break; + case AV_PIX_FMT_RPI4_10: + switch (dst->format){ + case AV_PIX_FMT_YUV420P10: + av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y, w, h); + av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; + default: + return -1; + } + break; + default: + return -1; + } + + return av_frame_copy_props(dst, src); +} diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h new file mode 100644 index 0000000000..634b55e800 --- /dev/null +++ b/libavutil/rpi_sand_fns.h @@ -0,0 +1,183 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +#ifndef AVUTIL_RPI_SAND_FNS +#define AVUTIL_RPI_SAND_FNS + +#include "libavutil/frame.h" + +// For all these fns _x & _w are measured as coord * PW +// For the C fns coords are in chroma pels (so luma / 2) +// Strides are in bytes + +void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void av_rpi_planar_to_sand_c8(uint8_t * dst_c, + unsigned int stride1, unsigned int stride2, + const uint8_t * src_u, const unsigned int src_stride_u, + const uint8_t * src_v, const unsigned int src_stride_v, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_planar_to_sand_c16(uint8_t * dst_c, + unsigned int stride1, unsigned int stride2, + const uint8_t * src_u, const unsigned int src_stride_u, + const uint8_t * src_v, const unsigned int src_stride_v, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, + const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, + unsigned int w, unsigned int h, const unsigned int shr); + + +// dst must contain required pixel format & allocated data buffers +// Cropping on the src buffer will be honoured and dst crop will be set to zero +int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); + + +static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) +{ +#ifdef RPI_ZC_SAND128_ONLY + // If we are sure we only only support 128 byte sand formats replace the + // var with a constant which should allow for better optimisation + return 128; +#else + return frame->linesize[0]; +#endif +} + +static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) +{ + return frame->linesize[3]; +} + + +static inline int av_rpi_is_sand_format(const int format) +{ + return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); +} + +static inline int av_rpi_is_sand_frame(const AVFrame * const frame) +{ + return av_rpi_is_sand_format(frame->format); +} + +static inline int 
av_rpi_is_sand8_frame(const AVFrame * const frame) +{ + return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); +} + +static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) +{ + return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); +} + +static inline int av_rpi_is_sand30_frame(const AVFrame * const frame) +{ + return (frame->format == AV_PIX_FMT_RPI4_10); +} + +static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) +{ + return av_rpi_is_sand8_frame(frame) ? 0 : 1; +} + +// If x is measured in bytes (not pixels) then this works for sand64_16 as +// well as sand128 - but in the general case we work that out + +static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) +{ + const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); + const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); + const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); + const unsigned int x1 = x & (stride1 - 1); + const unsigned int x2 = x ^ x1; + + return x1 + stride1 * y + stride2 * x2; +} + +static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) +{ + const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); + const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); + const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); + const unsigned int x1 = x & (stride1 - 1); + const unsigned int x2 = x ^ x1; + + return x1 + stride1 * y_c + stride2 * x2; +} + +static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) +{ + return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); +} + +static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) +{ + return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); +} + +#endif + -- 2.43.0 From f46f2fd87f3c6457e2797a5a27944a42c0d6fe70 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 11:36:47 +0100 Subject: [PATCH 002/157] Add aarch64 asm sand conv functions Many thanks to eiler.mike@gmail.com (Michael Eiler) for these optimizations --- libavutil/aarch64/Makefile | 2 + libavutil/aarch64/rpi_sand_neon.S | 676 ++++++++++++++++++++++++++++++ libavutil/aarch64/rpi_sand_neon.h | 55 +++ libavutil/rpi_sand_fn_pw.h | 4 +- libavutil/rpi_sand_fns.c | 3 + 5 files changed, 738 insertions(+), 2 deletions(-) create mode 100644 libavutil/aarch64/rpi_sand_neon.S create mode 100644 libavutil/aarch64/rpi_sand_neon.h diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile index eba0151337..1b44beab39 100644 --- a/libavutil/aarch64/Makefile +++ b/libavutil/aarch64/Makefile @@ -4,3 +4,5 @@ OBJS += aarch64/cpu.o \ NEON-OBJS += aarch64/float_dsp_neon.o \ aarch64/tx_float_neon.o \ + aarch64/rpi_sand_neon.o \ + diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S new file mode 100644 index 0000000000..cdcf71ee67 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S @@ -0,0 +1,676 @@ +/* +Copyright (c) 2021 Michael Eiler + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: Michael Eiler +*/ + +#include "asm.S" + +// void ff_rpi_sand8_lines_to_planar_y8( +// uint8_t * dest, : x0 +// unsigned int dst_stride, : w1 +// const uint8_t * src, : x2 +// unsigned int src_stride1, : w3, always 128 +// unsigned int src_stride2, : w4 +// unsigned int _x, : w5 +// unsigned int y, : w6 +// unsigned int _w, : w7 +// unsigned int h); : [sp, #0] + +function ff_rpi_sand8_lines_to_planar_y8, export=1 + // w15 contains the number of rows we need to process + ldr w15, [sp, #0] + + // w8 will contain the number of blocks per row + // w8 = floor(_w/stride1) + // stride1 is assumed to always be 128 + mov w8, w1 + lsr w8, w8, #7 + + // in case the width of the image is not a multiple of 128, there will + // be an incomplete block at the end of every row + // w9 contains the number of pixels stored within this block + // w9 = _w - w8 * 128 + lsl w9, w8, #7 + sub w9, w7, w9 + + // this is the value we have to add to the src pointer after reading a complete block + // it will move the address to the start of the next block + // w10 = stride2 * stride1 - stride1 + mov w10, w4 + lsl w10, w10, #7 + sub w10, w10, #128 + + // w11 is the row offset, meaning the start offset of the first block of every collumn + // this will be increased with stride1 within every iteration of the row_loop + eor w11, w11, w11 + + // w12 = 0, processed row count + eor w12, w12, w12 +row_loop: + // start of the first block within the current row + // x13 = row offset + src + mov x13, x2 + add x13, x13, x11 + + // w14 = 0, processed block count + eor w14, w14, w14 + + cmp w8, #0 + beq no_main_y8 + +block_loop: + // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 + // fortunately these aren't callee saved ones, meaning we don't need to backup them + ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 + ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 + + // write these registers back to the destination vector and increase the dst address by 128 + st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 + st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64 + + // move the source register to the beginning of the next block (x13 = src + block offset) + add x13, x13, x10 + // increase the block counter + add w14, w14, #1 + + // continue with the block_loop if we haven't copied all full blocks yet + cmp w8, w14 + bgt 
block_loop + + // handle the last block at the end of each row + // at most 127 byte values copied from src to dst +no_main_y8: + eor w5, w5, w5 // i = 0 +incomplete_block_loop_y8: + cmp w5, w9 + bge incomplete_block_loop_end_y8 + + ldrb w6, [x13] + strb w6, [x0] + add x13, x13, #1 + add x0, x0, #1 + + add w5, w5, #1 + b incomplete_block_loop_y8 +incomplete_block_loop_end_y8: + + + // increase the row offset by 128 (stride1) + add w11, w11, #128 + // increment the row counter + add w12, w12, #1 + + // process the next row if we haven't finished yet + cmp w15, w12 + bgt row_loop + + ret +endfunc + + + +// void ff_rpi_sand8_lines_to_planar_c8( +// uint8_t * dst_u, : x0 +// unsigned int dst_stride_u, : w1 == width +// uint8_t * dst_v, : x2 +// unsigned int dst_stride_v, : w3 == width +// const uint8_t * src, : x4 +// unsigned int stride1, : w5 == 128 +// unsigned int stride2, : w6 +// unsigned int _x, : w7 +// unsigned int y, : [sp, #0] +// unsigned int _w, : [sp, #8] +// unsigned int h); : [sp, #16] + +function ff_rpi_sand8_lines_to_planar_c8, export=1 + // w7 = width + ldr w7, [sp, #8] + + // w15 contains the number of rows we need to process + // counts down + ldr w15, [sp, #16] + + // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6 + mov w8, w7 + lsr w8, w8, #6 + + // number of pixels in block at the end of every row + // w9 = _w - (w8 * 64) + lsl w9, w8, #6 + sub w9, w7, w9 + + // Skip at the end of the line to account for stride + sub w12, w1, w7 + + // address delta to the beginning of the next block + // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128 + lsl w10, w6, #7 + sub w10, w10, #128 + + // w11 = row address start offset = 0 + eor w11, w11, w11 + +row_loop_c8: + // start of the first block within the current row + // x13 = row offset + src + mov x13, x4 + add x13, x13, x11 + + // w14 = 0, processed block count + eor w14, w14, w14 + + cmp w8, #0 + beq no_main_c8 + +block_loop_c8: + // load the full block -> 128 bytes, the block contains 64 interleaved U and V values + ld2 { v0.16b, v1.16b }, [x13], #32 + ld2 { v2.16b, v3.16b }, [x13], #32 + ld2 { v4.16b, v5.16b }, [x13], #32 + ld2 { v6.16b, v7.16b }, [x13], #32 + + // swap register so that we can write them out with a single instruction + mov v16.16b, v1.16b + mov v17.16b, v3.16b + mov v18.16b, v5.16b + mov v1.16b, v2.16b + mov v2.16b, v4.16b + mov v3.16b, v6.16b + mov v4.16b, v16.16b + mov v5.16b, v17.16b + mov v6.16b, v18.16b + + st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 + st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64 + + // increment row counter and move src to the beginning of the next block + add w14, w14, #1 + add x13, x13, x10 + + // jump to block_loop_c8 iff the block count is smaller than the number of full blocks + cmp w8, w14 + bgt block_loop_c8 + +no_main_c8: + // handle incomplete block at the end of every row + eor w5, w5, w5 // point counter, this might be +incomplete_block_loop_c8: + cmp w5, w9 + bge incomplete_block_loop_end_c8 + + ldrb w1, [x13] + strb w1, [x0] + add x13, x13, #1 + + ldrb w1, [x13] + strb w1, [x2] + add x13, x13, #1 + + add x0, x0, #1 + add x2, x2, #1 + + add w5, w5, #1 + b incomplete_block_loop_c8 +incomplete_block_loop_end_c8: + + // increase row_offset by stride1 + add w11, w11, #128 + add x0, x0, w12, sxtw + add x2, x2, w12, sxtw + + // jump to row_Loop_c8 iff the row count is small than the height + subs w15, w15, #1 + bgt row_loop_c8 + + ret +endfunc + +//void ff_rpi_sand30_lines_to_planar_y16( +// uint8_t * dest, // [x0] +// unsigned int 
dst_stride, // [w1] -> assumed to be equal to _w +// const uint8_t * src, // [x2] +// unsigned int src_stride1, // [w3] -> 128 +// unsigned int src_stride2, // [w4] +// unsigned int _x, // [w5] +// unsigned int y, // [w6] +// unsigned int _w, // [w7] +// unsigned int h); // [sp, #0] + +function ff_rpi_sand30_lines_to_planar_y16, export=1 + stp x19, x20, [sp, #-48]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + // w6 = argument h + ldr w6, [sp, #48] + + // slice_inc = ((stride2 - 1) * stride1) + mov w5, w4 + sub w5, w5, #1 + lsl w5, w5, #7 + + // total number of bytes per row = (width / 3) * 4 + mov w8, w7 + mov w9, #3 + udiv w8, w8, w9 + lsl w8, w8, #2 + + // number of full 128 byte blocks to be processed + mov w9, #96 + udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 + + // w10 = number of full integers to process (4 bytes) + // w11 = remaning zero to two 10bit values still to copy over + mov w12, #96 + mul w12, w9, w12 + sub w12, w7, w12 // width - blocks*96 = remaining points per row + mov w11, #3 + udiv w10, w12, w11 // full integers to process = w12 / 3 + mul w11, w10, w11 // #integers *3 + sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 + + // increase w9 by one if w10+w11 is not zero, and decrease the row count by one + // this is to efficiently copy incomplete blocks at the end of the rows + // the last row is handled explicitly to avoid writing out of bounds + add w22, w10, w11 + cmp w22, #0 + cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise + add w9, w9, w22 + sub w6, w6, #1 + + // store the number of bytes in w20 which we copy too much for every row + // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) + mov w20, #96*2 + mul w20, w20, w9 + sub w20, w1, w20 + + mov w23, #0 // flag to check whether the last line had already been processed + + // bitmask to clear the uppper 6bits of the result values + mov x19, #0x03ff03ff03ff03ff + dup v22.2d, x19 + + // row counter = 0 + eor w12, w12, w12 +row_loop_y16: + cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows + bge row_loop_y16_fin + + mov x13, x2 // row src + eor w14, w14, w14 // full block counter +block_loop_y16: + cmp w14, w9 + bge block_loop_y16_fin + + // load 64 bytes + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 + + // process v0 and v1 + xtn v16.4h, v0.4s + ushr v0.4s, v0.4s, #10 + xtn v17.4h, v0.4s + ushr v0.4s, v0.4s, #10 + xtn v18.4h, v0.4s + + xtn2 v16.8h, v1.4s + and v16.16b, v16.16b, v22.16b + ushr v1.4s, v1.4s, #10 + xtn2 v17.8h, v1.4s + and v17.16b, v17.16b, v22.16b + ushr v1.4s, v1.4s, #10 + xtn2 v18.8h, v1.4s + and v18.16b, v18.16b, v22.16b + + st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 + + // process v2 and v3 + xtn v23.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v24.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v25.4h, v2.4s + + xtn2 v23.8h, v3.4s + and v23.16b, v23.16b, v22.16b + ushr v3.4s, v3.4s, #10 + xtn2 v24.8h, v3.4s + and v24.16b, v24.16b, v22.16b + ushr v3.4s, v3.4s, #10 + xtn2 v25.8h, v3.4s + and v25.16b, v25.16b, v22.16b + + st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 + + // load the second half of the block -> 64 bytes into registers v4-v7 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 + + // process v4 and v5 + xtn v16.4h, v4.4s + ushr v4.4s, v4.4s, #10 + xtn v17.4h, v4.4s + ushr v4.4s, v4.4s, #10 + xtn v18.4h, v4.4s + + xtn2 v16.8h, v5.4s + and v16.16b, v16.16b, v22.16b + ushr v5.4s, v5.4s, #10 + xtn2 v17.8h, v5.4s + and v17.16b, v17.16b, v22.16b + ushr v5.4s, v5.4s, #10 + xtn2 v18.8h, v5.4s + and v18.16b, 
v18.16b, v22.16b + + st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 + + // v6 and v7 + xtn v23.4h, v6.4s + ushr v6.4s, v6.4s, #10 + xtn v24.4h, v6.4s + ushr v6.4s, v6.4s, #10 + xtn v25.4h, v6.4s + + xtn2 v23.8h, v7.4s + and v23.16b, v23.16b, v22.16b + ushr v7.4s, v7.4s, #10 + xtn2 v24.8h, v7.4s + and v24.16b, v24.16b, v22.16b + ushr v7.4s, v7.4s, #10 + xtn2 v25.8h, v7.4s + and v25.16b, v25.16b, v22.16b + + st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 + + add x13, x13, x5 // row src += slice_inc + add w14, w14, #1 + b block_loop_y16 +block_loop_y16_fin: + + + + + add x2, x2, #128 // src += stride1 (start of the next row) + add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst + add w12, w12, #1 + b row_loop_y16 +row_loop_y16_fin: + + // check whether we have incomplete blocks at the end of every row + // in that case decrease row block count by one + // change height back to it's original value (meaning increase it by 1) + // and jump back to another iteration of row_loop_y16 + + cmp w23, #1 + beq row_loop_y16_fin2 // don't continue here if we already processed the last row + add w6, w6, #1 // increase height to the original value + sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count + mov w23, #1 + b row_loop_y16 +row_loop_y16_fin2: + + sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference + + // now we've got to handle the last block in the last row + eor w12, w12, w12 // w12 = 0 = counter +integer_loop_y16: + cmp w12, w10 + bge integer_loop_y16_fin + ldr w14, [x13], #4 + and w15, w14, #0x3ff + strh w15, [x0], #2 + lsr w14, w14, #10 + and w15, w14, #0x3ff + strh w15, [x0], #2 + lsr w14, w14, #10 + and w15, w14, #0x3ff + strh w15, [x0], #2 + add w12, w12, #1 + b integer_loop_y16 +integer_loop_y16_fin: + +final_values_y16: + // remaining point count = w11 + ldr w14, [x13], #4 + cmp w11, #0 + beq final_values_y16_fin + and w15, w14, #0x3ff + strh w15, [x0], #2 + cmp w11, #1 + beq final_values_y16_fin + lsr w14, w14, #10 + and w15, w14, #0x3ff + strh w15, [x0], #2 +final_values_y16_fin: + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + ret +endfunc + +//void ff_rpi_sand30_lines_to_planar_c16( +// uint8_t * dst_u, // [x0] +// unsigned int dst_stride_u, // [w1] == _w*2 +// uint8_t * dst_v, // [x2] +// unsigned int dst_stride_v, // [w3] == _w*2 +// const uint8_t * src, // [x4] +// unsigned int stride1, // [w5] == 128 +// unsigned int stride2, // [w6] +// unsigned int _x, // [w7] == 0 +// unsigned int y, // [sp, #0] == 0 +// unsigned int _w, // [sp, #8] -> w3 +// unsigned int h); // [sp, #16] -> w7 + +.macro rpi_sand30_lines_to_planar_c16_block_half + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 + + xtn v4.4h, v0.4s + ushr v0.4s, v0.4s, #10 + xtn v5.4h, v0.4s + ushr v0.4s, v0.4s, #10 + xtn v6.4h, v0.4s + xtn2 v4.8h, v1.4s + ushr v1.4s, v1.4s, #10 + xtn2 v5.8h, v1.4s + ushr v1.4s, v1.4s, #10 + xtn2 v6.8h, v1.4s + and v4.16b, v4.16b, v16.16b + and v5.16b, v5.16b, v16.16b + and v6.16b, v6.16b, v16.16b + st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 + + xtn v4.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v5.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v6.4h, v2.4s + xtn2 v4.8h, v3.4s + ushr v3.4s, v3.4s, #10 + xtn2 v5.8h, v3.4s + ushr v3.4s, v3.4s, #10 + xtn2 v6.8h, v3.4s + and v4.16b, v4.16b, v16.16b + and v5.16b, v5.16b, v16.16b + and v6.16b, v6.16b, v16.16b + st3 { v4.8h, v5.8h, v6.8h }, [sp] + sub sp, sp, #48 +.endm + +function ff_rpi_sand30_lines_to_planar_c16, export=1 + 
stp x19, x20, [sp, #-48]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + ldr w3, [sp, #48+8] // w3 = width + ldr w7, [sp, #48+16] // w7 = height + + // reserve space on the stack for intermediate results + sub sp, sp, #256 + + // number of 128byte blocks per row, w8 = width / 48 + mov w9, #48 + udiv w8, w3, w9 + + // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 + mul w9, w8, w9 + sub w9, w3, w9 + + // row offset, the beginning of the next row to process + eor w10, w10, w10 + + // offset to the beginning of the next block, w11 = stride2 * 128 - 128 + lsl w11, w6, #7 + sub w11, w11, #128 + + // decrease the height by one and in case of remaining pixels increase the block count by one + sub w7, w7, #1 + cmp w9, #0 + cset w19, ne // w19 == 1 iff reamining pixels != 0 + add w8, w8, w19 + + // bytes we have to move dst back by at the end of every row + mov w21, #48*2 + mul w21, w21, w8 + sub w21, w1, w21 + + mov w20, #0 // w20 = flag, last row processed + + mov x12, #0x03ff03ff03ff03ff + dup v16.2d, x12 + + // iterate through rows, row counter = w12 = 0 + eor w12, w12, w12 +row_loop_c16: + cmp w12, w7 + bge row_loop_c16_fin + + // address of row data = src + row_offset + mov x13, x4 + add x13, x13, x10 + + eor w14, w14, w14 +block_loop_c16: + cmp w14, w8 + bge block_loop_c16_fin + + rpi_sand30_lines_to_planar_c16_block_half + + ld2 { v0.8h, v1.8h }, [sp], #32 + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp] + sub sp, sp, #64 + + st1 { v0.8h }, [x0], #16 + st1 { v2.8h }, [x0], #16 + st1 { v4.8h }, [x0], #16 + st1 { v1.8h }, [x2], #16 + st1 { v3.8h }, [x2], #16 + st1 { v5.8h }, [x2], #16 + + rpi_sand30_lines_to_planar_c16_block_half + + ld2 { v0.8h, v1.8h }, [sp], #32 + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp] + sub sp, sp, #64 + + st1 { v0.8h }, [x0], #16 + st1 { v2.8h }, [x0], #16 + st1 { v4.8h }, [x0], #16 + st1 { v1.8h }, [x2], #16 + st1 { v3.8h }, [x2], #16 + st1 { v5.8h }, [x2], #16 + + add x13, x13, x11 // offset to next block + add w14, w14, #1 + b block_loop_c16 +block_loop_c16_fin: + + add w10, w10, #128 + add w12, w12, #1 + add x0, x0, w21, sxtw // move dst pointers back by x21 + add x2, x2, w21, sxtw + b row_loop_c16 +row_loop_c16_fin: + + cmp w20, #1 + beq row_loop_c16_fin2 + mov w20, #1 + sub w8, w8, w19 // decrease block count by w19 + add w7, w7, #1 // increase height + b row_loop_c16 + +row_loop_c16_fin2: + sub x0, x0, w21, sxtw // readd x21 in case of the last row + sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels + + // last incomplete block to be finished + // read operations are fine, stride2 is more than large enough even if rem_pix is 0 + rpi_sand30_lines_to_planar_c16_block_half + ld2 { v0.8h, v1.8h }, [sp], #32 + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp], #32 + rpi_sand30_lines_to_planar_c16_block_half + ld2 { v0.8h, v1.8h }, [sp], #32 + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp] + sub sp, sp, #160 + + mov x4, sp + eor w20, w20, w20 +rem_pix_c16_loop: + cmp w20, w9 + bge rem_pix_c16_fin + + ldr w22, [x4], #4 + str w22, [x0], #2 + lsr w22, w22, #16 + str w22, [x2], #2 + + add w20, w20, #1 + b rem_pix_c16_loop +rem_pix_c16_fin: + + add sp, sp, #256 + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + ret +endfunc + + + +//void ff_rpi_sand30_lines_to_planar_p010( +// uint8_t * dest, +// unsigned int dst_stride, +// const uint8_t * src, +// unsigned int src_stride1, +// unsigned int src_stride2, +// unsigned int _x, +// unsigned int 
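A note on the 30-bit packing used by the sand30 routines above and by the p010 variant whose prototype follows: each little-endian 32-bit word carries three 10-bit samples in its low 30 bits, giving 96 luma samples (or 48 U/V pairs) per 128-byte block. That is exactly what the repeated ushr #10 / xtn steps and the 0x3ff mask implement sixteen samples at a time, and what the scalar ldr/and/lsr tail loops do one word at a time. The helper below is an editorial restatement with an invented name, not code from the patch.

#include <stdint.h>

/* Reference only: unpack one SAND30 word into three 10-bit samples. */
static inline void sand30_unpack_word(uint32_t v, uint16_t out[3])
{
    out[0] = (uint16_t)(v & 0x3ff);          /* bits  0..9  */
    out[1] = (uint16_t)((v >> 10) & 0x3ff);  /* bits 10..19 */
    out[2] = (uint16_t)((v >> 20) & 0x3ff);  /* bits 20..29; top 2 bits unused */
}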
y, +// unsigned int _w, +// unsigned int h); + diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h new file mode 100644 index 0000000000..b3aa481ea4 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.h @@ -0,0 +1,55 @@ +/* +Copyright (c) 2021 Michael Eiler + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: Michael Eiler +*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, + const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, + unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + +void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u, + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, + unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride, + const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, + unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + +void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, + unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + +#ifdef __cplusplus +} +#endif + diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h index 0324f6826d..0d5d203dc3 100644 --- a/libavutil/rpi_sand_fn_pw.h +++ b/libavutil/rpi_sand_fn_pw.h @@ -54,7 +54,7 @@ void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, const unsigned int w = _w; const unsigned int mask = stride1 - 1; -#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) +#if PW == 1 && HAVE_SAND_ASM if (_x == 0) { ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); @@ -106,7 +106,7 @@ void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_strid const unsigned int w = _w * 2; const unsigned int mask = stride1 - 1; -#if PW == 1 
&& (HAVE_SAND_ASM || HAVE_SAND_ASM64) +#if PW == 1 && HAVE_SAND_ASM if (_x == 0) { ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, src, stride1, stride2, _x, y, _w, h); diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c index ed0261b02f..1f543e9357 100644 --- a/libavutil/rpi_sand_fns.c +++ b/libavutil/rpi_sand_fns.c @@ -37,6 +37,9 @@ Authors: John Cox #if ARCH_ARM && HAVE_NEON #include "arm/rpi_sand_neon.h" #define HAVE_SAND_ASM 1 +#elif ARCH_AARCH64 && HAVE_NEON +#include "aarch64/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 #else #define HAVE_SAND_ASM 0 #endif -- 2.43.0 From 2a4b83949da8fbbdb02204d5808d5121f91389cd Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 11:56:02 +0100 Subject: [PATCH 003/157] Add raw encoding for sand --- libavcodec/raw.c | 6 +++ libavcodec/rawenc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/libavcodec/raw.c b/libavcodec/raw.c index 1e5b48d1e0..1e689f9ee0 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c @@ -295,6 +295,12 @@ static const PixelFormatTag raw_pix_fmt_tags[] = { { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ + /* RPI (Might as well define for everything) */ + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, + { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, + { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, + { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, + { AV_PIX_FMT_NONE, 0 }, }; diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c index 8c577006d9..594a77c42a 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c @@ -24,6 +24,7 @@ * Raw Video Encoder */ +#include "config.h" #include "avcodec.h" #include "codec_internal.h" #include "encode.h" @@ -33,6 +34,10 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" #include "libavutil/internal.h" +#include "libavutil/avassert.h" +#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif static av_cold int raw_encode_init(AVCodecContext *avctx) { @@ -46,12 +51,95 @@ static av_cold int raw_encode_init(AVCodecContext *avctx) return 0; } +#if CONFIG_SAND +static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const int width = av_frame_cropped_width(frame); + const int height = av_frame_cropped_height(frame); + const int x0 = frame->crop_left; + const int y0 = frame->crop_top; + const int size = width * height * 3 / 2; + uint8_t * dst; + int ret; + + if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) + return ret; + + dst = pkt->data; + + av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); + dst += width * height; + av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, + frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} + +static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const int width = av_frame_cropped_width(frame); + const int height = av_frame_cropped_height(frame); + const int x0 = frame->crop_left; + const int y0 = frame->crop_top; + const int size = width * height * 3; + uint8_t * dst; + int ret; + + if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) + return ret; + + dst = pkt->data; + + av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], 
frame->linesize[3], x0 * 2, y0, width * 2, height); + dst += width * height * 2; + av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, + frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); + return 0; +} + +static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const int width = av_frame_cropped_width(frame); + const int height = av_frame_cropped_height(frame); + const int x0 = frame->crop_left; + const int y0 = frame->crop_top; + const int size = width * height * 3; + uint8_t * dst; + int ret; + + if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) + return ret; + + dst = pkt->data; + + av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); + dst += width * height * 2; + av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, + frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); + return 0; +} +#endif + + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet) { - int ret = av_image_get_buffer_size(frame->format, - frame->width, frame->height, 1); + int ret; +#if CONFIG_SAND + if (av_rpi_is_sand_frame(frame)) { + ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : + av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : + av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1; + *got_packet = (ret == 0); + return ret; + } +#endif + + ret = av_image_get_buffer_size(frame->format, + frame->width, frame->height, 1); if (ret < 0) return ret; -- 2.43.0 From af3dc004ac600f1be2d9165df3f26203cb40e138 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 12:02:09 +0100 Subject: [PATCH 004/157] Deal with the lack of trivial sand cropping --- fftools/ffmpeg.c | 4 ++-- fftools/ffmpeg_filter.c | 4 ++-- libavutil/frame.c | 11 +++++++++++ libavutil/frame.h | 10 ++++++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index c819d30ca5..ca5431aeb4 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -1996,8 +1996,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref av_channel_layout_compare(&ifilter->ch_layout, &frame->ch_layout); break; case AVMEDIA_TYPE_VIDEO: - need_reinit |= ifilter->width != frame->width || - ifilter->height != frame->height; + need_reinit |= ifilter->width != av_frame_cropped_width(frame) || + ifilter->height != av_frame_cropped_height(frame); break; } diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c index 686a33c2ba..cfe3351c52 100644 --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -1283,8 +1283,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) ifilter->format = frame->format; - ifilter->width = frame->width; - ifilter->height = frame->height; + ifilter->width = av_frame_cropped_width(frame); + ifilter->height = av_frame_cropped_height(frame); ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; ifilter->sample_rate = frame->sample_rate; diff --git a/libavutil/frame.c b/libavutil/frame.c index 9545477acc..48621e4098 100644 --- a/libavutil/frame.c +++ b/libavutil/frame.c @@ -16,6 +16,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config.h" + #include "channel_layout.h" #include "avassert.h" #include 
"buffer.h" @@ -27,6 +29,9 @@ #include "mem.h" #include "samplefmt.h" #include "hwcontext.h" +#if CONFIG_SAND +#include "rpi_sand_fns.h" +#endif #if FF_API_OLD_CHANNEL_LAYOUT #define CHECK_CHANNELS_CONSISTENCY(frame) \ @@ -874,6 +879,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) (frame->crop_top + frame->crop_bottom) >= frame->height) return AVERROR(ERANGE); +#if CONFIG_SAND + // Sand cannot be cropped - do not try + if (av_rpi_is_sand_format(frame->format)) + return 0; +#endif + desc = av_pix_fmt_desc_get(frame->format); if (!desc) return AVERROR_BUG; diff --git a/libavutil/frame.h b/libavutil/frame.h index 2580269549..3a9d323325 100644 --- a/libavutil/frame.h +++ b/libavutil/frame.h @@ -957,6 +957,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); */ const char *av_frame_side_data_name(enum AVFrameSideDataType type); + +static inline int av_frame_cropped_width(const AVFrame * const frame) +{ + return frame->width - (frame->crop_left + frame->crop_right); +} +static inline int av_frame_cropped_height(const AVFrame * const frame) +{ + return frame->height - (frame->crop_top + frame->crop_bottom); +} + /** * @} */ -- 2.43.0 From 3c70a261a9205e0bc29dc2b436c621d6ca568761 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 12:31:16 +0100 Subject: [PATCH 005/157] Add an unsand filter --- configure | 1 + libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/buffersrc.c | 2 +- libavfilter/vf_unsand.c | 228 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 libavfilter/vf_unsand.c diff --git a/configure b/configure index 5a5ada2071..986f51b75b 100755 --- a/configure +++ b/configure @@ -3754,6 +3754,7 @@ tonemap_opencl_filter_deps="opencl const_nan" transpose_opencl_filter_deps="opencl" transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" transpose_vulkan_filter_deps="vulkan spirv_compiler" +unsand_filter_select="sand" unsharp_opencl_filter_deps="opencl" uspp_filter_deps="gpl avcodec" vaguedenoiser_filter_deps="gpl" diff --git a/libavfilter/Makefile b/libavfilter/Makefile index b3d3d981dd..c14fc995a0 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -518,6 +518,7 @@ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += vf_transpose_vulkan.o vulkan.o vulkan_filter.o OBJS-$(CONFIG_TRIM_FILTER) += trim.o OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o +OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ opencl/unsharp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index d7db46c2af..b990a00152 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -490,6 +490,7 @@ extern const AVFilter ff_vf_trim; extern const AVFilter ff_vf_unpremultiply; extern const AVFilter ff_vf_unsharp; extern const AVFilter ff_vf_unsharp_opencl; +extern const AVFilter ff_vf_unsand; extern const AVFilter ff_vf_untile; extern const AVFilter ff_vf_uspp; extern const AVFilter ff_vf_v360; diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c index ba17450b93..0dbe5d2335 100644 --- a/libavfilter/buffersrc.c +++ b/libavfilter/buffersrc.c @@ -201,7 +201,7 @@ FF_ENABLE_DEPRECATION_WARNINGS switch (ctx->outputs[0]->type) { case AVMEDIA_TYPE_VIDEO: - CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, + CHECK_VIDEO_PARAM_CHANGE(ctx, s, 
av_frame_cropped_width(frame), av_frame_cropped_height(frame), frame->format, frame->pts); break; case AVMEDIA_TYPE_AUDIO: diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c new file mode 100644 index 0000000000..7100f2fc9b --- /dev/null +++ b/libavfilter/vf_unsand.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2007 Bobby Bingham + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * format and noformat video filters + */ + +#include + +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "libavutil/pixdesc.h" +#include "libavutil/opt.h" +#include "libavutil/rpi_sand_fns.h" + +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +typedef struct UnsandContext { + const AVClass *class; +} UnsandContext; + +static av_cold void uninit(AVFilterContext *ctx) +{ +// UnsandContext *s = ctx->priv; +} + +static av_cold int init(AVFilterContext *ctx) +{ +// UnsandContext *s = ctx->priv; + + return 0; +} + + +static int filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterLink * const outlink = link->dst->outputs[0]; + AVFrame *out = NULL; + int rv = 0; + + if (outlink->format == in->format) { + // If nothing to do then do nothing + out = in; + } + else + { + if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) + { + rv = AVERROR(ENOMEM); + goto fail; + } + if (av_rpi_sand_to_planar_frame(out, in) != 0) + { + rv = -1; + goto fail; + } + + av_frame_free(&in); + } + + return ff_filter_frame(outlink, out); + +fail: + av_frame_free(&out); + av_frame_free(&in); + return rv; +} + +#if 0 +static void dump_fmts(const AVFilterFormats * fmts) +{ + int i; + if (fmts== NULL) { + printf("NULL\n"); + return; + } + for (i = 0; i < fmts->nb_formats; ++i) { + printf(" %d", fmts->formats[i]); + } + printf("\n"); +} +#endif + +static int query_formats(AVFilterContext *ctx) +{ +// UnsandContext *s = ctx->priv; + int ret; + + // If we aren't connected at both ends then just do nothing + if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) + return 0; + + // Our output formats depend on our input formats and we can't/don't + // want to convert between bit depths so we need to wait for the source + // to have an opinion before we do + if (ctx->inputs[0]->incfg.formats == NULL) + return AVERROR(EAGAIN); + + // Accept anything + if (ctx->inputs[0]->outcfg.formats == NULL && + (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0) + return ret; + + // Filter out sand formats + + // Generate a container if we don't already have one + if (ctx->outputs[0]->incfg.formats == NULL) + { + // Somewhat rubbish way of ensuring we have a good structure + const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, 
AV_PIX_FMT_NONE}; + AVFilterFormats *formats = ff_make_format_list(out_fmts); + + if (formats == NULL) + return AVERROR(ENOMEM); + if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) + return ret; + } + + // Replace old format list with new filtered list derived from what our + // input says it can do + { + const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats; + AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats; + enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); + int i; + int n = 0; + int seen_420p = 0; + int seen_420p10 = 0; + + for (i = 0; i < src_ff->nb_formats; ++i) { + const enum AVPixelFormat f = src_ff->formats[i]; + + switch (f){ + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_SAND128: + case AV_PIX_FMT_RPI4_8: + if (!seen_420p) { + seen_420p = 1; + dst_fmts[n++] = AV_PIX_FMT_YUV420P; + } + break; + case AV_PIX_FMT_SAND64_10: + case AV_PIX_FMT_YUV420P10: + case AV_PIX_FMT_RPI4_10: + if (!seen_420p10) { + seen_420p10 = 1; + dst_fmts[n++] = AV_PIX_FMT_YUV420P10; + } + break; + default: + dst_fmts[n++] = f; + break; + } + } + + av_freep(&dst_ff->formats); + dst_ff->formats = dst_fmts; + dst_ff->nb_formats = n; + } + +// printf("Unsand: %s calc: ", __func__); +// dump_fmts(ctx->outputs[0]->incfg.formats); + + return 0; +} + + +#define OFFSET(x) offsetof(UnsandContext, x) +static const AVOption unsand_options[] = { + { NULL } +}; + + +AVFILTER_DEFINE_CLASS(unsand); + +static const AVFilterPad avfilter_vf_unsand_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + }, + { NULL } +}; + +static const AVFilterPad avfilter_vf_unsand_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO + }, +}; + +AVFilter ff_vf_unsand = { + .name = "unsand", + .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), + + .init = init, + .uninit = uninit, + + FILTER_QUERY_FUNC(query_formats), + + .priv_size = sizeof(UnsandContext), + .priv_class = &unsand_class, + + FILTER_INPUTS(avfilter_vf_unsand_inputs), + FILTER_OUTPUTS(avfilter_vf_unsand_outputs), +}; + -- 2.43.0 From 8c6e8933be8abf4615b8087456ac9df6ef2460ff Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 12:37:07 +0100 Subject: [PATCH 006/157] Reduce mmal compile warnings --- libavcodec/mmaldec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 3092f58510..6f41b41ac4 100644 --- a/libavcodec/mmaldec.c +++ b/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ * MMAL Video Decoder */ +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" #include #include #include @@ -31,6 +34,7 @@ #include #include #include +#pragma GCC diagnostic pop #include #include "avcodec.h" -- 2.43.0 From b7b2aa1f99dbf314c41f2637934534950cde095e Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 17:56:16 +0100 Subject: [PATCH 007/157] Add chroma location to hevc parse --- libavcodec/hevc_parser.c | 13 +++++++++++++ libavcodec/hevcdec.c | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c index 59f9a0ff3e..4ae7222e8b 100644 --- a/libavcodec/hevc_parser.c +++ b/libavcodec/hevc_parser.c @@ -97,6 +97,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal, avctx->profile = ps->sps->ptl.general_ptl.profile_idc; avctx->level = ps->sps->ptl.general_ptl.level_idc; + if (ps->sps->chroma_format_idc 
== 1) { + avctx->chroma_sample_location = ps->sps->vui.common.chroma_loc_info_present_flag ? + ps->sps->vui.common.chroma_sample_loc_type_top_field + 1 : + AVCHROMA_LOC_LEFT; + } + else if (ps->sps->chroma_format_idc == 2 || + ps->sps->chroma_format_idc == 3) { + avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; + } + else { + avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; + } + if (ps->vps->vps_timing_info_present_flag) { num = ps->vps->vps_num_units_in_tick; den = ps->vps->vps_time_scale; diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c index 0e2844f47c..88482fd521 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -347,6 +347,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) else avctx->color_range = AVCOL_RANGE_MPEG; + if (sps->chroma_format_idc == 1) { + avctx->chroma_sample_location = sps->vui.common.chroma_loc_info_present_flag ? + sps->vui.common.chroma_sample_loc_type_top_field + 1 : + AVCHROMA_LOC_LEFT; + } + else if (sps->chroma_format_idc == 2 || + sps->chroma_format_idc == 3) { + avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; + } + else { + avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; + } + if (sps->vui.common.colour_description_present_flag) { avctx->color_primaries = sps->vui.common.colour_primaries; avctx->color_trc = sps->vui.common.transfer_characteristics; -- 2.43.0 From b7e941515558f7af1f87d931e9f8a790ac9b0783 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 18:20:50 +0100 Subject: [PATCH 008/157] hwaccel: Add .abort_frame & use in hevcdec --- libavcodec/avcodec.h | 11 +++++++++++ libavcodec/hevcdec.c | 7 ++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 39881a1d2b..32bc78e2be 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -2221,6 +2221,17 @@ typedef struct AVHWAccel { * that avctx->hwaccel_priv_data is invalid. 
*/ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); + + /** + * Called if parsing fails + * + * An error has occured, end_frame will not be called + * start_frame & decode_slice may or may not have been called + * Optional + * + * @param avctx the codec context + */ + void (*abort_frame)(AVCodecContext *avctx); } AVHWAccel; /** diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c index 88482fd521..4ee564f3e0 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -3378,8 +3378,13 @@ static int hevc_decode_frame(AVCodecContext *avctx, AVFrame *rframe, s->ref = NULL; ret = decode_nal_units(s, avpkt->data, avpkt->size); - if (ret < 0) + if (ret < 0) { + // Ensure that hwaccel knows this frame is over + if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) + s->avctx->hwaccel->abort_frame(s->avctx); + return ret; + } if (avctx->hwaccel) { if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { -- 2.43.0 From 3758444702032f3b87e3ccddb92e74d14e3d84fe Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 18:26:17 +0100 Subject: [PATCH 009/157] hwaccel: Add CAP_MT_SAFE for accels that can use multi-thread --- libavcodec/hwconfig.h | 1 + libavcodec/pthread_frame.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h index 721424912c..c43ad55245 100644 --- a/libavcodec/hwconfig.h +++ b/libavcodec/hwconfig.h @@ -24,6 +24,7 @@ #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) +#define HWACCEL_CAP_MT_SAFE (1 << 1) typedef struct AVCodecHWConfigInternal { diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c index d9d5afaa82..2cc89a41f5 100644 --- a/libavcodec/pthread_frame.c +++ b/libavcodec/pthread_frame.c @@ -204,7 +204,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg) /* if the previous thread uses hwaccel then we take the lock to ensure * the threads don't run concurrently */ - if (avctx->hwaccel) { + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { pthread_mutex_lock(&p->parent->hwaccel_mutex); p->hwaccel_serializing = 1; } @@ -590,7 +591,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - if (avctx->hwaccel && !p->hwaccel_serializing) { + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && + !p->hwaccel_serializing) { pthread_mutex_lock(&p->parent->hwaccel_mutex); p->hwaccel_serializing = 1; } -- 2.43.0 From 6d49d364ed434446ab135f2f2b9e06b6b9d0b3ad Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 17:59:08 +0100 Subject: [PATCH 010/157] Weak link utils --- libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++ libavcodec/weak_link.h | 23 ++++++++++ 2 files changed, 125 insertions(+) create mode 100644 libavcodec/weak_link.c create mode 100644 libavcodec/weak_link.h diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c new file mode 100644 index 0000000000..f234a985b9 --- /dev/null +++ b/libavcodec/weak_link.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include "weak_link.h" + +struct ff_weak_link_master { + atomic_int ref_count; /* 0 is single ref for easier atomics */ + pthread_rwlock_t lock; + void * ptr; +}; + +static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c) +{ + return (struct ff_weak_link_master *)c; +} + +struct ff_weak_link_master * ff_weak_link_new(void * p) +{ + struct ff_weak_link_master * w = malloc(sizeof(*w)); + if (!w) + return NULL; + 
w->ptr = p; + if (pthread_rwlock_init(&w->lock, NULL)) { + free(w); + return NULL; + } + return w; +} + +static void weak_link_do_unref(struct ff_weak_link_master * const w) +{ + int n = atomic_fetch_sub(&w->ref_count, 1); + if (n) + return; + + pthread_rwlock_destroy(&w->lock); + free(w); +} + +// Unref & break link +void ff_weak_link_break(struct ff_weak_link_master ** ppLink) +{ + struct ff_weak_link_master * const w = *ppLink; + if (!w) + return; + + *ppLink = NULL; + pthread_rwlock_wrlock(&w->lock); + w->ptr = NULL; + pthread_rwlock_unlock(&w->lock); + + weak_link_do_unref(w); +} + +struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) +{ + if (!w) + return NULL; + atomic_fetch_add(&w->ref_count, 1); + return (struct ff_weak_link_client*)w; +} + +void ff_weak_link_unref(struct ff_weak_link_client ** ppLink) +{ + struct ff_weak_link_master * const w = weak_link_x(*ppLink); + if (!w) + return; + + *ppLink = NULL; + weak_link_do_unref(w); +} + +void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink) +{ + struct ff_weak_link_master * const w = weak_link_x(*ppLink); + + if (!w) + return NULL; + + if (pthread_rwlock_rdlock(&w->lock)) + goto broken; + + if (w->ptr) + return w->ptr; + + pthread_rwlock_unlock(&w->lock); + +broken: + *ppLink = NULL; + weak_link_do_unref(w); + return NULL; +} + +// Ignores a NULL c (so can be on the return path of both broken & live links) +void ff_weak_link_unlock(struct ff_weak_link_client * c) +{ + struct ff_weak_link_master * const w = weak_link_x(c); + if (w) + pthread_rwlock_unlock(&w->lock); +} + + diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h new file mode 100644 index 0000000000..415b6a27a0 --- /dev/null +++ b/libavcodec/weak_link.h @@ -0,0 +1,23 @@ +struct ff_weak_link_master; +struct ff_weak_link_client; + +struct ff_weak_link_master * ff_weak_link_new(void * p); +void ff_weak_link_break(struct ff_weak_link_master ** ppLink); + +struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w); +void ff_weak_link_unref(struct ff_weak_link_client ** ppLink); + +// Returns NULL if link broken - in this case it will also zap +// *ppLink and unref the weak_link. +// Returns NULL if *ppLink is NULL (so a link once broken stays broken) +// +// The above does mean that there is a race if this is called simultainiously +// by two threads using the same weak_link_client (so don't do that) +void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink); +void ff_weak_link_unlock(struct ff_weak_link_client * c); + + + + + + -- 2.43.0 From b3b2312ce05d54f8f85c0799dc2943f1118af610 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 19:23:26 +0100 Subject: [PATCH 011/157] Add v4l2_req V4L2 request H265 drm_prime decode Has the abiliy to switch between kernel API versions at runtime. This could be removed later once teher is no chance of usage on an old kernel. 
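As a rough illustration of that runtime switch (editorial sketch only; the helper name is invented and the real selection logic lives in v4l2_request_hevc.c together with the bundled hevc-ctrls-v1.h/v2.h copies), a decoder can ask the driver whether it recognises a control that only exists in the newer layout and pick the matching structures:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Hypothetical probe: non-zero if the video device knows the control ID. */
static int ctrl_supported(int video_fd, uint32_t cid)
{
    struct v4l2_query_ext_ctrl qc = { .id = cid };
    return ioctl(video_fd, VIDIOC_QUERY_EXT_CTRL, &qc) == 0;
}

/* e.g. use the v2 layout only when the separate decode-params control is
 * understood by the kernel:
 *   use_v2 = ctrl_supported(fd, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS);
 * (that control ID is defined only in the v2 header added below) */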
--- configure | 14 + libavcodec/Makefile | 4 + libavcodec/hevc-ctrls-v1.h | 229 +++++ libavcodec/hevc-ctrls-v2.h | 257 +++++ libavcodec/hevcdec.c | 10 + libavcodec/hwaccels.h | 1 + libavcodec/hwconfig.h | 2 + libavcodec/v4l2_req_decode_q.c | 84 ++ libavcodec/v4l2_req_decode_q.h | 25 + libavcodec/v4l2_req_devscan.c | 449 +++++++++ libavcodec/v4l2_req_devscan.h | 23 + libavcodec/v4l2_req_dmabufs.c | 266 ++++++ libavcodec/v4l2_req_dmabufs.h | 40 + libavcodec/v4l2_req_hevc_v1.c | 3 + libavcodec/v4l2_req_hevc_v2.c | 3 + libavcodec/v4l2_req_hevc_vx.c | 1213 +++++++++++++++++++++++ libavcodec/v4l2_req_media.c | 1596 +++++++++++++++++++++++++++++++ libavcodec/v4l2_req_media.h | 151 +++ libavcodec/v4l2_req_pollqueue.c | 361 +++++++ libavcodec/v4l2_req_pollqueue.h | 18 + libavcodec/v4l2_req_utils.h | 27 + libavcodec/v4l2_request_hevc.c | 297 ++++++ libavcodec/v4l2_request_hevc.h | 102 ++ 23 files changed, 5175 insertions(+) create mode 100644 libavcodec/hevc-ctrls-v1.h create mode 100644 libavcodec/hevc-ctrls-v2.h create mode 100644 libavcodec/v4l2_req_decode_q.c create mode 100644 libavcodec/v4l2_req_decode_q.h create mode 100644 libavcodec/v4l2_req_devscan.c create mode 100644 libavcodec/v4l2_req_devscan.h create mode 100644 libavcodec/v4l2_req_dmabufs.c create mode 100644 libavcodec/v4l2_req_dmabufs.h create mode 100644 libavcodec/v4l2_req_hevc_v1.c create mode 100644 libavcodec/v4l2_req_hevc_v2.c create mode 100644 libavcodec/v4l2_req_hevc_vx.c create mode 100644 libavcodec/v4l2_req_media.c create mode 100644 libavcodec/v4l2_req_media.h create mode 100644 libavcodec/v4l2_req_pollqueue.c create mode 100644 libavcodec/v4l2_req_pollqueue.h create mode 100644 libavcodec/v4l2_req_utils.h create mode 100644 libavcodec/v4l2_request_hevc.c create mode 100644 libavcodec/v4l2_request_hevc.h diff --git a/configure b/configure index 986f51b75b..c091446730 100755 --- a/configure +++ b/configure @@ -281,6 +281,7 @@ External library support: if openssl, gnutls or mbedtls is not used [no] --enable-libtwolame enable MP2 encoding via libtwolame [no] --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] + --enable-libudev enable libudev [no] --enable-libv4l2 enable libv4l2/v4l-utils [no] --enable-libvidstab enable video stabilization using vid.stab [no] --enable-libvmaf enable vmaf filter via libvmaf [no] @@ -351,6 +352,7 @@ External library support: --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] --enable-rkmpp enable Rockchip Media Process Platform code [no] --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] + --enable-v4l2-request enable V4L2 request API code [no] --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] --disable-videotoolbox disable VideoToolbox code [autodetect] @@ -1858,6 +1860,7 @@ EXTERNAL_LIBRARY_LIST=" libtheora libtwolame libuavs3d + libudev libv4l2 libvmaf libvorbis @@ -1914,6 +1917,7 @@ HWACCEL_LIBRARY_LIST=" mmal omx opencl + v4l2_request " DOCUMENT_LIST=" @@ -3002,6 +3006,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" ffnvcodec_deps_any="libdl LoadLibrary" nvdec_deps="ffnvcodec" +v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" vaapi_x11_deps="xlib_x11" videotoolbox_hwaccel_deps="videotoolbox pthreads" videotoolbox_hwaccel_extralibs="-framework QuartzCore" @@ -3045,6 +3050,8 @@ hevc_dxva2_hwaccel_deps="dxva2 
DXVA_PicParams_HEVC" hevc_dxva2_hwaccel_select="hevc_decoder" hevc_nvdec_hwaccel_deps="nvdec" hevc_nvdec_hwaccel_select="hevc_decoder" +hevc_v4l2request_hwaccel_deps="v4l2_request" +hevc_v4l2request_hwaccel_select="hevc_decoder" hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" @@ -6696,6 +6703,7 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode +enabled libudev && require_pkg_config libudev libudev libudev.h udev_new enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init @@ -6798,6 +6806,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r { enabled libdrm || die "ERROR: rkmpp requires --enable-libdrm"; } } +enabled v4l2_request && { enabled libdrm || + die "ERROR: v4l2-request requires --enable-libdrm"; } && + { enabled libudev || + die "ERROR: v4l2-request requires --enable-libudev"; } enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init @@ -6880,6 +6892,8 @@ if enabled v4l2_m2m; then check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" fi +check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns +check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 389253f5d0..2d440b5648 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -170,6 +170,8 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o OBJS-$(CONFIG_VP56DSP) += vp56dsp.o OBJS-$(CONFIG_VP8DSP) += vp8dsp.o OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o +OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ + v4l2_req_devscan.o weak_link.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o @@ -996,6 +998,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ + v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h new file mode 100644 index 0000000000..72cbba0953 --- /dev/null +++ b/libavcodec/hevc-ctrls-v1.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the HEVC state controls for use with stateless HEVC + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. So keep them private until they are stable and ready to + * become part of the official public API. 
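+ *
+ * Note: this v1 copy keeps the control IDs relative to V4L2_CID_MPEG_BASE
+ * and has no separate DECODE_PARAMS control; the v2 copy added later in
+ * this patch moves to V4L2_CID_CODEC_BASE and adds
+ * V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS.  Which layout is used is chosen
+ * at run time, as described in the commit message above.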
+ */ + +#ifndef _HEVC_CTRLS_H_ +#define _HEVC_CTRLS_H_ + +#include + +/* The pixel format isn't stable at the moment and will likely be renamed. */ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + +#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) +#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) +#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) +#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) +#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) + +/* enum v4l2_ctrl_type type values */ +#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 +#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 +#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 +#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 + +enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_mpeg_video_hevc_start_code { + V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, + V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/* The controls are not stable at the moment and will likely be reworked. */ +struct v4l2_ctrl_hevc_sps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) + +struct v4l2_ctrl_hevc_pps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ + __u8 num_extra_slice_header_bits; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + + __u8 padding[4]; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 +#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 rps; + __u8 field_pic; + __u16 pic_order_cnt[2]; + __u8 padding[2]; +}; + +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 padding[6]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __u16 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 num_active_dpb_entries; + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + __u8 num_rps_poc_st_curr_before; + __u8 num_rps_poc_st_curr_after; + __u8 num_rps_poc_lt_curr; + + __u8 padding; + + __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u64 flags; +}; + +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +#endif diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h new file mode 100644 index 0000000000..7cbbbf055f --- /dev/null +++ b/libavcodec/hevc-ctrls-v2.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the HEVC state controls for use with stateless HEVC + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. So keep them private until they are stable and ready to + * become part of the official public API. + */ + +#ifndef _HEVC_CTRLS_H_ +#define _HEVC_CTRLS_H_ + +#include + +/* The pixel format isn't stable at the moment and will likely be renamed. */ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + +#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) +#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) +#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) +#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) +#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) + +/* enum v4l2_ctrl_type type values */ +#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 +#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 +#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 +#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 +#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 + +enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_mpeg_video_hevc_start_code { + V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, + V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/* The controls are not stable at the moment and will likely be reworked. */ +struct v4l2_ctrl_hevc_sps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) +#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) + +struct v4l2_ctrl_hevc_pps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ + __u8 num_extra_slice_header_bits; + __u8 num_ref_idx_l0_default_active_minus1; + __u8 num_ref_idx_l1_default_active_minus1; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + + __u8 padding[4]; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 +#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 rps; + __u8 field_pic; + __u16 pic_order_cnt[2]; + __u8 padding[2]; +}; + +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 padding[6]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __u16 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + __u8 padding[5]; + + __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u64 flags; +}; + +#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 +#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 +#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 + +struct v4l2_ctrl_hevc_decode_params { + __s32 pic_order_cnt_val; + __u8 num_active_dpb_entries; + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 num_poc_st_curr_before; + __u8 num_poc_st_curr_after; + __u8 num_poc_lt_curr; + __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u64 flags; +}; + +/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ +#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) +/* + * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - + * the number of data (in bits) to skip in the + * slice segment header. + * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" + * to before syntax element "slice_temporal_mvp_enabled_flag". + * If IDR, the skipped bits are just "pic_output_flag" + * (separate_colour_plane_flag is not supported). + */ +#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) + +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +#endif diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c index 4ee564f3e0..e892436f94 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -416,6 +416,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ CONFIG_HEVC_NVDEC_HWACCEL + \ + CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ CONFIG_HEVC_VAAPI_HWACCEL + \ CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ CONFIG_HEVC_VDPAU_HWACCEL) @@ -442,6 +443,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #endif #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; +#endif +#if CONFIG_HEVC_V4L2REQUEST_HWACCEL + *fmt++ = AV_PIX_FMT_DRM_PRIME; #endif break; case AV_PIX_FMT_YUV420P10: @@ -463,6 +467,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #endif #if CONFIG_HEVC_NVDEC_HWACCEL *fmt++ = AV_PIX_FMT_CUDA; +#endif +#if CONFIG_HEVC_V4L2REQUEST_HWACCEL + *fmt++ = AV_PIX_FMT_DRM_PRIME; #endif break; case AV_PIX_FMT_YUV444P: @@ -3752,6 +3759,9 @@ const FFCodec ff_hevc_decoder = { #if CONFIG_HEVC_NVDEC_HWACCEL HWACCEL_NVDEC(hevc), #endif +#if CONFIG_HEVC_V4L2REQUEST_HWACCEL + HWACCEL_V4L2REQUEST(hevc), +#endif #if CONFIG_HEVC_VAAPI_HWACCEL HWACCEL_VAAPI(hevc), #endif diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h index aca55831f3..f32d1c4ec4 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; extern const AVHWAccel ff_hevc_dxva2_hwaccel; extern const AVHWAccel ff_hevc_nvdec_hwaccel; +extern const AVHWAccel ff_hevc_v4l2request_hwaccel; extern const AVHWAccel ff_hevc_vaapi_hwaccel; extern const AVHWAccel ff_hevc_vdpau_hwaccel; extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h index c43ad55245..b8aa383071 100644 --- 
a/libavcodec/hwconfig.h +++ b/libavcodec/hwconfig.h @@ -71,6 +71,8 @@ typedef struct AVCodecHWConfigInternal { HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) #define HWACCEL_NVDEC(codec) \ HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel) +#define HWACCEL_V4L2REQUEST(codec) \ + HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) #define HWACCEL_VAAPI(codec) \ HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel) #define HWACCEL_VDPAU(codec) \ diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c new file mode 100644 index 0000000000..5b3fb958fa --- /dev/null +++ b/libavcodec/v4l2_req_decode_q.c @@ -0,0 +1,84 @@ +#include +#include +#include + +#include "v4l2_req_decode_q.h" + +int decode_q_in_q(const req_decode_ent * const d) +{ + return d->in_q; +} + +void decode_q_add(req_decode_q * const q, req_decode_ent * const d) +{ + pthread_mutex_lock(&q->q_lock); + if (!q->head) { + q->head = d; + q->tail = d; + d->prev = NULL; + } + else { + q->tail->next = d; + d->prev = q->tail; + q->tail = d; + } + d->next = NULL; + d->in_q = 1; + pthread_mutex_unlock(&q->q_lock); +} + +// Remove entry from Q - if head wake-up anything that was waiting +void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) +{ + int try_signal = 0; + + if (!d->in_q) + return; + + pthread_mutex_lock(&q->q_lock); + if (d->prev) + d->prev->next = d->next; + else { + try_signal = 1; // Only need to signal if we were head + q->head = d->next; + } + + if (d->next) + d->next->prev = d->prev; + else + q->tail = d->prev; + + // Not strictly needed but makes debug easier + d->next = NULL; + d->prev = NULL; + d->in_q = 0; + pthread_mutex_unlock(&q->q_lock); + + if (try_signal) + pthread_cond_broadcast(&q->q_cond); +} + +void decode_q_wait(req_decode_q * const q, req_decode_ent * const d) +{ + pthread_mutex_lock(&q->q_lock); + + while (q->head != d) + pthread_cond_wait(&q->q_cond, &q->q_lock); + + pthread_mutex_unlock(&q->q_lock); +} + +void decode_q_uninit(req_decode_q * const q) +{ + pthread_mutex_destroy(&q->q_lock); + pthread_cond_destroy(&q->q_cond); +} + +void decode_q_init(req_decode_q * const q) +{ + memset(q, 0, sizeof(*q)); + pthread_mutex_init(&q->q_lock, NULL); + pthread_cond_init(&q->q_cond, NULL); +} + + diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h new file mode 100644 index 0000000000..af7bbe1de4 --- /dev/null +++ b/libavcodec/v4l2_req_decode_q.h @@ -0,0 +1,25 @@ +#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H +#define AVCODEC_V4L2_REQ_DECODE_Q_H + +typedef struct req_decode_ent { + struct req_decode_ent * next; + struct req_decode_ent * prev; + int in_q; +} req_decode_ent; + +typedef struct req_decode_q { + pthread_mutex_t q_lock; + pthread_cond_t q_cond; + req_decode_ent * head; + req_decode_ent * tail; +} req_decode_q; + +int decode_q_in_q(const req_decode_ent * const d); +void decode_q_add(req_decode_q * const q, req_decode_ent * const d); +void decode_q_remove(req_decode_q * const q, req_decode_ent * const d); +void decode_q_wait(req_decode_q * const q, req_decode_ent * const d); +void decode_q_uninit(req_decode_q * const q); +void decode_q_init(req_decode_q * const q); + +#endif + diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c new file mode 100644 index 0000000000..cfa94d55c4 --- /dev/null +++ b/libavcodec/v4l2_req_devscan.c @@ -0,0 +1,449 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + 
+#include +#include + +#include "v4l2_req_devscan.h" +#include "v4l2_req_utils.h" + +struct decdev { + enum v4l2_buf_type src_type; + uint32_t src_fmt_v4l2; + const char * vname; + const char * mname; +}; + +struct devscan { + struct decdev env; + unsigned int dev_size; + unsigned int dev_count; + struct decdev *devs; +}; + +static int video_src_pixfmt_supported(uint32_t fmt) +{ + return 1; +} + +static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, + unsigned int width, unsigned int height, + unsigned int pixelformat) +{ + unsigned int sizeimage; + + memset(format, 0, sizeof(*format)); + format->type = type; + + sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; + + if (V4L2_TYPE_IS_MULTIPLANAR(type)) { + format->fmt.pix_mp.width = width; + format->fmt.pix_mp.height = height; + format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage; + format->fmt.pix_mp.pixelformat = pixelformat; + } else { + format->fmt.pix.width = width; + format->fmt.pix.height = height; + format->fmt.pix.sizeimage = sizeimage; + format->fmt.pix.pixelformat = pixelformat; + } +} + +static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, + unsigned int width, unsigned int height) +{ + struct v4l2_format format; + + v4l2_setup_format(&format, type, width, height, pixelformat); + + return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0; +} + +static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) +{ + struct v4l2_capability capability = { 0 }; + int rc; + + rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability); + if (rc < 0) + return -errno; + + if (capabilities != NULL) { + if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0) + *capabilities = capability.device_caps; + else + *capabilities = capability.capabilities; + } + + return 0; +} + +static int devscan_add(struct devscan *const scan, + enum v4l2_buf_type src_type, + uint32_t src_fmt_v4l2, + const char * vname, + const char * mname) +{ + struct decdev *d; + + if (scan->dev_size <= scan->dev_count) { + unsigned int n = !scan->dev_size ? 
4 : scan->dev_size * 2; + d = realloc(scan->devs, n * sizeof(*d)); + if (!d) + return -ENOMEM; + scan->devs = d; + scan->dev_size = n; + } + + d = scan->devs + scan->dev_count; + d->src_type = src_type; + d->src_fmt_v4l2 = src_fmt_v4l2; + d->vname = strdup(vname); + if (!d->vname) + return -ENOMEM; + d->mname = strdup(mname); + if (!d->mname) { + free((char *)d->vname); + return -ENOMEM; + } + ++scan->dev_count; + return 0; +} + +void devscan_delete(struct devscan **const pScan) +{ + unsigned int i; + struct devscan * const scan = *pScan; + + if (!scan) + return; + *pScan = NULL; + + for (i = 0; i < scan->dev_count; ++i) { + free((char*)scan->devs[i].mname); + free((char*)scan->devs[i].vname); + } + free(scan->devs); + free(scan); +} + +#define REQ_BUF_CAPS (\ + V4L2_BUF_CAP_SUPPORTS_DMABUF |\ + V4L2_BUF_CAP_SUPPORTS_REQUESTS |\ + V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) + +static void probe_formats(void * const dc, + struct devscan *const scan, + const int fd, + const unsigned int type_v4l2, + const char *const mpath, + const char *const vpath) +{ + unsigned int i; + for (i = 0;; ++i) { + struct v4l2_fmtdesc fmtdesc = { + .index = i, + .type = type_v4l2 + }; + struct v4l2_requestbuffers rbufs = { + .count = 0, + .type = type_v4l2, + .memory = V4L2_MEMORY_MMAP + }; + while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { + if (errno == EINTR) + continue; + if (errno != EINVAL) + request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2); + return; + } + if (!video_src_pixfmt_supported(fmtdesc.pixelformat)) + continue; + + if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) { + request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat); + continue; + } + + while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) { + if (errno != EINTR) { + request_debug(dc, "%s: Reqbufs failed\n", vpath); + continue; + } + } + + if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) { + request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities); + continue; + } + + request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n", + mpath, vpath, fmtdesc.pixelformat, type_v4l2); + devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath); + } +} + + +static int probe_video_device(void * const dc, + struct udev_device *const device, + struct devscan *const scan, + const char *const mpath) +{ + int ret; + unsigned int capabilities = 0; + int video_fd = -1; + + const char *path = udev_device_get_devnode(device); + if (!path) { + request_err(dc, "%s: get video device devnode failed\n", __func__); + ret = -EINVAL; + goto fail; + } + + video_fd = open(path, O_RDWR, 0); + if (video_fd == -1) { + ret = -errno; + request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); + goto fail; + } + + ret = v4l2_query_capabilities(video_fd, &capabilities); + if (ret < 0) { + request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities); + + if (!(capabilities & V4L2_CAP_STREAMING)) { + request_debug(dc, "%s: missing required streaming capability\n", __func__); + ret = -EINVAL; + goto fail; + } + + if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) { + request_debug(dc, "%s: missing required mem2mem capability\n", __func__); + ret = -EINVAL; + goto fail; + } + + /* Should check capture formats too... 
*/ + if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0) + probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path); + if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) + probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path); + + close(video_fd); + return 0; + +fail: + if (video_fd >= 0) + close(video_fd); + return ret; +} + +static int probe_media_device(void * const dc, + struct udev_device *const device, + struct devscan *const scan) +{ + int ret; + int rv; + struct media_device_info device_info = { 0 }; + struct media_v2_topology topology = { 0 }; + struct media_v2_interface *interfaces = NULL; + struct udev *udev = udev_device_get_udev(device); + struct udev_device *video_device; + dev_t devnum; + int media_fd = -1; + + const char *path = udev_device_get_devnode(device); + if (!path) { + request_err(dc, "%s: get media device devnode failed\n", __func__); + ret = -EINVAL; + goto fail; + } + + media_fd = open(path, O_RDWR, 0); + if (media_fd < 0) { + ret = -errno; + request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret); + goto fail; + } + + rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); + if (rv < 0) { + ret = -errno; + request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); + if (rv < 0) { + ret = -errno; + request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + if (topology.num_interfaces <= 0) { + request_err(dc, "%s: media device has no interfaces\n", __func__); + ret = -EINVAL; + goto fail; + } + + interfaces = calloc(topology.num_interfaces, sizeof(*interfaces)); + if (!interfaces) { + request_err(dc, "%s: allocating media interface struct failed\n", __func__); + ret = -ENOMEM; + goto fail; + } + + topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; + rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); + if (rv < 0) { + ret = -errno; + request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + for (int i = 0; i < topology.num_interfaces; i++) { + if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) + continue; + + devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); + video_device = udev_device_new_from_devnum(udev, 'c', devnum); + if (!video_device) { + ret = -errno; + request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device); + continue; + } + + ret = probe_video_device(dc, video_device, scan, path); + udev_device_unref(video_device); + + if (ret != 0) + goto fail; + } + +fail: + free(interfaces); + if (media_fd != -1) + close(media_fd); + return ret; +} + +const char *decdev_media_path(const struct decdev *const dev) +{ + return !dev ? NULL : dev->mname; +} + +const char *decdev_video_path(const struct decdev *const dev) +{ + return !dev ? NULL : dev->vname; +} + +enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) +{ + return !dev ? 0 : dev->src_type; +} + +uint32_t decdev_src_pixelformat(const struct decdev *const dev) +{ + return !dev ? 0 : dev->src_fmt_v4l2; +} + + +const struct decdev *devscan_find(struct devscan *const scan, + const uint32_t src_fmt_v4l2) +{ + unsigned int i; + + if (scan->env.mname && scan->env.vname) + return &scan->env; + + if (!src_fmt_v4l2) + return scan->dev_count ? 
scan->devs + 0 : NULL; + + for (i = 0; i != scan->dev_count; ++i) { + if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2) + return scan->devs + i; + } + return NULL; +} + +int devscan_build(void * const dc, struct devscan **pscan) +{ + int ret; + struct udev *udev; + struct udev_enumerate *enumerate; + struct udev_list_entry *devices; + struct udev_list_entry *entry; + struct udev_device *device; + struct devscan * scan; + + *pscan = NULL; + + scan = calloc(1, sizeof(*scan)); + if (!scan) { + ret = -ENOMEM; + goto fail; + } + + scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH"); + scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH"); + if (scan->env.mname && scan->env.vname) { + request_info(dc, "Media/video device env overrides found: %s,%s\n", + scan->env.mname, scan->env.vname); + *pscan = scan; + return 0; + } + + udev = udev_new(); + if (!udev) { + request_err(dc, "%s: allocating udev context failed\n", __func__); + ret = -ENOMEM; + goto fail; + } + + enumerate = udev_enumerate_new(udev); + if (!enumerate) { + request_err(dc, "%s: allocating udev enumerator failed\n", __func__); + ret = -ENOMEM; + goto fail; + } + + udev_enumerate_add_match_subsystem(enumerate, "media"); + udev_enumerate_scan_devices(enumerate); + + devices = udev_enumerate_get_list_entry(enumerate); + udev_list_entry_foreach(entry, devices) { + const char *path = udev_list_entry_get_name(entry); + if (!path) + continue; + + device = udev_device_new_from_syspath(udev, path); + if (!device) + continue; + + probe_media_device(dc, device, scan); + udev_device_unref(device); + } + + udev_enumerate_unref(enumerate); + + *pscan = scan; + return 0; + +fail: + udev_unref(udev); + devscan_delete(&scan); + return ret; +} + diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h new file mode 100644 index 0000000000..956d9234f1 --- /dev/null +++ b/libavcodec/v4l2_req_devscan.h @@ -0,0 +1,23 @@ +#ifndef _DEVSCAN_H_ +#define _DEVSCAN_H_ + +#include + +struct devscan; +struct decdev; +enum v4l2_buf_type; + +/* These return pointers to data in the devscan structure and so are vaild + * for the lifetime of that + */ +const char *decdev_media_path(const struct decdev *const dev); +const char *decdev_video_path(const struct decdev *const dev); +enum v4l2_buf_type decdev_src_type(const struct decdev *const dev); +uint32_t decdev_src_pixelformat(const struct decdev *const dev); + +const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2); + +int devscan_build(void * const dc, struct devscan **pscan); +void devscan_delete(struct devscan **const pScan); + +#endif diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c new file mode 100644 index 0000000000..ae6c648369 --- /dev/null +++ b/libavcodec/v4l2_req_dmabufs.c @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_utils.h" + +#define DMABUF_NAME1 "/dev/dma_heap/linux,cma" +#define DMABUF_NAME2 "/dev/dma_heap/reserved" + +#define TRACE_ALLOC 0 + +struct dmabufs_ctl { + int fd; + size_t page_size; +}; + +struct dmabuf_h { + int fd; + size_t size; + size_t len; + void * mapptr; +}; + +#if TRACE_ALLOC +static unsigned int total_bufs = 0; +static size_t total_size = 0; +#endif + +struct dmabuf_h * dmabuf_import(int fd, size_t size) +{ + struct dmabuf_h *dh; + + fd = dup(fd); + if (fd < 0 || size == 0) + return NULL; + + dh = malloc(sizeof(*dh)); + if (!dh) { + close(fd); + 
return NULL; + } + + *dh = (struct dmabuf_h) { + .fd = fd, + .size = size, + .mapptr = MAP_FAILED + }; + +#if TRACE_ALLOC + ++total_bufs; + total_size += dh->size; + request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); +#endif + + return dh; +} + +struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) +{ + struct dmabuf_h * dh; + struct dma_heap_allocation_data data = { + .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), + .fd = 0, + .fd_flags = O_RDWR, + .heap_flags = 0 + }; + + if (old != NULL) { + if (old->size == data.len) { + return old; + } + dmabuf_free(old); + } + + if (size == 0 || + (dh = malloc(sizeof(*dh))) == NULL) + return NULL; + + while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { + int err = errno; + request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", + (uint64_t)data.len, + dbsc->fd, + err, + strerror(err)); + if (err == EINTR) + continue; + goto fail; + } + + *dh = (struct dmabuf_h){ + .fd = data.fd, + .size = (size_t)data.len, + .mapptr = MAP_FAILED + }; + +#if TRACE_ALLOC + ++total_bufs; + total_size += dh->size; + request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); +#endif + + return dh; + +fail: + free(dh); + return NULL; +} + +int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) +{ + struct dma_buf_sync sync = { + .flags = flags + }; + while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { + const int err = errno; + if (errno == EINTR) + continue; + request_log("%s: ioctl failed: flags=%#x\n", __func__, flags); + return -err; + } + return 0; +} + +int dmabuf_write_start(struct dmabuf_h * const dh) +{ + return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE); +} + +int dmabuf_write_end(struct dmabuf_h * const dh) +{ + return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE); +} + +int dmabuf_read_start(struct dmabuf_h * const dh) +{ + if (!dmabuf_map(dh)) + return -1; + return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ); +} + +int dmabuf_read_end(struct dmabuf_h * const dh) +{ + return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ); +} + + +void * dmabuf_map(struct dmabuf_h * const dh) +{ + if (!dh) + return NULL; + if (dh->mapptr != MAP_FAILED) + return dh->mapptr; + dh->mapptr = mmap(NULL, dh->size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + dh->fd, 0); + if (dh->mapptr == MAP_FAILED) { + request_log("%s: Map failed\n", __func__); + return NULL; + } + return dh->mapptr; +} + +int dmabuf_fd(const struct dmabuf_h * const dh) +{ + if (!dh) + return -1; + return dh->fd; +} + +size_t dmabuf_size(const struct dmabuf_h * const dh) +{ + if (!dh) + return 0; + return dh->size; +} + +size_t dmabuf_len(const struct dmabuf_h * const dh) +{ + if (!dh) + return 0; + return dh->len; +} + +void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) +{ + dh->len = len; +} + + + +void dmabuf_free(struct dmabuf_h * dh) +{ + if (!dh) + return; + +#if TRACE_ALLOC + --total_bufs; + total_size -= dh->size; + request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); +#endif + + if (dh->mapptr != MAP_FAILED) + munmap(dh->mapptr, dh->size); + while (close(dh->fd) == -1 && errno == EINTR) + /* loop */; + free(dh); +} + +struct dmabufs_ctl * dmabufs_ctl_new(void) +{ + struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); + + if (!dbsc) + return NULL; + + while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && + errno == EINTR) 
+ /* Loop */; + + if (dbsc->fd == -1) { + while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && + errno == EINTR) + /* Loop */; + if (dbsc->fd == -1) { + request_log("Unable to open either %s or %s\n", + DMABUF_NAME1, DMABUF_NAME2); + goto fail; + } + } + + dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return dbsc; + +fail: + free(dbsc); + return NULL; +} + +void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) +{ + struct dmabufs_ctl * const dbsc = *pDbsc; + + if (!dbsc) + return; + *pDbsc = NULL; + + while (close(dbsc->fd) == -1 && errno == EINTR) + /* loop */; + + free(dbsc); +} + + diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h new file mode 100644 index 0000000000..cfb17e801d --- /dev/null +++ b/libavcodec/v4l2_req_dmabufs.h @@ -0,0 +1,40 @@ +#ifndef DMABUFS_H +#define DMABUFS_H + +#include + +struct dmabufs_ctl; +struct dmabuf_h; + +struct dmabufs_ctl * dmabufs_ctl_new(void); +void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); + +// Need not preserve old contents +// On NULL return old buffer is freed +struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size); + +static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) { + return dmabuf_realloc(dbsc, NULL, size); +} +/* Create from existing fd - dups(fd) */ +struct dmabuf_h * dmabuf_import(int fd, size_t size); +void * dmabuf_map(struct dmabuf_h * const dh); + +/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ +int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags); + +int dmabuf_write_start(struct dmabuf_h * const dh); +int dmabuf_write_end(struct dmabuf_h * const dh); +int dmabuf_read_start(struct dmabuf_h * const dh); +int dmabuf_read_end(struct dmabuf_h * const dh); + +int dmabuf_fd(const struct dmabuf_h * const dh); +/* Allocated size */ +size_t dmabuf_size(const struct dmabuf_h * const dh); +/* Bytes in use */ +size_t dmabuf_len(const struct dmabuf_h * const dh); +/* Set bytes in use */ +void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len); +void dmabuf_free(struct dmabuf_h * dh); + +#endif diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c new file mode 100644 index 0000000000..169b532832 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v1.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 1 +#include "v4l2_req_hevc_vx.c" + diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c new file mode 100644 index 0000000000..42af98e156 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v2.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 2 +#include "v4l2_req_hevc_vx.c" + diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c new file mode 100644 index 0000000000..0ae03b10c4 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -0,0 +1,1213 @@ +// File included by v4l2_req_hevc_v* - not compiled on its own + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" +#include "internal.h" +#include "thread.h" + +#include "v4l2_request_hevc.h" + +#if HEVC_CTRLS_VERSION == 1 +#include "hevc-ctrls-v1.h" + +// Fixup renamed entries +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT + +#elif HEVC_CTRLS_VERSION == 2 +#include "hevc-ctrls-v2.h" +#else +#error Unknown HEVC_CTRLS_VERSION +#endif + +#include "libavutil/hwcontext_drm.h" + +#include +#include + +#include "v4l2_req_devscan.h" +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_media.h" +#include "v4l2_req_utils.h" + 
+// Attached to buf[0] in frame +// Pooled in hwcontext so generally create once - 1/frame +typedef struct V4L2MediaReqDescriptor { + AVDRMFrameDescriptor drm; + + // Media + uint64_t timestamp; + struct qent_dst * qe_dst; + + // Decode only - should be NULL by the time we emit the frame + struct req_decode_ent decode_ent; + + struct media_request *req; + struct qent_src *qe_src; + +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params dec; +#endif + + size_t num_slices; + size_t alloced_slices; + struct v4l2_ctrl_hevc_slice_params * slice_params; + struct slice_info * slices; + +} V4L2MediaReqDescriptor; + +struct slice_info { + const uint8_t * ptr; + size_t len; // bytes +}; + +// Handy container for accumulating controls before setting +struct req_controls { + int has_scaling; + struct timeval tv; + struct v4l2_ctrl_hevc_sps sps; + struct v4l2_ctrl_hevc_pps pps; + struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; +}; + +//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; + + +// Get an FFmpeg format from the v4l2 format +static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format) +{ + switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? + format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) { + case V4L2_PIX_FMT_YUV420: + return AV_PIX_FMT_YUV420P; + case V4L2_PIX_FMT_NV12: + return AV_PIX_FMT_NV12; +#if CONFIG_SAND + case V4L2_PIX_FMT_NV12_COL128: + return AV_PIX_FMT_RPI4_8; + case V4L2_PIX_FMT_NV12_10_COL128: + return AV_PIX_FMT_RPI4_10; +#endif + default: + break; + } + return AV_PIX_FMT_NONE; +} + +static inline uint64_t frame_capture_dpb(const AVFrame * const frame) +{ + const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; + return rd->timestamp; +} + +static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp) +{ + V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; + rd->timestamp = dpb_stamp; +} + +static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) +{ + int32_t luma_weight_denom, chroma_weight_denom; + const SliceHeader *sh = &h->sh; + + if (sh->slice_type == HEVC_SLICE_I || + (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || + (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) + return; + + table->luma_log2_weight_denom = sh->luma_log2_weight_denom; + + if (h->ps.sps->chroma_format_idc) + table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; + + luma_weight_denom = (1 << sh->luma_log2_weight_denom); + chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); + + for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { + table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; + table->luma_offset_l0[i] = sh->luma_offset_l0[i]; + table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; + table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; + table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; + table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; + } + + if (sh->slice_type != HEVC_SLICE_B) + return; + + for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { + table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; + table->luma_offset_l1[i] = sh->luma_offset_l1[i]; + table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; + table->delta_chroma_weight_l1[i][1] = 
sh->chroma_weight_l1[i][1] - chroma_weight_denom; + table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; + table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; + } +} + +static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) +{ + const HEVCFrame *frame; + int i; + + for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { + frame = h->rps[ST_CURR_BEF].ref[i]; + if (frame && timestamp == frame_capture_dpb(frame->frame)) + return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; + } + + for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { + frame = h->rps[ST_CURR_AFT].ref[i]; + if (frame && timestamp == frame_capture_dpb(frame->frame)) + return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; + } + + for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { + frame = h->rps[LT_CURR].ref[i]; + if (frame && timestamp == frame_capture_dpb(frame->frame)) + return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; + } + + return 0; +} + +static unsigned int +get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, + const struct v4l2_hevc_dpb_entry * const entries, + const unsigned int num_entries) +{ + uint64_t timestamp; + + if (!frame) + return 0; + + timestamp = frame_capture_dpb(frame->frame); + + for (unsigned int i = 0; i < num_entries; i++) { + if (entries[i].timestamp == timestamp) + return i; + } + + return 0; +} + +static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) +{ + unsigned int z = 0; + while (idx--) { + if (*b++ == 0) { + ++z; + if (z >= 2 && *b == 3) { + ++b; + z = 0; + } + } + else { + z = 0; + } + } + return b; +} + +static int slice_add(V4L2MediaReqDescriptor * const rd) +{ + if (rd->num_slices >= rd->alloced_slices) { + struct v4l2_ctrl_hevc_slice_params * p2; + struct slice_info * s2; + size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; + + p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); + if (p2 == NULL) + return AVERROR(ENOMEM); + rd->slice_params = p2; + + s2 = av_realloc_array(rd->slices, n2, sizeof(*s2)); + if (s2 == NULL) + return AVERROR(ENOMEM); + rd->slices = s2; + + rd->alloced_slices = n2; + } + ++rd->num_slices; + return 0; +} + +static unsigned int +fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) +{ + unsigned int i; + unsigned int n = 0; + const HEVCFrame * const pic = h->ref; + + for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { + const HEVCFrame * const frame = &h->DPB[i]; + if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { + struct v4l2_hevc_dpb_entry * const entry = entries + n++; + + entry->timestamp = frame_capture_dpb(frame->frame); + entry->rps = find_frame_rps_type(h, entry->timestamp); + entry->field_pic = frame->frame->interlaced_frame; + + /* TODO: Interleaved: Get the POC for each field. 
*/ + entry->pic_order_cnt[0] = frame->poc; + entry->pic_order_cnt[1] = frame->poc; + } + } + return n; +} + +static void fill_slice_params(const HEVCContext * const h, +#if HEVC_CTRLS_VERSION >= 2 + const struct v4l2_ctrl_hevc_decode_params * const dec, +#endif + struct v4l2_ctrl_hevc_slice_params *slice_params, + uint32_t bit_size, uint32_t bit_offset) +{ + const SliceHeader * const sh = &h->sh; +#if HEVC_CTRLS_VERSION >= 2 + const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; + const unsigned int dpb_n = dec->num_active_dpb_entries; +#else + struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; + unsigned int dpb_n; +#endif + unsigned int i; + RefPicList *rpl; + + *slice_params = (struct v4l2_ctrl_hevc_slice_params) { + .bit_size = bit_size, + .data_bit_offset = bit_offset, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + .slice_segment_addr = sh->slice_segment_addr, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + .nal_unit_type = h->nal_unit_type, + .nuh_temporal_id_plus1 = h->temporal_id + 1, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + .slice_type = sh->slice_type, + .colour_plane_id = sh->colour_plane_id, + .slice_pic_order_cnt = h->ref->poc, + .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, + .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, + .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, + .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, + .slice_qp_delta = sh->slice_qp_delta, + .slice_cb_qp_offset = sh->slice_cb_qp_offset, + .slice_cr_qp_offset = sh->slice_cr_qp_offset, + .slice_act_y_qp_offset = 0, + .slice_act_cb_qp_offset = 0, + .slice_act_cr_qp_offset = 0, + .slice_beta_offset_div2 = sh->beta_offset / 2, + .slice_tc_offset_div2 = sh->tc_offset / 2, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + .pic_struct = h->sei.picture_timing.picture_struct, + +#if HEVC_CTRLS_VERSION < 2 + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, + .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, + .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, +#endif + }; + + if (sh->slice_sample_adaptive_offset_flag[0]) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; + + if (sh->slice_sample_adaptive_offset_flag[1]) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; + + if (sh->slice_temporal_mvp_enabled_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; + + if (sh->mvd_l1_zero_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; + + if (sh->cabac_init_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; + + if (sh->collocated_list == L0) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; + + if (sh->disable_deblocking_filter_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; + + if (sh->slice_loop_filter_across_slices_enabled_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; + + if (sh->dependent_slice_segment_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; + +#if HEVC_CTRLS_VERSION < 2 + dpb_n = fill_dpb_entries(h, dpb); + slice_params->num_active_dpb_entries = dpb_n; +#endif + + if (sh->slice_type != HEVC_SLICE_I) { + rpl = &h->ref->refPicList[0]; + for (i = 0; i < rpl->nb_refs; i++) + slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); + } + + if (sh->slice_type == HEVC_SLICE_B) { + rpl = &h->ref->refPicList[1]; + for (i = 0; i < rpl->nb_refs; i++) + slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); + } + + fill_pred_table(h, &slice_params->pred_weight_table); + + slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; + if (slice_params->num_entry_point_offsets > 256) { + slice_params->num_entry_point_offsets = 256; + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); + } + + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; +} + +#if HEVC_CTRLS_VERSION >= 2 +static void +fill_decode_params(const HEVCContext * const h, + struct v4l2_ctrl_hevc_decode_params * const dec) +{ + unsigned int i; + + *dec = (struct v4l2_ctrl_hevc_decode_params){ + .pic_order_cnt_val = h->poc, + .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, + .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, + .num_poc_lt_curr = h->rps[LT_CURR].nb_refs, + }; + + dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb); + + // The docn does seem to ask that we fit our 32 bit signed POC into + // a U8 so... 
(To be fair 16 bits would be enough) + // Luckily we (Pi) don't use these fields + for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) + dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc; + for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i) + dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc; + for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i) + dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc; + + if (IS_IRAP(h)) + dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; + if (IS_IDR(h)) + dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; + if (h->sh.no_output_of_prior_pics_flag) + dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR; + +} +#endif + +static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps) +{ + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ + *ctrl = (struct v4l2_ctrl_hevc_sps) { + .chroma_format_idc = sps->chroma_format_idc, + .pic_width_in_luma_samples = sps->width, + .pic_height_in_luma_samples = sps->height, + .bit_depth_luma_minus8 = sps->bit_depth - 8, + .bit_depth_chroma_minus8 = sps->bit_depth - 8, + .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, + .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, + .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, + .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, + .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, + .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, + .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, + .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, + .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, + .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, + .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, + .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, + .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, + .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, + .num_short_term_ref_pic_sets = sps->nb_st_rps, + .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, + .chroma_format_idc = sps->chroma_format_idc, + .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1, + }; + + if (sps->separate_colour_plane_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; + + if (sps->scaling_list_enable_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; + + if (sps->amp_enabled_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; + + if (sps->sao_enabled) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; + + if (sps->pcm_enabled_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; + + if (sps->pcm.loop_filter_disable_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; + + if (sps->long_term_ref_pics_present_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; + + if (sps->sps_temporal_mvp_enabled_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; + + if (sps->sps_strong_intra_smoothing_enable_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; +} + +static void fill_scaling_matrix(const ScalingList * const sl, + struct v4l2_ctrl_hevc_scaling_matrix * const sm) +{ + unsigned int i; + + for 
(i = 0; i < 6; i++) { + unsigned int j; + + for (j = 0; j < 16; j++) + sm->scaling_list_4x4[i][j] = sl->sl[0][i][j]; + for (j = 0; j < 64; j++) { + sm->scaling_list_8x8[i][j] = sl->sl[1][i][j]; + sm->scaling_list_16x16[i][j] = sl->sl[2][i][j]; + if (i < 2) + sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; + } + sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; + if (i < 2) + sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; + } +} + +static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) +{ + uint64_t flags = 0; + + if (pps->dependent_slice_segments_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; + + if (pps->output_flag_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; + + if (pps->sign_data_hiding_flag) + flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; + + if (pps->cabac_init_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; + + if (pps->constrained_intra_pred_flag) + flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; + + if (pps->transform_skip_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; + + if (pps->cu_qp_delta_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; + + if (pps->pic_slice_level_chroma_qp_offsets_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; + + if (pps->weighted_pred_flag) + flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; + + if (pps->weighted_bipred_flag) + flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; + + if (pps->transquant_bypass_enable_flag) + flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; + + if (pps->tiles_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; + + if (pps->entropy_coding_sync_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; + + if (pps->loop_filter_across_tiles_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; + + if (pps->seq_loop_filter_across_slices_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; + + if (pps->deblocking_filter_override_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; + + if (pps->disable_dbf) + flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; + + if (pps->lists_modification_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; + + if (pps->slice_header_extension_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ + *ctrl = (struct v4l2_ctrl_hevc_pps) { + .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, + .init_qp_minus26 = pps->pic_init_qp_minus26, + .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, + .pps_cb_qp_offset = pps->cb_qp_offset, + .pps_cr_qp_offset = pps->cr_qp_offset, + .pps_beta_offset_div2 = pps->beta_offset / 2, + .pps_tc_offset_div2 = pps->tc_offset / 2, + .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, + .flags = flags + }; + + + if (pps->tiles_enabled_flag) { + ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1; + ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1; + + for (int i = 0; i < pps->num_tile_columns; i++) + ctrl->column_width_minus1[i] = pps->column_width[i] - 1; + + for (int i = 0; i < pps->num_tile_rows; i++) + ctrl->row_height_minus1[i] = pps->row_height[i] - 1; + } +} + +// Called before finally returning the frame to the user +// Set corrupt flag here as this is actually the frame structure that +// is going to the user (in MT land each thread has its own pool) +static int frame_post_process(void *logctx, AVFrame *frame) +{ + V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; + +// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); + frame->flags &= ~AV_FRAME_FLAG_CORRUPT; + if (rd->qe_dst) { + MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__); + frame->flags |= AV_FRAME_FLAG_CORRUPT; + } + } + + return 0; +} + +static inline struct timeval cvt_dpb_to_tv(uint64_t t) +{ + t /= 1000; + return (struct timeval){ + .tv_usec = t % 1000000, + .tv_sec = t / 1000000 + }; +} + +static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t) +{ + return (uint64_t)t * 1000; +} + +static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + const HEVCContext *h = avctx->priv_data; + V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + +// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); + decode_q_add(&ctx->decode_q, &rd->decode_ent); + + rd->num_slices = 0; + ctx->timestamp++; + rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp); + + { + FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data; + fdd->post_process = frame_post_process; + } + + // qe_dst needs to be bound to the data buffer and only returned when that is + if (!rd->qe_dst) + { + if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); + return AVERROR(ENOMEM); + } + } + + ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame + + return 0; +} + +// Object fd & size will be zapped by this & need setting later +static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) +{ + AVDRMLayerDescriptor *layer = &desc->layers[0]; + unsigned int width; + unsigned int height; + unsigned int bpl; + uint32_t pixelformat; + + if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { + width = format->fmt.pix_mp.width; + height = format->fmt.pix_mp.height; + pixelformat = format->fmt.pix_mp.pixelformat; + bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline; + } + else { + width = format->fmt.pix.width; + height = format->fmt.pix.height; + pixelformat = 
format->fmt.pix.pixelformat; + bpl = format->fmt.pix.bytesperline; + } + + switch (pixelformat) { + case V4L2_PIX_FMT_NV12: + layer->format = DRM_FORMAT_NV12; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#if CONFIG_SAND + case V4L2_PIX_FMT_NV12_COL128: + layer->format = DRM_FORMAT_NV12; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); + break; + case V4L2_PIX_FMT_NV12_10_COL128: + layer->format = DRM_FORMAT_P030; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); + break; +#endif +#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED + case V4L2_PIX_FMT_SUNXI_TILED_NV12: + layer->format = DRM_FORMAT_NV12; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; + break; +#endif +#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) + case V4L2_PIX_FMT_NV15: + layer->format = DRM_FORMAT_NV15; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#endif + case V4L2_PIX_FMT_NV16: + layer->format = DRM_FORMAT_NV16; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) + case V4L2_PIX_FMT_NV20: + layer->format = DRM_FORMAT_NV20; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#endif + default: + return -1; + } + + desc->nb_objects = 1; + desc->objects[0].fd = -1; + desc->objects[0].size = 0; + + desc->nb_layers = 1; + layer->nb_planes = 2; + + layer->planes[0].object_index = 0; + layer->planes[0].offset = 0; + layer->planes[0].pitch = bpl; +#if CONFIG_SAND + if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { + layer->planes[1].object_index = 0; + layer->planes[1].offset = height * 128; + layer->planes[0].pitch = width; + layer->planes[1].pitch = width; + } + else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { + layer->planes[1].object_index = 0; + layer->planes[1].offset = height * 128; + layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy + layer->planes[1].pitch = width * 2; + } + else +#endif + { + layer->planes[1].object_index = 0; + layer->planes[1].offset = layer->planes[0].pitch * height; + layer->planes[1].pitch = layer->planes[0].pitch; + } + + return 0; +} + +static int +set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + struct req_controls *const controls, +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params * const dec, +#endif + struct v4l2_ctrl_hevc_slice_params * const slices, + const unsigned int slice_no, + const unsigned int slice_count) +{ + int rv; + + struct v4l2_ext_control control[] = { + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, + .ptr = &controls->sps, + .size = sizeof(controls->sps), + }, + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, +#if HEVC_CTRLS_VERSION >= 2 + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, + .ptr = dec, + .size = sizeof(*dec), + }, +#endif + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, + .ptr = slices + slice_no, + .size = sizeof(*slices) * slice_count, + }, + // Optional + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, + .ptr = &controls->scaling_matrix, + .size = sizeof(controls->scaling_matrix), + }, + }; + + rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, + controls->has_scaling ? 
+ FF_ARRAY_ELEMS(control) : + FF_ARRAY_ELEMS(control) - 1); + + return rv; +} + +static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + const HEVCContext * const h = avctx->priv_data; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; + int bcount = get_bits_count(&h->HEVClc->gb); + uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + + int rv; + struct slice_info * si; + + if ((rv = slice_add(rd)) != 0) + return rv; + + si = rd->slices + rd->num_slices - 1; + si->ptr = buffer; + si->len = size; + + if (ctx->multi_slice && rd->num_slices > 1) { + struct slice_info *const si0 = rd->slices; + const size_t offset = (buffer - si0->ptr); + boff += offset * 8; + size += offset; + si0->len = si->len + offset; + } + +#if HEVC_CTRLS_VERSION >= 2 + if (rd->num_slices == 1) + fill_decode_params(h, &rd->dec); + fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); +#else + fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); +#endif + + return 0; +} + +static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) +{ + const HEVCContext * const h = avctx->priv_data; + if (h->ref != NULL) { + V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + + media_request_abort(&rd->req); + mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src); + + decode_q_remove(&ctx->decode_q, &rd->decode_ent); + } +} + +static int send_slice(AVCodecContext * const avctx, + V4L2MediaReqDescriptor * const rd, + struct req_controls *const controls, + const unsigned int i, const unsigned int j) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + + struct slice_info *const si = rd->slices + i; + struct media_request * req = NULL; + struct qent_src * src = NULL; + MediaBufsStatus stat; + + if ((req = media_request_get(ctx->mpool)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); + return AVERROR(ENOMEM); + } + + if (set_req_ctls(ctx, req, + controls, +#if HEVC_CTRLS_VERSION >= 2 + &rd->dec, +#endif + rd->slice_params, + i, j - i)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); + goto fail1; + } + + if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__); + goto fail1; + } + + if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__); + goto fail2; + } + + if (qent_src_params_set(src, &controls->tv)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__); + goto fail2; + } + +#warning ANNEX_B start code +// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { +// } + + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? 
rd->qe_dst : NULL, + j == rd->num_slices); + + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); + return AVERROR_UNKNOWN; + } + return 0; + +fail2: + mediabufs_src_qent_abort(ctx->mbufs, &src); +fail1: + media_request_abort(&req); + return AVERROR_UNKNOWN; +} + +static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) +{ + const HEVCContext * const h = avctx->priv_data; + V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; + V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; + struct req_controls rc; + unsigned int i; + int rv; + + // It is possible, though maybe a bug, to get an end_frame without + // a previous start_frame. If we do then give up. + if (!decode_q_in_q(&rd->decode_ent)) { + av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__); + return AVERROR_INVALIDDATA; + } + + { + const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ? + &h->ps.pps->scaling_list : + h->ps.sps->scaling_list_enable_flag ? + &h->ps.sps->scaling_list : NULL; + + + memset(&rc, 0, sizeof(rc)); + rc.tv = cvt_dpb_to_tv(rd->timestamp); + fill_sps(&rc.sps, h->ps.sps); + fill_pps(&rc.pps, h->ps.pps); + if (sl) { + rc.has_scaling = 1; + fill_scaling_matrix(sl, &rc.scaling_matrix); + } + } + + decode_q_wait(&ctx->decode_q, &rd->decode_ent); + + // qe_dst needs to be bound to the data buffer and only returned when that is + // Alloc almost certainly wants to be serialised if there is any chance of blocking + // so we get the next frame to be free in the thread that needs it for decode first. + // + // In our current world this probably isn't a concern but put it here anyway + if (!rd->qe_dst) + { + if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); + rv = AVERROR(ENOMEM); + goto fail; + } + } + + // Send as slices + if (ctx->multi_slice) + { + if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) + goto fail; + } + else + { + for (i = 0; i != rd->num_slices; ++i) { + if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) + goto fail; + } + } + + // Set the drm_prime desriptor + drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); + rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0)); + rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0)); + + decode_q_remove(&ctx->decode_q, &rd->decode_ent); + return 0; + +fail: + decode_q_remove(&ctx->decode_q, &rd->decode_ent); + return rv; +} + +// Initial check & init +static int +probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) +{ + const HEVCContext *h = avctx->priv_data; + const HEVCSPS * const sps = h->ps.sps; + struct v4l2_ctrl_hevc_sps ctrl_sps; + unsigned int i; + + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, +#if HEVC_CTRLS_VERSION >= 2 + { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, +#endif + }; + // Order & size must match! 
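+    // qc[] above and ctrl_sizes[] below are parallel arrays: ctrl_sizes[N]
+    // must be the sizeof() of the struct carried by control qc[N].id, so a
+    // new control has to be added to both in the same position. The loop
+    // further down fails the probe if the driver's reported elem_size
+    // differs from the struct we were compiled against (e.g. kernel /
+    // header version skew).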
+ static const size_t ctrl_sizes[] = { + sizeof(struct v4l2_ctrl_hevc_slice_params), + sizeof(struct v4l2_ctrl_hevc_sps), + sizeof(struct v4l2_ctrl_hevc_pps), + sizeof(struct v4l2_ctrl_hevc_scaling_matrix), +#if HEVC_CTRLS_VERSION >= 2 + sizeof(struct v4l2_ctrl_hevc_decode_params), +#endif + }; + const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); + + if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); + return AVERROR(EINVAL); + } + for (i = 0; i != noof_ctrls; ++i) { + if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", + HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); + return AVERROR(EINVAL); + } + } + + fill_sps(&ctrl_sps, sps); + + if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); + return AVERROR(EINVAL); + } + + ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; + return 0; +} + +// Final init +static int +set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) +{ + int ret; + + struct v4l2_query_ext_ctrl querys[] = { + { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, + }; + + struct v4l2_ext_control ctrls[] = { + { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, + }; + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + + ctx->decode_mode = querys[0].default_value; + + if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && + ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); + return AVERROR(EINVAL); + } + + ctx->start_code = querys[1].default_value; + if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && + ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); + return AVERROR(EINVAL); + } + + ctx->max_slices = querys[2].elems; + if (ctx->max_slices > MAX_SLICES) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); + return AVERROR(EINVAL); + } + + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; + + ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); + return !ret ? 
0 : AVERROR(-ret); +} + +static void v4l2_req_frame_free(void *opaque, uint8_t *data) +{ + AVCodecContext *avctx = opaque; + V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data; + + av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data); + + qent_dst_unref(&rd->qe_dst); + + // We don't expect req or qe_src to be set + if (rd->req || rd->qe_src) + av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src); + + av_freep(&rd->slices); + av_freep(&rd->slice_params); + + av_free(rd); +} + +static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size) +{ + AVCodecContext *avctx = opaque; +// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; +// V4L2MediaReqDescriptor *req; + AVBufferRef *ref; + uint8_t *data; +// int ret; + + data = av_mallocz(size); + if (!data) + return NULL; + + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); + ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0); + if (!ref) { + av_freep(&data); + return NULL; + } + return ref; +} + +#if 0 +static void v4l2_req_pool_free(void *opaque) +{ + av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); +} + +static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc) +{ + av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); + + av_buffer_pool_uninit(&hwfc->pool); +} +#endif + +static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) +{ + V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; + AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data; + const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs); + + hwfc->format = AV_PIX_FMT_DRM_PRIME; + hwfc->sw_format = pixel_format_from_format(vfmt); + if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { + hwfc->width = vfmt->fmt.pix_mp.width; + hwfc->height = vfmt->fmt.pix_mp.height; + } else { + hwfc->width = vfmt->fmt.pix.width; + hwfc->height = vfmt->fmt.pix.height; + } +#if 0 + hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); + if (!hwfc->pool) + return AVERROR(ENOMEM); + + hwfc->free = v4l2_req_hwframe_ctx_free; + + hwfc->initial_pool_size = 1; + + switch (avctx->codec_id) { + case AV_CODEC_ID_VP9: + hwfc->initial_pool_size += 8; + break; + case AV_CODEC_ID_VP8: + hwfc->initial_pool_size += 3; + break; + default: + hwfc->initial_pool_size += 2; + } +#endif + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); + + return 0; +} + +static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) +{ + int rv; + + frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); + if (!frame->buf[0]) + return AVERROR(ENOMEM); + + frame->data[0] = frame->buf[0]->data; + + frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); + + if ((rv = ff_attach_decode_data(frame)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); + av_frame_unref(frame); + return rv; + } + + return 0; +} + +const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { + .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, + .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION), + .probe = probe, + .set_controls = set_controls, + + .start_frame = v4l2_request_hevc_start_frame, + .decode_slice = 
v4l2_request_hevc_decode_slice, + .end_frame = v4l2_request_hevc_end_frame, + .abort_frame = v4l2_request_hevc_abort_frame, + .frame_params = frame_params, + .alloc_frame = alloc_frame, +}; + diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c new file mode 100644 index 0000000000..eb00ecb406 --- /dev/null +++ b/libavcodec/v4l2_req_media.c @@ -0,0 +1,1596 @@ +/* + * Copyright (C) 2018 Paul Kocialkowski + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_media.h" +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_utils.h" +#include "weak_link.h" + + +/* floor(log2(x)) */ +static unsigned int log2_size(size_t x) +{ + unsigned int n = 0; + + if (x & ~0xffff) { + n += 16; + x >>= 16; + } + if (x & ~0xff) { + n += 8; + x >>= 8; + } + if (x & ~0xf) { + n += 4; + x >>= 4; + } + if (x & ~3) { + n += 2; + x >>= 2; + } + return (x & ~1) ? n + 1 : n; +} + +static size_t round_up_size(const size_t x) +{ + /* Admit no size < 256 */ + const unsigned int n = x < 256 ? 8 : log2_size(x) - 1; + + return x >= (3 << n) ? 
4 << n : (3 << n); +} + +struct media_request; + +struct media_pool { + int fd; + sem_t sem; + pthread_mutex_t lock; + struct media_request * free_reqs; + struct pollqueue * pq; +}; + +struct media_request { + struct media_request * next; + struct media_pool * mp; + int fd; + struct polltask * pt; +}; + + +static inline int do_trywait(sem_t *const sem) +{ + while (sem_trywait(sem)) { + if (errno != EINTR) + return -errno; + } + return 0; +} + +static inline int do_wait(sem_t *const sem) +{ + while (sem_wait(sem)) { + if (errno != EINTR) + return -errno; + } + return 0; +} + +static int request_buffers(int video_fd, unsigned int type, + enum v4l2_memory memory, unsigned int buffers_count) +{ + struct v4l2_requestbuffers buffers; + int rc; + + memset(&buffers, 0, sizeof(buffers)); + buffers.type = type; + buffers.memory = memory; + buffers.count = buffers_count; + + rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); + if (rc < 0) { + rc = -errno; + request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc)); + return rc; + } + + return 0; +} + + +static int set_stream(int video_fd, unsigned int type, bool enable) +{ + enum v4l2_buf_type buf_type = type; + int rc; + + rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, + &buf_type); + if (rc < 0) { + rc = -errno; + request_log("Unable to %sable stream: %s\n", + enable ? "en" : "dis", strerror(-rc)); + return rc; + } + + return 0; +} + + + +struct media_request * media_request_get(struct media_pool * const mp) +{ + struct media_request *req = NULL; + + /* Timeout handled by poll code */ + if (do_wait(&mp->sem)) + return NULL; + + pthread_mutex_lock(&mp->lock); + req = mp->free_reqs; + if (req) { + mp->free_reqs = req->next; + req->next = NULL; + } + pthread_mutex_unlock(&mp->lock); + return req; +} + +int media_request_fd(const struct media_request * const req) +{ + return req->fd; +} + +int media_request_start(struct media_request * const req) +{ + while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1) + { + const int err = errno; + if (err == EINTR) + continue; + request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err)); + return -err; + } + + pollqueue_add_task(req->pt, 2000); + return 0; +} + +static void media_request_done(void *v, short revents) +{ + struct media_request *const req = v; + struct media_pool *const mp = req->mp; + + /* ** Not sure what to do about timeout */ + + if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) + request_log("Unable to reinit media request: %s\n", + strerror(errno)); + + pthread_mutex_lock(&mp->lock); + req->next = mp->free_reqs; + mp->free_reqs = req; + pthread_mutex_unlock(&mp->lock); + sem_post(&mp->sem); +} + +int media_request_abort(struct media_request ** const preq) +{ + struct media_request * const req = *preq; + + if (req == NULL) + return 0; + *preq = NULL; + + media_request_done(req, 0); + return 0; +} + +static void delete_req_chain(struct media_request * const chain) +{ + struct media_request * next = chain; + while (next) { + struct media_request * const req = next; + next = req->next; + if (req->pt) + polltask_delete(&req->pt); + if (req->fd != -1) + close(req->fd); + free(req); + } +} + +struct media_pool * media_pool_new(const char * const media_path, + struct pollqueue * const pq, + const unsigned int n) +{ + struct media_pool * const mp = calloc(1, sizeof(*mp)); + unsigned int i; + + if (!mp) + goto fail0; + + mp->pq = pq; + pthread_mutex_init(&mp->lock, NULL); + mp->fd = open(media_path, O_RDWR | O_NONBLOCK); + 
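+    // From here: if the media device opened OK, build the free-request
+    // chain - each of the n requests is allocated with
+    // MEDIA_IOC_REQUEST_ALLOC and given a POLLPRI polltask
+    // (media_request_done) that reinits a completed request and puts it
+    // back on the chain; the semaphore, inited to n, is what
+    // media_request_get waits on before popping a free request.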
if (mp->fd == -1) { + request_log("Failed to open '%s': %s\n", media_path, strerror(errno)); + goto fail1; + } + + for (i = 0; i != n; ++i) { + struct media_request * req = malloc(sizeof(*req)); + if (!req) + goto fail4; + + *req = (struct media_request){ + .next = mp->free_reqs, + .mp = mp, + .fd = -1 + }; + mp->free_reqs = req; + + if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) { + request_log("Failed to alloc request %d: %s\n", i, strerror(errno)); + goto fail4; + } + + req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req); + if (!req->pt) + goto fail4; + } + + sem_init(&mp->sem, 0, n); + + return mp; + +fail4: + delete_req_chain(mp->free_reqs); + close(mp->fd); + pthread_mutex_destroy(&mp->lock); +fail1: + free(mp); +fail0: + return NULL; +} + +void media_pool_delete(struct media_pool ** pMp) +{ + struct media_pool * const mp = *pMp; + + if (!mp) + return; + *pMp = NULL; + + delete_req_chain(mp->free_reqs); + close(mp->fd); + sem_destroy(&mp->sem); + pthread_mutex_destroy(&mp->lock); + free(mp); +} + + +#define INDEX_UNSET (~(uint32_t)0) + +enum qent_status { + QENT_NEW = 0, // Initial state - shouldn't last + QENT_FREE, // On free chain + QENT_PENDING, // User has ent + QENT_WAITING, // On inuse + QENT_DONE, // Frame rx + QENT_ERROR, // Error + QENT_IMPORT +}; + +struct qent_base { + atomic_int ref_count; + struct qent_base *next; + struct qent_base *prev; + enum qent_status status; + uint32_t index; + struct dmabuf_h *dh[VIDEO_MAX_PLANES]; + struct timeval timestamp; +}; + +struct qent_src { + struct qent_base base; + int fixed_size; +}; + +struct qent_dst { + struct qent_base base; + bool waiting; + pthread_mutex_t lock; + pthread_cond_t cond; + struct ff_weak_link_client * mbc_wl; +}; + +struct qe_list_head { + struct qent_base *head; + struct qent_base *tail; +}; + +struct buf_pool { + pthread_mutex_t lock; + sem_t free_sem; + enum v4l2_buf_type buf_type; + struct qe_list_head free; + struct qe_list_head inuse; +}; + + +static inline struct qent_dst *base_to_dst(struct qent_base *be) +{ + return (struct qent_dst *)be; +} + +static inline struct qent_src *base_to_src(struct qent_base *be) +{ + return (struct qent_src *)be; +} + + +#define QENT_BASE_INITIALIZER {\ + .ref_count = ATOMIC_VAR_INIT(0),\ + .status = QENT_NEW,\ + .index = INDEX_UNSET\ +} + +static void qe_base_uninit(struct qent_base *const be) +{ + unsigned int i; + for (i = 0; i != VIDEO_MAX_PLANES; ++i) { + dmabuf_free(be->dh[i]); + be->dh[i] = NULL; + } +} + +static void qe_src_free(struct qent_src *const be_src) +{ + if (!be_src) + return; + qe_base_uninit(&be_src->base); + free(be_src); +} + +static struct qent_src * qe_src_new(void) +{ + struct qent_src *const be_src = malloc(sizeof(*be_src)); + if (!be_src) + return NULL; + *be_src = (struct qent_src){ + .base = QENT_BASE_INITIALIZER + }; + return be_src; +} + +static void qe_dst_free(struct qent_dst *const be_dst) +{ + if (!be_dst) + return; + + ff_weak_link_unref(&be_dst->mbc_wl); + pthread_cond_destroy(&be_dst->cond); + pthread_mutex_destroy(&be_dst->lock); + qe_base_uninit(&be_dst->base); + free(be_dst); +} + +static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) +{ + struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); + if (!be_dst) + return NULL; + *be_dst = (struct qent_dst){ + .base = QENT_BASE_INITIALIZER, + .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + .mbc_wl = ff_weak_link_ref(wl) + }; + return be_dst; +} + +static void ql_add_tail(struct qe_list_head * const ql, struct 
qent_base * be) +{ + if (ql->tail) + ql->tail->next = be; + else + ql->head = be; + be->prev = ql->tail; + be->next = NULL; + ql->tail = be; +} + +static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be) +{ + if (!be) + return NULL; + + if (be->next) + be->next->prev = be->prev; + else + ql->tail = be->prev; + if (be->prev) + be->prev->next = be->next; + else + ql->head = be->next; + be->next = NULL; + be->prev = NULL; + return be; +} + + +static void bq_put_free(struct buf_pool *const bp, struct qent_base * be) +{ + ql_add_tail(&bp->free, be); +} + +static struct qent_base * bq_get_free(struct buf_pool *const bp) +{ + return ql_extract(&bp->free, bp->free.head); +} + +static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be) +{ + return ql_extract(&bp->inuse, be); +} + +static struct qent_base * bq_get_inuse(struct buf_pool *const bp) +{ + return ql_extract(&bp->inuse, bp->inuse.head); +} + +static void bq_free_all_free_src(struct buf_pool *const bp) +{ + struct qent_base *be; + while ((be = bq_get_free(bp)) != NULL) + qe_src_free(base_to_src(be)); +} + +static void bq_free_all_inuse_src(struct buf_pool *const bp) +{ + struct qent_base *be; + while ((be = bq_get_inuse(bp)) != NULL) + qe_src_free(base_to_src(be)); +} + +static void bq_free_all_free_dst(struct buf_pool *const bp) +{ + struct qent_base *be; + while ((be = bq_get_free(bp)) != NULL) + qe_dst_free(base_to_dst(be)); +} + +static void queue_put_free(struct buf_pool *const bp, struct qent_base *be) +{ + unsigned int i; + + pthread_mutex_lock(&bp->lock); + /* Clear out state vars */ + be->timestamp.tv_sec = 0; + be->timestamp.tv_usec = 0; + be->status = QENT_FREE; + for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) + dmabuf_len_set(be->dh[i], 0); + bq_put_free(bp, be); + pthread_mutex_unlock(&bp->lock); + sem_post(&bp->free_sem); +} + +static bool queue_is_inuse(const struct buf_pool *const bp) +{ + return bp->inuse.tail != NULL; +} + +static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be) +{ + if (!be) + return; + pthread_mutex_lock(&bp->lock); + ql_add_tail(&bp->inuse, be); + be->status = QENT_WAITING; + pthread_mutex_unlock(&bp->lock); +} + +static struct qent_base *queue_get_free(struct buf_pool *const bp) +{ + struct qent_base *buf; + + if (do_wait(&bp->free_sem)) + return NULL; + pthread_mutex_lock(&bp->lock); + buf = bq_get_free(bp); + pthread_mutex_unlock(&bp->lock); + return buf; +} + +static struct qent_base *queue_tryget_free(struct buf_pool *const bp) +{ + struct qent_base *buf; + + if (do_trywait(&bp->free_sem)) + return NULL; + pthread_mutex_lock(&bp->lock); + buf = bq_get_free(bp); + pthread_mutex_unlock(&bp->lock); + return buf; +} + +static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) +{ + struct qent_base *be; + + pthread_mutex_lock(&bp->lock); + /* Expect 1st in Q, but allow anywhere */ + for (be = bp->inuse.head; be; be = be->next) { + if (dmabuf_fd(be->dh[0]) == fd) { + bq_extract_inuse(bp, be); + break; + } + } + pthread_mutex_unlock(&bp->lock); + + return be; +} + +static void queue_delete(struct buf_pool *const bp) +{ + sem_destroy(&bp->free_sem); + pthread_mutex_destroy(&bp->lock); + free(bp); +} + +static struct buf_pool* queue_new(const int vfd) +{ + struct buf_pool *bp = calloc(1, sizeof(*bp)); + if (!bp) + return NULL; + pthread_mutex_init(&bp->lock, NULL); + sem_init(&bp->free_sem, 0, 0); + return bp; +} + + +struct mediabufs_ctl { + atomic_int ref_count; /* 0 is 
single ref for easier atomics */ + void * dc; + int vfd; + bool stream_on; + bool polling; + bool dst_fixed; // Dst Q is fixed size + pthread_mutex_t lock; + struct buf_pool * src; + struct buf_pool * dst; + struct polltask * pt; + struct pollqueue * pq; + struct ff_weak_link_master * this_wlm; + + struct v4l2_format src_fmt; + struct v4l2_format dst_fmt; +}; + +static int qe_v4l2_queue(struct qent_base *const be, + const int vfd, struct media_request *const mreq, + const struct v4l2_format *const fmt, + const bool is_dst, const bool hold_flag) +{ + struct v4l2_buffer buffer = { + .type = fmt->type, + .memory = V4L2_MEMORY_DMABUF, + .index = be->index + }; + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + unsigned int i; + for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) { + if (is_dst) + dmabuf_len_set(be->dh[i], 0); + + /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ + planes[i].length = dmabuf_size(be->dh[i]); + planes[i].bytesused = dmabuf_len(be->dh[i]); + planes[i].m.fd = dmabuf_fd(be->dh[i]); + } + buffer.m.planes = planes; + buffer.length = i; + } + else { + if (is_dst) + dmabuf_len_set(be->dh[0], 0); + + buffer.bytesused = dmabuf_len(be->dh[0]); + buffer.length = dmabuf_size(be->dh[0]); + buffer.m.fd = dmabuf_fd(be->dh[0]); + } + + if (!is_dst && mreq) { + buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD; + buffer.request_fd = media_request_fd(mreq); + if (hold_flag) + buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF; + } + + if (is_dst) + be->timestamp = (struct timeval){0,0}; + + buffer.timestamp = be->timestamp; + + while (ioctl(vfd, VIDIOC_QBUF, &buffer)) { + const int err = errno; + if (err != EINTR) { + request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err)); + return -err; + } + } + return 0; +} + +static struct qent_base * qe_dequeue(struct buf_pool *const bp, + const int vfd, + const struct v4l2_format * const f) +{ + int fd; + struct qent_base *be; + int rc; + const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type); + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + struct v4l2_buffer buffer = { + .type = f->type, + .memory = V4L2_MEMORY_DMABUF + }; + if (mp) { + buffer.length = f->fmt.pix_mp.num_planes; + buffer.m.planes = planes; + } + + while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 && + errno == EINTR) + /* Loop */; + if (rc) { + request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno)); + return NULL; + } + + fd = mp ? planes[0].m.fd : buffer.m.fd; + be = queue_find_extract_fd(bp, fd); + if (!be) { + request_log("Failed to find fd %d in Q\n", fd); + return NULL; + } + + be->timestamp = buffer.timestamp; + be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? 
QENT_ERROR : QENT_DONE; + return be; +} + +static void qe_dst_done(struct qent_dst * dst_be) +{ + pthread_mutex_lock(&dst_be->lock); + dst_be->waiting = false; + pthread_cond_broadcast(&dst_be->cond); + pthread_mutex_unlock(&dst_be->lock); + + qent_dst_unref(&dst_be); +} + +static bool qe_dst_waiting(struct qent_dst *const dst_be) +{ + bool waiting; + pthread_mutex_lock(&dst_be->lock); + waiting = dst_be->waiting; + dst_be->waiting = true; + pthread_mutex_unlock(&dst_be->lock); + return waiting; +} + + +static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc) +{ + return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst); +} + +static void mediabufs_poll_cb(void * v, short revents) +{ + struct mediabufs_ctl *mbc = v; + struct qent_src *src_be = NULL; + struct qent_dst *dst_be = NULL; + + if (!revents) + request_err(mbc->dc, "%s: Timeout\n", __func__); + + pthread_mutex_lock(&mbc->lock); + mbc->polling = false; + + if ((revents & POLLOUT) != 0) + src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt)); + if ((revents & POLLIN) != 0) + dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt)); + + /* Reschedule */ + if (mediabufs_wants_poll(mbc)) { + mbc->polling = true; + pollqueue_add_task(mbc->pt, 2000); + } + pthread_mutex_unlock(&mbc->lock); + + if (src_be) + queue_put_free(mbc->src, &src_be->base); + if (dst_be) + qe_dst_done(dst_be); +} + +int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp) +{ + struct qent_base *const be = &be_src->base; + + be->timestamp = *timestamp; + return 0; +} + +struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst) +{ + return be_dst->base.timestamp; +} + +static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc) +{ + if (!be->dh[0] || len > dmabuf_size(be->dh[0])) { + size_t newsize = round_up_size(len); + request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize); + if (!dbsc) { + request_log("%s: No dmbabuf_ctrl for realloc\n", __func__); + return -ENOMEM; + } + if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) { + request_log("%s: Realloc %zd failed\n", __func__, newsize); + return -ENOMEM; + } + } + return 0; +} + +int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc) +{ + struct qent_base *const be = &be_src->base; + return qent_base_realloc(be, len, dbsc); +} + + +int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc) +{ + void * dst; + struct qent_base *const be = &be_src->base; + int rv; + + // Realloc doesn't copy so don't alloc if offset != 0 + if ((rv = qent_base_realloc(be, offset + len, + be_src->fixed_size || offset ? NULL : dbsc)) != 0) + return rv; + + dmabuf_write_start(be->dh[0]); + dst = dmabuf_map(be->dh[0]); + if (!dst) + return -1; + memcpy((char*)dst + offset, src, len); + dmabuf_len_set(be->dh[0], len); + dmabuf_write_end(be->dh[0]); + return 0; +} + +const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane) +{ + const struct qent_base *const be = &be_dst->base; + + return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? 
NULL : be->dh[plane]; +} + +int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane) +{ + return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane))); +} + +MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, + struct media_request **const pmreq, + struct qent_src **const psrc_be, + struct qent_dst *const dst_be, + const bool is_final) +{ + struct media_request * mreq = *pmreq; + struct qent_src *const src_be = *psrc_be; + + // Req & src are always both "consumed" + *pmreq = NULL; + *psrc_be = NULL; + + pthread_mutex_lock(&mbc->lock); + + if (!src_be) + goto fail1; + + if (dst_be) { + if (qe_dst_waiting(dst_be)) { + request_info(mbc->dc, "Request buffer already waiting on start\n"); + goto fail1; + } + dst_be->base.timestamp = (struct timeval){0,0}; + if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false)) + goto fail1; + + qent_dst_ref(dst_be); + queue_put_inuse(mbc->dst, &dst_be->base); + } + + if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final)) + goto fail1; + queue_put_inuse(mbc->src, &src_be->base); + + if (!mbc->polling && mediabufs_wants_poll(mbc)) { + mbc->polling = true; + pollqueue_add_task(mbc->pt, 2000); + } + pthread_mutex_unlock(&mbc->lock); + + if (media_request_start(mreq)) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + return MEDIABUFS_STATUS_SUCCESS; + +fail1: + media_request_abort(&mreq); + if (src_be) + queue_put_free(mbc->src, &src_be->base); + +// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q + if (dst_be) { + dst_be->base.status = QENT_ERROR; + qe_dst_done(dst_be); + } + pthread_mutex_unlock(&mbc->lock); + return MEDIABUFS_ERROR_OPERATION_FAILED; +} + + +static int qe_alloc_from_fmt(struct qent_base *const be, + struct dmabufs_ctl *const dbsc, + const struct v4l2_format *const fmt) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + unsigned int i; + for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) { + be->dh[i] = dmabuf_realloc(dbsc, be->dh[i], + fmt->fmt.pix_mp.plane_fmt[i].sizeimage); + /* On failure tidy up and die */ + if (!be->dh[i]) { + while (i--) { + dmabuf_free(be->dh[i]); + be->dh[i] = NULL; + } + return -1; + } + } + } + else { +// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage); + size_t size = fmt->fmt.pix.sizeimage; + be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size); + if (!be->dh[0]) + return -1; + } + return 0; +} + +static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd, + const enum v4l2_buf_type buftype, + uint32_t pixfmt, + const unsigned int width, const unsigned int height, + const size_t bufsize) +{ + *fmt = (struct v4l2_format){.type = buftype}; + + if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { + fmt->fmt.pix_mp.width = width; + fmt->fmt.pix_mp.height = height; + fmt->fmt.pix_mp.pixelformat = pixfmt; + if (bufsize) { + fmt->fmt.pix_mp.num_planes = 1; + fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize; + } + } + else { + fmt->fmt.pix.width = width; + fmt->fmt.pix.height = height; + fmt->fmt.pix.pixelformat = pixfmt; + fmt->fmt.pix.sizeimage = bufsize; + } + + while (ioctl(fd, VIDIOC_S_FMT, fmt)) + if (errno != EINTR) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + // Treat anything where we don't get at least what we asked for as a fail + if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { + if (fmt->fmt.pix_mp.width < width || + fmt->fmt.pix_mp.height < height || + fmt->fmt.pix_mp.pixelformat != pixfmt) { + return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; + } + } + else { + if (fmt->fmt.pix.width < width || + 
fmt->fmt.pix.height < height || + fmt->fmt.pix.pixelformat != pixfmt) { + return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; + } + } + + return MEDIABUFS_STATUS_SUCCESS; +} + +static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt, + const int fd, + const unsigned int type_v4l2, + const uint32_t flags_must, + const uint32_t flags_not, + const unsigned int width, + const unsigned int height, + mediabufs_dst_fmt_accept_fn *const accept_fn, + void *const accept_v) +{ + unsigned int i; + + for (i = 0;; ++i) { + struct v4l2_fmtdesc fmtdesc = { + .index = i, + .type = type_v4l2 + }; + while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { + if (errno != EINTR) + return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; + } + if ((fmtdesc.flags & flags_must) != flags_must || + (fmtdesc.flags & flags_not)) + continue; + if (!accept_fn(accept_v, &fmtdesc)) + continue; + + if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat, + width, height, 0) == MEDIABUFS_STATUS_SUCCESS) + return MEDIABUFS_STATUS_SUCCESS; + } + return 0; +} + + +/* Wait for qent done */ + +MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst) +{ + struct qent_base *const be = &be_dst->base; + enum qent_status estat; + + pthread_mutex_lock(&be_dst->lock); + while (be_dst->waiting && + !pthread_cond_wait(&be_dst->cond, &be_dst->lock)) + /* Loop */; + estat = be->status; + pthread_mutex_unlock(&be_dst->lock); + + return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS : + estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR : + MEDIABUFS_ERROR_OPERATION_FAILED; +} + +const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no) +{ + struct qent_base *const be = &be_dst->base; + return dmabuf_map(be->dh[buf_no]); +} + +MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst) +{ + struct qent_base *const be = &be_dst->base; + unsigned int i; + for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { + if (dmabuf_read_start(be->dh[i])) { + while (i--) + dmabuf_read_end(be->dh[i]); + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + } + } + return MEDIABUFS_STATUS_SUCCESS; +} + +MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst) +{ + struct qent_base *const be = &be_dst->base; + unsigned int i; + MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; + + for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { + if (dmabuf_read_end(be->dh[i])) + status = MEDIABUFS_ERROR_OPERATION_FAILED; + } + return status; +} + +struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst) +{ + if (be_dst) + atomic_fetch_add(&be_dst->base.ref_count, 1); + return be_dst; +} + +void qent_dst_unref(struct qent_dst ** const pbe_dst) +{ + struct qent_dst * const be_dst = *pbe_dst; + struct mediabufs_ctl * mbc; + if (!be_dst) + return; + *pbe_dst = NULL; + + if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) + return; + + if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) { + queue_put_free(mbc->dst, &be_dst->base); + ff_weak_link_unlock(be_dst->mbc_wl); + } + else { + qe_dst_free(be_dst); + } +} + +MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, + unsigned int plane, + int fd, size_t size) +{ + struct qent_base *const be = &be_dst->base; + struct dmabuf_h * dh; + + if (be->status != QENT_IMPORT || be->dh[plane]) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + dh = dmabuf_import(fd, size); + if (!dh) + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + + be->dh[plane] = dh; + return MEDIABUFS_STATUS_SUCCESS; +} + +// Returns noof buffers created, -ve for error +static int create_dst_bufs(struct mediabufs_ctl 
*const mbc, unsigned int n, struct qent_dst * const qes[]) +{ + unsigned int i; + + struct v4l2_create_buffers cbuf = { + .count = n, + .memory = V4L2_MEMORY_DMABUF, + .format = mbc->dst_fmt, + }; + + while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) { + const int err = -errno; + if (err != EINTR) { + request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__); + return -err; + } + } + + if (cbuf.count != n) + request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n); + + for (i = 0; i != cbuf.count; ++i) + qes[i]->base.index = cbuf.index + i; + + return cbuf.count; +} + +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) +{ + struct qent_dst * be_dst; + + if (mbc == NULL) { + be_dst = qe_dst_new(NULL); + if (be_dst) + be_dst->base.status = QENT_IMPORT; + return be_dst; + } + + if (mbc->dst_fixed) { + be_dst = base_to_dst(queue_get_free(mbc->dst)); + if (!be_dst) + return NULL; + } + else { + be_dst = base_to_dst(queue_tryget_free(mbc->dst)); + if (!be_dst) { + be_dst = qe_dst_new(mbc->this_wlm); + if (!be_dst) + return NULL; + + if (create_dst_bufs(mbc, 1, &be_dst) != 1) { + qe_dst_free(be_dst); + return NULL; + } + } + } + + if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { + /* Given how create buf works we can't uncreate it on alloc failure + * all we can do is put it on the free Q + */ + queue_put_free(mbc->dst, &be_dst->base); + return NULL; + } + + be_dst->base.status = QENT_PENDING; + atomic_store(&be_dst->base.ref_count, 0); + return be_dst; +} + +const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc) +{ + return &mbc->dst_fmt; +} + +MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, + const unsigned int width, + const unsigned int height, + mediabufs_dst_fmt_accept_fn *const accept_fn, + void *const accept_v) +{ + MediaBufsStatus status; + unsigned int i; + const enum v4l2_buf_type buf_type = mbc->dst_fmt.type; + static const struct { + unsigned int flags_must; + unsigned int flags_not; + } trys[] = { + {0, V4L2_FMT_FLAG_EMULATED}, + {V4L2_FMT_FLAG_EMULATED, 0}, + }; + for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) { + status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd, + buf_type, + trys[i].flags_must, + trys[i].flags_not, + width, height, accept_fn, accept_v); + if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) + return status; + } + + if (status != MEDIABUFS_STATUS_SUCCESS) + return status; + + /* Try to create a buffer - don't alloc */ + return status; +} + +// ** This is a mess if we get partial alloc but without any way to remove +// individual V4L2 Q members we are somewhat stuffed +MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) +{ + unsigned int i; + int a = 0; + unsigned int qc; + struct qent_dst * qes[32]; + + if (n > 32) + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + + // Create qents first as it is hard to get rid of the V4L2 buffers on error + for (qc = 0; qc != n; ++qc) + { + if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) + goto fail; + } + + if ((a = create_dst_bufs(mbc, n, qes)) < 0) + goto fail; + + for (i = 0; i != a; ++i) + queue_put_free(mbc->dst, &qes[i]->base); + + if (a != n) + goto fail; + + mbc->dst_fixed = fixed; + return MEDIABUFS_STATUS_SUCCESS; + +fail: + for (i = (a < 0 ? 
0 : a); i != qc; ++i) + qe_dst_free(qes[i]); + + return MEDIABUFS_ERROR_ALLOCATION_FAILED; +} + +struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc) +{ + struct qent_base * buf = queue_get_free(mbc->src); + buf->status = QENT_PENDING; + return base_to_src(buf); +} + +void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src) +{ + struct qent_src *const qe_src = *pqe_src; + if (!qe_src) + return; + *pqe_src = NULL; + queue_put_free(mbc->src, &qe_src->base); +} + +/* src format must have been set up before this */ +MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, + struct dmabufs_ctl * const dbsc, + unsigned int n) +{ + unsigned int i; + struct v4l2_requestbuffers req = { + .count = n, + .type = mbc->src_fmt.type, + .memory = V4L2_MEMORY_DMABUF + }; + + bq_free_all_free_src(mbc->src); + while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { + if (errno != EINTR) { + request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + } + + if (n > req.count) { + request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n); + n = req.count; + } + + for (i = 0; i != n; ++i) { + struct qent_src *const be_src = qe_src_new(); + if (!be_src) { + request_err(mbc->dc, "Failed to create src be %d\n", i); + goto fail; + } + if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { + qe_src_free(be_src); + goto fail; + } + be_src->base.index = i; + be_src->fixed_size = !mediabufs_src_resizable(mbc); + + queue_put_free(mbc->src, &be_src->base); + } + + return MEDIABUFS_STATUS_SUCCESS; + +fail: + bq_free_all_free_src(mbc->src); + req.count = 0; + while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 && + errno == EINTR) + /* Loop */; + + return MEDIABUFS_ERROR_OPERATION_FAILED; +} + + + +/* + * Set stuff order: + * Set src fmt + * Set parameters (sps) on vfd + * Negotiate dst format (dst_fmt_set) + * Create src buffers + * Alloc a dst buffer or Create dst slots +*/ +MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc) +{ + if (mbc->stream_on) + return MEDIABUFS_STATUS_SUCCESS; + + if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) { + request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type); + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + + if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) { + request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type); + set_stream(mbc->vfd, mbc->src_fmt.type, false); + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + + mbc->stream_on = true; + return MEDIABUFS_STATUS_SUCCESS; +} + +MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc) +{ + MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; + + if (!mbc->stream_on) + return MEDIABUFS_STATUS_SUCCESS; + + if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) { + request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type); + status = MEDIABUFS_ERROR_OPERATION_FAILED; + } + + if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) { + request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type); + status = MEDIABUFS_ERROR_OPERATION_FAILED; + } + + mbc->stream_on = false; + return status; +} + +int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n) +{ + struct v4l2_ext_controls controls = { + .controls = control_array, + .count = n + }; + + if (mreq) { + controls.which = 
V4L2_CTRL_WHICH_REQUEST_VAL; + controls.request_fd = media_request_fd(mreq); + } + + while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls)) + { + const int err = errno; + if (err != EINTR) { + request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err)); + return -err; + } + } + + return 0; +} + +MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, + struct media_request * const mreq, + unsigned int id, void *data, + unsigned int size) +{ + struct v4l2_ext_control control = { + .id = id, + .ptr = data, + .size = size + }; + + int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1); + return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED; +} + +MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, + enum v4l2_buf_type buf_type, + const uint32_t pixfmt, + const uint32_t width, const uint32_t height, + const size_t bufsize) +{ + MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize); + if (rv != MEDIABUFS_STATUS_SUCCESS) + request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height); + + return rv; +} + +int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n) +{ + int rv = 0; + while (n--) { + while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) { + const int err = errno; + if (err != EINTR) { + // Often used for probing - errors are to be expected + request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err); + ctrls->type = 0; // 0 is invalid + rv = -err; + break; + } + } + ++ctrls; + } + return rv; +} + +int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) +{ + // Single planar OUTPUT can only take exact size buffers + // Multiplanar will take larger than negotiated + return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); +} + +static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) +{ + if (!mbc) + return; + + // Break the weak link first + ff_weak_link_break(&mbc->this_wlm); + + polltask_delete(&mbc->pt); + + mediabufs_stream_off(mbc); + + // Empty v4l2 buffer stash + request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0); + request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0); + + bq_free_all_free_src(mbc->src); + bq_free_all_inuse_src(mbc->src); + bq_free_all_free_dst(mbc->dst); + + { + struct qent_dst *dst_be; + while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) { + dst_be->base.timestamp = (struct timeval){0}; + dst_be->base.status = QENT_ERROR; + qe_dst_done(dst_be); + } + } + + queue_delete(mbc->dst); + queue_delete(mbc->src); + close(mbc->vfd); + pthread_mutex_destroy(&mbc->lock); + + free(mbc); +} + +struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc) +{ + atomic_fetch_add(&mbc->ref_count, 1); + return mbc; +} + +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) +{ + struct mediabufs_ctl *const mbc = *pmbc; + int n; + + if (!mbc) + return; + *pmbc = NULL; + n = atomic_fetch_sub(&mbc->ref_count, 1); + if (n) + return; + mediabufs_ctl_delete(mbc); +} + +static int set_capabilities(struct mediabufs_ctl *const mbc) +{ + struct v4l2_capability capability = { 0 }; + uint32_t caps; + + if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { + int err = errno; + request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); + return -err; + } + + caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
+ capability.device_caps : + capability.capabilities; + + if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + } + else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; + mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + } + else { + request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps); + return -EINVAL; + } + + return 0; +} + +/* One of these per context */ +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq) +{ + struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc)); + + if (!mbc) + return NULL; + + mbc->dc = dc; + // Default mono planar + mbc->pq = pq; + pthread_mutex_init(&mbc->lock, NULL); + + /* Pick a default - could we scan for this? */ + if (vpath == NULL) + vpath = "/dev/media0"; + + while ((mbc->vfd = open(vpath, O_RDWR)) == -1) + { + const int err = errno; + if (err != EINTR) { + request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err)); + goto fail0; + } + } + + if (set_capabilities(mbc)) { + request_err(dc, "Bad capabilities for video dev '%s'\n", vpath); + goto fail1; + } + + mbc->src = queue_new(mbc->vfd); + if (!mbc->src) + goto fail1; + mbc->dst = queue_new(mbc->vfd); + if (!mbc->dst) + goto fail2; + mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc); + if (!mbc->pt) + goto fail3; + mbc->this_wlm = ff_weak_link_new(mbc); + if (!mbc->this_wlm) + goto fail4; + + /* Cannot add polltask now - polling with nothing pending + * generates infinite error polls + */ + return mbc; + +fail4: + polltask_delete(&mbc->pt); +fail3: + queue_delete(mbc->dst); +fail2: + queue_delete(mbc->src); +fail1: + close(mbc->vfd); +fail0: + free(mbc); + request_info(dc, "%s: FAILED\n", __func__); + return NULL; +} + + + diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h new file mode 100644 index 0000000000..2f826cfb14 --- /dev/null +++ b/libavcodec/v4l2_req_media.h @@ -0,0 +1,151 @@ +/* +e.h +* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#ifndef _MEDIA_H_
+#define _MEDIA_H_
+
+#include
+#include
+
+struct v4l2_format;
+struct v4l2_fmtdesc;
+struct v4l2_query_ext_ctrl;
+
+struct pollqueue;
+struct media_request;
+struct media_pool;
+
+typedef enum media_buf_status {
+    MEDIABUFS_STATUS_SUCCESS = 0,
+    MEDIABUFS_ERROR_OPERATION_FAILED,
+    MEDIABUFS_ERROR_DECODING_ERROR,
+    MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
+    MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
+    MEDIABUFS_ERROR_ALLOCATION_FAILED,
+} MediaBufsStatus;
+
+struct media_pool * media_pool_new(const char * const media_path,
+                                   struct pollqueue * const pq,
+                                   const unsigned int n);
+void media_pool_delete(struct media_pool ** pmp);
+
+// Obtain a media request
+// Will block if none available - has a 2sec timeout
+struct media_request * media_request_get(struct media_pool * const mp);
+int media_request_fd(const struct media_request * const req);
+
+// Start this request
+// Request structure is returned to pool once done
+int media_request_start(struct media_request * const req);
+
+// Return an *unstarted* media_request to the pool
+// May later be upgraded to allow for aborting a started req
+int media_request_abort(struct media_request ** const preq);
+
+
+struct mediabufs_ctl;
+struct qent_src;
+struct qent_dst;
+struct dmabuf_h;
+struct dmabufs_ctl;
+
+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
+
+// prealloc
+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
+// dbsc may be NULL if realloc not required
+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
+MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
+void qent_dst_delete(struct qent_dst *const be);
+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
+void qent_dst_unref(struct qent_dst ** const pbe_dst);
+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
+
+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
+/* Import an fd unattached to any mediabuf */
+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
+                                   unsigned int plane,
+                                   int fd, size_t size);
+
+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
+                                        struct media_request **const pmreq,
+                                        struct qent_src **const psrc_be,
+                                        struct qent_dst *const dst_be,
+                                        const bool is_final);
+// Get / alloc a dst buffer & associate with a slot
+// If the dst pool is empty then behaviour depends on the fixed flag passed to
+// dst_slots_create.
Default is !fixed = unlimited alloc +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, + struct dmabufs_ctl *const dbsc); +// Create dst slots without alloc +// If fixed true then qent_alloc will only get slots from this pool and will +// block until a qent has been unrefed +MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); + +MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); +MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); +const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc); + +typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc); + +MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, + const unsigned int width, + const unsigned int height, + mediabufs_dst_fmt_accept_fn *const accept_fn, + void *const accept_v); +struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc); +void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src); + +int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, + struct v4l2_ext_control control_array[], unsigned int n); +MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, + struct media_request * const mreq, + unsigned int id, void *data, + unsigned int size); +int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n); + +int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc); + +MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, + enum v4l2_buf_type buf_type, + const uint32_t pixfmt, + const uint32_t width, const uint32_t height, + const size_t bufsize); + +MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, + struct dmabufs_ctl * const dbsc, + unsigned int n); + +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, + const char *vpath, struct pollqueue *const pq); +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); +struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc); + + +#endif diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c new file mode 100644 index 0000000000..cc8a5d4001 --- /dev/null +++ b/libavcodec/v4l2_req_pollqueue.c @@ -0,0 +1,361 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_utils.h" + + +struct pollqueue; + +enum polltask_state { + POLLTASK_UNQUEUED = 0, + POLLTASK_QUEUED, + POLLTASK_RUNNING, + POLLTASK_Q_KILL, + POLLTASK_RUN_KILL, +}; + +struct polltask { + struct polltask *next; + struct polltask *prev; + struct pollqueue *q; + enum polltask_state state; + + int fd; + short events; + + void (*fn)(void *v, short revents); + void * v; + + uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */ + sem_t kill_sem; +}; + +struct pollqueue { + atomic_int ref_count; + pthread_mutex_t lock; + + struct polltask *head; + struct polltask *tail; + + bool kill; + bool no_prod; + int prod_fd; + struct polltask *prod_pt; + pthread_t worker; +}; + +struct polltask *polltask_new(struct pollqueue *const pq, + const int fd, const short events, + void (*const fn)(void *v, short revents), + void *const v) +{ + struct polltask *pt; + + if (!events) + return NULL; + + pt = malloc(sizeof(*pt)); + if (!pt) + return NULL; + + *pt = 
(struct polltask){ + .next = NULL, + .prev = NULL, + .q = pollqueue_ref(pq), + .fd = fd, + .events = events, + .fn = fn, + .v = v + }; + + sem_init(&pt->kill_sem, 0, 0); + + return pt; +} + +static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt) +{ + if (pt->prev) + pt->prev->next = pt->next; + else + pq->head = pt->next; + if (pt->next) + pt->next->prev = pt->prev; + else + pq->tail = pt->prev; + pt->next = NULL; + pt->prev = NULL; +} + +static void polltask_free(struct polltask * const pt) +{ + sem_destroy(&pt->kill_sem); + free(pt); +} + +static int pollqueue_prod(const struct pollqueue *const pq) +{ + static const uint64_t one = 1; + return write(pq->prod_fd, &one, sizeof(one)); +} + +void polltask_delete(struct polltask **const ppt) +{ + struct polltask *const pt = *ppt; + struct pollqueue * pq; + enum polltask_state state; + bool prodme; + + if (!pt) + return; + + pq = pt->q; + pthread_mutex_lock(&pq->lock); + state = pt->state; + pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL; + prodme = !pq->no_prod; + pthread_mutex_unlock(&pq->lock); + + if (state != POLLTASK_UNQUEUED) { + if (prodme) + pollqueue_prod(pq); + while (sem_wait(&pt->kill_sem) && errno == EINTR) + /* loop */; + } + + // Leave zapping the ref until we have DQed the PT as might well be + // legitimately used in it + *ppt = NULL; + polltask_free(pt); + pollqueue_unref(&pq); +} + +static uint64_t pollqueue_now(int timeout) +{ + struct timespec now; + uint64_t now_ms; + + if (clock_gettime(CLOCK_MONOTONIC, &now)) + return 0; + now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout; + return now_ms ? now_ms : (uint64_t)1; +} + +void pollqueue_add_task(struct polltask *const pt, const int timeout) +{ + bool prodme = false; + struct pollqueue * const pq = pt->q; + + pthread_mutex_lock(&pq->lock); + if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { + if (pq->tail) + pq->tail->next = pt; + else + pq->head = pt; + pt->prev = pq->tail; + pt->next = NULL; + pt->state = POLLTASK_QUEUED; + pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout); + pq->tail = pt; + prodme = !pq->no_prod; + } + pthread_mutex_unlock(&pq->lock); + if (prodme) + pollqueue_prod(pq); +} + +static void *poll_thread(void *v) +{ + struct pollqueue *const pq = v; + struct pollfd *a = NULL; + size_t asize = 0; + + pthread_mutex_lock(&pq->lock); + do { + unsigned int i; + unsigned int n = 0; + struct polltask *pt; + struct polltask *pt_next; + uint64_t now = pollqueue_now(0); + int timeout = -1; + int rv; + + for (pt = pq->head; pt; pt = pt_next) { + int64_t t; + + pt_next = pt->next; + + if (pt->state == POLLTASK_Q_KILL) { + pollqueue_rem_task(pq, pt); + sem_post(&pt->kill_sem); + continue; + } + + if (n >= asize) { + asize = asize ? asize * 2 : 4; + a = realloc(a, asize * sizeof(*a)); + if (!a) { + request_log("Failed to realloc poll array to %zd\n", asize); + goto fail_locked; + } + } + + a[n++] = (struct pollfd){ + .fd = pt->fd, + .events = pt->events + }; + + t = (int64_t)(pt->timeout - now); + if (pt->timeout && t < INT_MAX && + (timeout < 0 || (int)t < timeout)) + timeout = (t < 0) ? 
0 : (int)t; + } + pthread_mutex_unlock(&pq->lock); + + if ((rv = poll(a, n, timeout)) == -1) { + if (errno != EINTR) { + request_log("Poll error: %s\n", strerror(errno)); + goto fail_unlocked; + } + } + + pthread_mutex_lock(&pq->lock); + now = pollqueue_now(0); + + /* Prodding in this loop is pointless and might lead to + * infinite looping + */ + pq->no_prod = true; + for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { + pt_next = pt->next; + + /* Pending? */ + if (a[i].revents || + (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) { + pollqueue_rem_task(pq, pt); + if (pt->state == POLLTASK_QUEUED) + pt->state = POLLTASK_RUNNING; + if (pt->state == POLLTASK_Q_KILL) + pt->state = POLLTASK_RUN_KILL; + pthread_mutex_unlock(&pq->lock); + + /* This can add new entries to the Q but as + * those are added to the tail our existing + * chain remains intact + */ + pt->fn(pt->v, a[i].revents); + + pthread_mutex_lock(&pq->lock); + if (pt->state == POLLTASK_RUNNING) + pt->state = POLLTASK_UNQUEUED; + if (pt->state == POLLTASK_RUN_KILL) + sem_post(&pt->kill_sem); + } + } + pq->no_prod = false; + + } while (!pq->kill); + +fail_locked: + pthread_mutex_unlock(&pq->lock); +fail_unlocked: + free(a); + return NULL; +} + +static void prod_fn(void *v, short revents) +{ + struct pollqueue *const pq = v; + char buf[8]; + if (revents) + read(pq->prod_fd, buf, 8); + if (!pq->kill) + pollqueue_add_task(pq->prod_pt, -1); +} + +struct pollqueue * pollqueue_new(void) +{ + struct pollqueue *pq = malloc(sizeof(*pq)); + if (!pq) + return NULL; + *pq = (struct pollqueue){ + .ref_count = ATOMIC_VAR_INIT(0), + .lock = PTHREAD_MUTEX_INITIALIZER, + .head = NULL, + .tail = NULL, + .kill = false, + .prod_fd = -1 + }; + + pq->prod_fd = eventfd(0, EFD_NONBLOCK); + if (pq->prod_fd == 1) + goto fail1; + pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq); + if (!pq->prod_pt) + goto fail2; + pollqueue_add_task(pq->prod_pt, -1); + if (pthread_create(&pq->worker, NULL, poll_thread, pq)) + goto fail3; + // Reset ref count which will have been inced by the add_task + atomic_store(&pq->ref_count, 0); + return pq; + +fail3: + polltask_free(pq->prod_pt); +fail2: + close(pq->prod_fd); +fail1: + free(pq); + return NULL; +} + +static void pollqueue_free(struct pollqueue *const pq) +{ + void *rv; + + pthread_mutex_lock(&pq->lock); + pq->kill = true; + pollqueue_prod(pq); + pthread_mutex_unlock(&pq->lock); + + pthread_join(pq->worker, &rv); + polltask_free(pq->prod_pt); + pthread_mutex_destroy(&pq->lock); + close(pq->prod_fd); + free(pq); +} + +struct pollqueue * pollqueue_ref(struct pollqueue *const pq) +{ + atomic_fetch_add(&pq->ref_count, 1); + return pq; +} + +void pollqueue_unref(struct pollqueue **const ppq) +{ + struct pollqueue * const pq = *ppq; + + if (!pq) + return; + *ppq = NULL; + + if (atomic_fetch_sub(&pq->ref_count, 1) != 0) + return; + + pollqueue_free(pq); +} + + + diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h new file mode 100644 index 0000000000..e1182cb2fc --- /dev/null +++ b/libavcodec/v4l2_req_pollqueue.h @@ -0,0 +1,18 @@ +#ifndef POLLQUEUE_H_ +#define POLLQUEUE_H_ + +struct polltask; +struct pollqueue; + +struct polltask *polltask_new(struct pollqueue *const pq, + const int fd, const short events, + void (*const fn)(void *v, short revents), + void *const v); +void polltask_delete(struct polltask **const ppt); + +void pollqueue_add_task(struct polltask *const pt, const int timeout); +struct pollqueue * pollqueue_new(void); +void pollqueue_unref(struct pollqueue **const 
ppq); +struct pollqueue * pollqueue_ref(struct pollqueue *const pq); + +#endif /* POLLQUEUE_H_ */ diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h new file mode 100644 index 0000000000..a31cc1f4ec --- /dev/null +++ b/libavcodec/v4l2_req_utils.h @@ -0,0 +1,27 @@ +#ifndef AVCODEC_V4L2_REQ_UTILS_H +#define AVCODEC_V4L2_REQ_UTILS_H + +#include +#include "libavutil/log.h" + +#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) + +#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) +#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) +#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) +#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) + +static inline char safechar(char c) { + return c > 0x20 && c < 0x7f ? c : '.'; +} + +static inline const char * strfourcc(char tbuf[5], uint32_t fcc) { + tbuf[0] = safechar((fcc >> 0) & 0xff); + tbuf[1] = safechar((fcc >> 8) & 0xff); + tbuf[2] = safechar((fcc >> 16) & 0xff); + tbuf[3] = safechar((fcc >> 24) & 0xff); + tbuf[4] = '\0'; + return tbuf; +} + +#endif diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c new file mode 100644 index 0000000000..b0a5930844 --- /dev/null +++ b/libavcodec/v4l2_request_hevc.c @@ -0,0 +1,297 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" +#include "internal.h" + +#include "v4l2_request_hevc.h" + +#include "libavutil/hwcontext_drm.h" + +#include "v4l2_req_devscan.h" +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_media.h" +#include "v4l2_req_utils.h" + +static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8) +{ + const size_t wxh = w * h; + size_t bits_alloc; + + /* Annex A gives a min compression of 2 @ lvl 3.1 + * (wxh <= 983040) and min 4 thereafter but avoid + * the odity of 983041 having a lower limit than + * 983040. + * Multiply by 3/2 for 4:2:0 + */ + bits_alloc = wxh < 983040 ? wxh * 3 / 4 : + wxh < 983040 * 2 ? 
983040 * 3 / 4 : + wxh * 3 / 8; + /* Allow for bit depth */ + bits_alloc += (bits_alloc * bits_minus8) / 8; + /* Add a few bytes (16k) for overhead */ + bits_alloc += 0x4000; + return bits_alloc; +} + +static int v4l2_req_hevc_start_frame(AVCodecContext *avctx, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->start_frame(avctx, buffer, size); +} + +static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->decode_slice(avctx, buffer, size); +} + +static int v4l2_req_hevc_end_frame(AVCodecContext *avctx) +{ + V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->end_frame(avctx); +} + +static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + ctx->fns->abort_frame(avctx); +} + +static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->frame_params(avctx, hw_frames_ctx); +} + +static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->alloc_frame(avctx, frame); +} + + +static int v4l2_request_hevc_uninit(AVCodecContext *avctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + + decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode + + mediabufs_ctl_unref(&ctx->mbufs); + media_pool_delete(&ctx->mpool); + pollqueue_unref(&ctx->pq); + dmabufs_ctl_delete(&ctx->dbufs); + devscan_delete(&ctx->devscan); + + decode_q_uninit(&ctx->decode_q); + +// if (avctx->hw_frames_ctx) { +// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; +// av_buffer_pool_flush(hwfc->pool); +// } + return 0; +} + +static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc) +{ + AVCodecContext *const avctx = v; + const HEVCContext *const h = avctx->priv_data; + + if (h->ps.sps->bit_depth == 8) { + if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 || + fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) { + return 1; + } + } + else if (h->ps.sps->bit_depth == 10) { + if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { + return 1; + } + } + return 0; +} + +static int v4l2_request_hevc_init(AVCodecContext *avctx) +{ + const HEVCContext *h = avctx->priv_data; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + const HEVCSPS * const sps = h->ps.sps; + int ret; + const struct decdev * decdev; + const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes + size_t src_size; + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + + if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); + return (AVERROR(-ret)); + } + ret = AVERROR(ENOMEM); // Assume mem fail by default for these + + if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL) + { + av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n"); + ret = AVERROR(ENODEV); + goto fail0; + } + av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 
devices: %s,%s\n", + decdev_media_path(decdev), decdev_video_path(decdev)); + + if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); + goto fail0; + } + + if ((ctx->pq = pollqueue_new()) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n"); + goto fail1; + } + + if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n"); + goto fail2; + } + + if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n"); + goto fail3; + } + + // Ask for an initial bitbuf size of max size / 4 + // We will realloc if we need more + // Must use sps->h/w as avctx contains cropped size + src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); + if (mediabufs_src_resizable(ctx->mbufs)) + src_size /= 4; + // Kludge for conformance tests which break Annex A limits + else if (src_size < 0x40000) + src_size = 0x40000; + + if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt, + sps->width, sps->height, src_size)) { + char tbuf1[5]; + av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); + goto fail4; + } + + if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 2); + } + else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 1); + } + else { + av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); + ret = AVERROR(EINVAL); + goto fail4; + } + + if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) { + char tbuf1[5]; + av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); + goto fail4; + } + + if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { + av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); + goto fail4; + } + + { + unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + + avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); + av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, + sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, + avctx->thread_count, avctx->extra_hw_frames); + + // extra_hw_frames is -1 if unset + if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { + av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); + goto fail4; + } + } + + if (mediabufs_stream_on(ctx->mbufs)) { + av_log(avctx, AV_LOG_ERROR, "Failed stream on\n"); + goto fail4; + } + + if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n"); + goto fail4; + } + + if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed set controls\n"); + goto fail5; + } + + decode_q_init(&ctx->decode_q); + + // Set our s/w format + avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; + + av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", + ctx->fns->name, + decdev_media_path(decdev), decdev_video_path(decdev)); + + return 0; + +fail5: + av_buffer_unref(&avctx->hw_frames_ctx); +fail4: + mediabufs_ctl_unref(&ctx->mbufs); +fail3: + media_pool_delete(&ctx->mpool); +fail2: + pollqueue_unref(&ctx->pq); +fail1: + dmabufs_ctl_delete(&ctx->dbufs); +fail0: + devscan_delete(&ctx->devscan); + return ret; +} + +const AVHWAccel ff_hevc_v4l2request_hwaccel = { + .name = "hevc_v4l2request", + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_HEVC, + .pix_fmt = AV_PIX_FMT_DRM_PRIME, + .alloc_frame = v4l2_req_hevc_alloc_frame, + .start_frame = v4l2_req_hevc_start_frame, + .decode_slice = v4l2_req_hevc_decode_slice, + .end_frame = v4l2_req_hevc_end_frame, + .abort_frame = v4l2_req_hevc_abort_frame, + .init = v4l2_request_hevc_init, + .uninit = v4l2_request_hevc_uninit, + .priv_data_size = sizeof(V4L2RequestContextHEVC), + .frame_params = v4l2_req_hevc_frame_params, + .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, +}; diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h new file mode 100644 index 0000000000..f14f594564 --- /dev/null +++ b/libavcodec/v4l2_request_hevc.h @@ -0,0 +1,102 @@ +#ifndef AVCODEC_V4L2_REQUEST_HEVC_H +#define AVCODEC_V4L2_REQUEST_HEVC_H + +#include +#include +#include "v4l2_req_decode_q.h" + +#ifndef DRM_FORMAT_NV15 +#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') +#endif + +#ifndef DRM_FORMAT_NV20 +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') +#endif + +// P030 should be defined in drm_fourcc.h and hopefully will be sometime +// in the future but until then... +#ifndef DRM_FORMAT_P030 +#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') +#endif + +#ifndef DRM_FORMAT_NV15 +#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') +#endif + +#ifndef DRM_FORMAT_NV20 +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') +#endif + +#include +#ifndef V4L2_CID_CODEC_BASE +#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE +#endif + +// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined +// in drm_fourcc.h hopefully will be sometime in the future but until then... 
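// The provisional fourccs used in this header (DRM_FORMAT_P030/NV15/NV20 above,
// V4L2_PIX_FMT_NV12_10_COL128/NV12_COL128 below) are just four ASCII characters
// packed little-endian into a 32-bit value; fourcc_code()/v4l2_fourcc() in the
// system headers expand to the same expression. A minimal sketch of the packing,
// assuming only <stdint.h> (pack_fourcc is an illustrative helper, not part of
// this header):
//
//   static inline uint32_t pack_fourcc(char a, char b, char c, char d)
//   {
//       return (uint32_t)a | ((uint32_t)b << 8) |
//              ((uint32_t)c << 16) | ((uint32_t)d << 24);
//   }
//
//   pack_fourcc('N', 'C', '3', '0');  /* 0x3033434e - V4L2_PIX_FMT_NV12_10_COL128 */
//   pack_fourcc('N', 'C', '1', '2');  /* 0x3231434e - V4L2_PIX_FMT_NV12_COL128 */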
+#ifndef V4L2_PIX_FMT_NV12_10_COL128 +#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') +#endif + +#ifndef V4L2_PIX_FMT_NV12_COL128 +#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ +#endif + +#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY +#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 +#endif + +#define MAX_SLICES 128 + +#define VCAT(name, version) name##_v##version +#define V2(n,v) VCAT(n, v) +#define V(n) V2(n, HEVC_CTRLS_VERSION) + +#define S2(x) #x +#define STR(x) S2(x) + +// 1 per decoder +struct v4l2_req_decode_fns; + +typedef struct V4L2RequestContextHEVC { +// V4L2RequestContext base; + const struct v4l2_req_decode_fns * fns; + + unsigned int timestamp; // ?? maybe uint64_t + + int multi_slice; + int decode_mode; + int start_code; + int max_slices; + + req_decode_q decode_q; + + struct devscan *devscan; + struct dmabufs_ctl *dbufs; + struct pollqueue *pq; + struct media_pool * mpool; + struct mediabufs_ctl *mbufs; +} V4L2RequestContextHEVC; + +typedef struct v4l2_req_decode_fns { + int src_pix_fmt_v4l2; + const char * name; + + // Init setup + int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); + int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); + + // Passthrough of hwaccel fns + int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); + int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); + int (*end_frame)(AVCodecContext *avctx); + void (*abort_frame)(AVCodecContext *avctx); + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); + int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); +} v4l2_req_decode_fns; + + +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); + +#endif -- 2.43.0 From 51bab24816d7771ed869f4b8161745d3a5908474 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 19:30:36 +0100 Subject: [PATCH 012/157] Add no_cvt_hw option to ffmpeg --- fftools/ffmpeg.c | 6 ++++-- fftools/ffmpeg.h | 2 ++ fftools/ffmpeg_opt.c | 3 +++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index ca5431aeb4..7194630162 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -2008,6 +2008,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) need_reinit = 1; + if (no_cvt_hw && fg->graph) + need_reinit = 0; + if (sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX)) { if (!ifilter->displaymatrix || memcmp(sd->data, ifilter->displaymatrix, sizeof(int32_t) * 9)) need_reinit = 1; @@ -2277,8 +2280,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ decoded_frame->top_field_first = ist->top_field_first; ist->frames_decoded++; - - if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { + if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); if (err < 0) goto fail; diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h index f1412f6446..8f478619b3 100644 --- a/fftools/ffmpeg.h +++ b/fftools/ffmpeg.h @@ -729,6 +729,8 @@ extern enum VideoSyncMethod video_sync_method; extern float frame_drop_threshold; extern int do_benchmark; extern int do_benchmark_all; +extern int no_cvt_hw; 
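// no_cvt_hw, declared here, stops decode_video() from copying hardware frames back
// to system memory: the patch gates the retrieve step as
//
//   if (!no_cvt_hw && ist->hwaccel_retrieve_data &&
//       decoded_frame->format == ist->hwaccel_pix_fmt)
//       err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
//
// so with the option set, surfaces such as AV_PIX_FMT_DRM_PRIME flow downstream
// untouched. Schematic use only (the hwaccel and sink are placeholders, not part
// of this patch):
//
//   ffmpeg -no_cvt_hw -hwaccel <hwaccel> -i input.mkv <drm-aware output>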
+extern int do_deinterlace; extern int do_hex_dump; extern int do_pkt_dump; extern int copy_ts; diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c index 055275d813..761db36588 100644 --- a/fftools/ffmpeg_opt.c +++ b/fftools/ffmpeg_opt.c @@ -71,6 +71,7 @@ enum VideoSyncMethod video_sync_method = VSYNC_AUTO; float frame_drop_threshold = 0; int do_benchmark = 0; int do_benchmark_all = 0; +int no_cvt_hw = 0; int do_hex_dump = 0; int do_pkt_dump = 0; int copy_ts = 0; @@ -1427,6 +1428,8 @@ const OptionDef options[] = { "add timings for benchmarking" }, { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, "add timings for each task" }, + { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, + "do not auto-convert hw frames to sw" }, { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, "write program-readable progress information", "url" }, { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, -- 2.43.0 From 020ca9e6c68941ff2aa0fb2f43018b193089fef4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 28 Apr 2021 10:16:39 +0100 Subject: [PATCH 013/157] Add vout_drm --- configure | 4 + libavdevice/Makefile | 1 + libavdevice/alldevices.c | 1 + libavdevice/drm_vout.c | 638 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 644 insertions(+) create mode 100644 libavdevice/drm_vout.c diff --git a/configure b/configure index c091446730..fb72aa89a6 100755 --- a/configure +++ b/configure @@ -346,6 +346,7 @@ External library support: --enable-libnpp enable Nvidia Performance Primitives-based code [no] --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] --enable-sand enable sand video formats [rpi] + --enable-vout-drm enable the vout_drm module - for internal testing only [no] --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] --disable-nvenc disable Nvidia video encoding code [autodetect] --enable-omx enable OpenMAX IL code [no] @@ -1940,6 +1941,7 @@ FEATURE_LIST=" small static swscale_alpha + vout_drm " # this list should be kept in linking order @@ -3559,8 +3561,10 @@ sndio_indev_deps="sndio" sndio_outdev_deps="sndio" v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_indev_suggest="libv4l2" +v4l2_outdev_deps="libdrm" v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_outdev_suggest="libv4l2" +vout_drm_outdev_deps="libdrm vout_drm" vfwcap_indev_deps="vfw32 vfwcap_defines" xcbgrab_indev_deps="libxcb" xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" diff --git a/libavdevice/Makefile b/libavdevice/Makefile index 8a62822b69..36aac30186 100644 --- a/libavdevice/Makefile +++ b/libavdevice/Makefile @@ -48,6 +48,7 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o +OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o OBJS-$(CONFIG_XV_OUTDEV) += xv.o diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c index 8a90fcb5d7..e2a8669f27 100644 --- a/libavdevice/alldevices.c +++ b/libavdevice/alldevices.c @@ -52,6 +52,7 @@ extern const FFOutputFormat ff_sndio_muxer; extern const AVInputFormat ff_v4l2_demuxer; extern const FFOutputFormat ff_v4l2_muxer; extern const AVInputFormat ff_vfwcap_demuxer; +extern const FFOutputFormat ff_vout_drm_muxer; extern const AVInputFormat ff_xcbgrab_demuxer; extern const FFOutputFormat ff_xv_muxer; diff --git a/libavdevice/drm_vout.c 
b/libavdevice/drm_vout.c new file mode 100644 index 0000000000..cfb33ce7c3 --- /dev/null +++ b/libavdevice/drm_vout.c @@ -0,0 +1,638 @@ +/* + * Copyright (c) 2020 John Cox for Raspberry Pi Trading + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +// *** This module is a work in progress and its utility is strictly +// limited to testing. + +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/hwcontext_drm.h" +#include "libavformat/mux.h" +#include "avdevice.h" + +#include "pthread.h" +#include +#include + +#include +#include + +#define TRACE_ALL 0 + +#define DRM_MODULE "vc4" + +#define ERRSTR strerror(errno) + +struct drm_setup { + int conId; + uint32_t crtcId; + int crtcIdx; + uint32_t planeId; + unsigned int out_fourcc; + struct { + int x, y, width, height; + } compose; +}; + +typedef struct drm_aux_s { + unsigned int fb_handle; + uint32_t bo_handles[AV_DRM_MAX_PLANES]; + AVFrame * frame; +} drm_aux_t; + +// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS +// we get initial flicker probably due to dodgy drm timing +#define AUX_SIZE 3 +typedef struct drm_display_env_s +{ + AVClass *class; + + int drm_fd; + uint32_t con_id; + struct drm_setup setup; + enum AVPixelFormat avfmt; + int show_all; + + unsigned int ano; + drm_aux_t aux[AUX_SIZE]; + + pthread_t q_thread; + sem_t q_sem_in; + sem_t q_sem_out; + int q_terminate; + AVFrame * q_next; + +} drm_display_env_t; + + +static int drm_vout_write_trailer(AVFormatContext *s) +{ +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, "%s\n", __func__); +#endif + + return 0; +} + +static int drm_vout_write_header(AVFormatContext *s) +{ + const AVCodecParameters * const par = s->streams[0]->codecpar; + +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, "%s\n", __func__); +#endif + if ( s->nb_streams > 1 + || par->codec_type != AVMEDIA_TYPE_VIDEO + || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { + av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); + return AVERROR(EINVAL); + } + + return 0; +} + +static int find_plane(struct AVFormatContext * const avctx, + const int drmfd, const int crtcidx, const uint32_t format, + uint32_t * const pplane_id) +{ + drmModePlaneResPtr planes; + drmModePlanePtr plane; + unsigned int i; + unsigned int j; + int ret = 0; + + planes = drmModeGetPlaneResources(drmfd); + if (!planes) + { + av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); + return -1; + } + + for (i = 0; i < planes->count_planes; ++i) { + plane = drmModeGetPlane(drmfd, planes->planes[i]); + if (!planes) + { + av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); + break; + } + + if (!(plane->possible_crtcs & (1 << crtcidx))) { + drmModeFreePlane(plane); + continue; + } + + for (j = 0; j < plane->count_formats; ++j) { + if 
(plane->formats[j] == format) + break; + } + + if (j == plane->count_formats) { + drmModeFreePlane(plane); + continue; + } + + *pplane_id = plane->plane_id; + drmModeFreePlane(plane); + break; + } + + if (i == planes->count_planes) + ret = -1; + + drmModeFreePlaneResources(planes); + return ret; +} + +static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) +{ + if (da->fb_handle != 0) { + drmModeRmFB(de->drm_fd, da->fb_handle); + da->fb_handle = 0; + } + + for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) { + if (da->bo_handles[i]) { + struct drm_gem_close gem_close = {.handle = da->bo_handles[i]}; + drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close); + da->bo_handles[i] = 0; + } + } + av_frame_free(&da->frame); +} + +static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame) +{ + const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; + drm_aux_t * da = de->aux + de->ano; + const uint32_t format = desc->layers[0].format; + int ret = 0; + +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd); +#endif + + if (de->setup.out_fourcc != format) { + if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) { + av_frame_free(&frame); + av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format); + return -1; + } + de->setup.out_fourcc = format; + } + + { + drmVBlank vbl = { + .request = { + .type = DRM_VBLANK_RELATIVE, + .sequence = 0 + } + }; + + while (drmWaitVBlank(de->drm_fd, &vbl)) { + if (errno != EINTR) { +// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); + break; + } + } + } + + da_uninit(de, da); + + { + uint32_t pitches[4] = {0}; + uint32_t offsets[4] = {0}; + uint64_t modifiers[4] = {0}; + uint32_t bo_handles[4] = {0}; + int i, j, n; + + da->frame = frame; + + for (i = 0; i < desc->nb_objects; ++i) { + if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { + av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); + return -1; + } + } + + n = 0; + for (i = 0; i < desc->nb_layers; ++i) { + for (j = 0; j < desc->layers[i].nb_planes; ++j) { + const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; + const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; + pitches[n] = p->pitch; + offsets[n] = p->offset; + modifiers[n] = obj->format_modifier; + bo_handles[n] = da->bo_handles[p->object_index]; + ++n; + } + } + +#if 1 && TRACE_ALL + av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," + " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", + av_frame_cropped_width(frame), + av_frame_cropped_height(frame), + desc->layers[0].format, + bo_handles[0], + bo_handles[1], + bo_handles[2], + bo_handles[3], + pitches[0], + pitches[1], + pitches[2], + pitches[3], + offsets[0], + offsets[1], + offsets[2], + offsets[3], + (long long)modifiers[0], + (long long)modifiers[1], + (long long)modifiers[2], + (long long)modifiers[3] + ); +#endif + + if (drmModeAddFB2WithModifiers(de->drm_fd, + av_frame_cropped_width(frame), + av_frame_cropped_height(frame), + desc->layers[0].format, bo_handles, + pitches, offsets, modifiers, + &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { + av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); + return -1; + } + } + + ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, + da->fb_handle, 0, + de->setup.compose.x, de->setup.compose.y, + 
de->setup.compose.width, + de->setup.compose.height, + 0, 0, + av_frame_cropped_width(frame) << 16, + av_frame_cropped_height(frame) << 16); + + if (ret != 0) { + av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); + } + + de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1; + + return ret; +} + +static int do_sem_wait(sem_t * const sem, const int nowait) +{ + while (nowait ? sem_trywait(sem) : sem_wait(sem)) { + if (errno != EINTR) + return -errno; + } + return 0; +} + +static void * display_thread(void * v) +{ + AVFormatContext * const s = v; + drm_display_env_t * const de = s->priv_data; + int i; + +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); +#endif + + sem_post(&de->q_sem_out); + + for (;;) { + AVFrame * frame; + + do_sem_wait(&de->q_sem_in, 0); + + if (de->q_terminate) + break; + + frame = de->q_next; + de->q_next = NULL; + sem_post(&de->q_sem_out); + + do_display(s, de, frame); + } + +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); +#endif + + for (i = 0; i != AUX_SIZE; ++i) + da_uninit(de, de->aux + i); + + av_frame_free(&de->q_next); + + return NULL; +} + +static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) +{ + const AVFrame * const src_frame = (AVFrame *)pkt->data; + AVFrame * frame; + drm_display_env_t * const de = s->priv_data; + int ret; + +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, "%s\n", __func__); +#endif + + if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) { + av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts); + return 0; + } + + if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { + frame = av_frame_alloc(); + av_frame_ref(frame, src_frame); + } + else if (src_frame->format == AV_PIX_FMT_VAAPI) { + frame = av_frame_alloc(); + frame->format = AV_PIX_FMT_DRM_PRIME; + if (av_hwframe_map(frame, src_frame, 0) != 0) + { + av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); + av_frame_free(&frame); + return AVERROR(EINVAL); + } + } + else { + av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); + return AVERROR(EINVAL); + } + + ret = do_sem_wait(&de->q_sem_out, !de->show_all); + if (ret) { + av_frame_free(&frame); + } + else { + de->q_next = frame; + sem_post(&de->q_sem_in); + } + + return 0; +} + +static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, + unsigned flags) +{ + av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); + return AVERROR_PATCHWELCOME; +} + +static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) +{ +#if TRACE_ALL + av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type); +#endif + switch(type) { + case AV_APP_TO_DEV_WINDOW_REPAINT: + return 0; + default: + break; + } + return AVERROR(ENOSYS); +} + +static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) +{ + int ret = -1; + int i; + drmModeRes *res = drmModeGetResources(drmfd); + drmModeConnector *c; + + if(!res) + { + printf( "drmModeGetResources failed: %s\n", ERRSTR); + return -1; + } + + if (res->count_crtcs <= 0) + { + printf( "drm: no crts\n"); + goto fail_res; + } + + if (!s->conId) { + fprintf(stderr, + "No connector ID specified. 
Choosing default from list:\n"); + + for (i = 0; i < res->count_connectors; i++) { + drmModeConnector *con = + drmModeGetConnector(drmfd, res->connectors[i]); + drmModeEncoder *enc = NULL; + drmModeCrtc *crtc = NULL; + + if (con->encoder_id) { + enc = drmModeGetEncoder(drmfd, con->encoder_id); + if (enc->crtc_id) { + crtc = drmModeGetCrtc(drmfd, enc->crtc_id); + } + } + + if (!s->conId && crtc) { + s->conId = con->connector_id; + s->crtcId = crtc->crtc_id; + } + + av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n", + con->connector_id, + crtc ? crtc->crtc_id : 0, + con->connector_type, + crtc ? crtc->width : 0, + crtc ? crtc->height : 0, + (s->conId == (int)con->connector_id ? + " (chosen)" : "")); + } + + if (!s->conId) { + av_log(avctx, AV_LOG_ERROR, + "No suitable enabled connector found.\n"); + return -1;; + } + } + + s->crtcIdx = -1; + + for (i = 0; i < res->count_crtcs; ++i) { + if (s->crtcId == res->crtcs[i]) { + s->crtcIdx = i; + break; + } + } + + if (s->crtcIdx == -1) + { + av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); + goto fail_res; + } + + if (res->count_connectors <= 0) + { + av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); + goto fail_res; + } + + c = drmModeGetConnector(drmfd, s->conId); + if (!c) + { + av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); + goto fail_res; + } + + if (!c->count_modes) + { + av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); + goto fail_conn; + } + + { + drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); + s->compose.x = crtc->x; + s->compose.y = crtc->y; + s->compose.width = crtc->width; + s->compose.height = crtc->height; + drmModeFreeCrtc(crtc); + } + + if (pConId) + *pConId = c->connector_id; + ret = 0; + +fail_conn: + drmModeFreeConnector(c); + +fail_res: + drmModeFreeResources(res); + + return ret; +} + +// deinit is called if init fails so no need to clean up explicity here +static int drm_vout_init(struct AVFormatContext * s) +{ + drm_display_env_t * const de = s->priv_data; + int rv; + const char * drm_module = DRM_MODULE; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + + de->drm_fd = -1; + de->con_id = 0; + de->setup = (struct drm_setup){0}; + de->q_terminate = 0; + + if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0) + { + rv = AVERROR(errno); + av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv)); + return rv; + } + + if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) + { + av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); + rv = AVERROR(EINVAL); + goto fail_close; + } + + sem_init(&de->q_sem_in, 0, 0); + sem_init(&de->q_sem_out, 0, 0); + if (pthread_create(&de->q_thread, NULL, display_thread, s)) { + rv = AVERROR(errno); + av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv)); + goto fail_close; + } + + av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); + + return 0; + +fail_close: + close(de->drm_fd); + de->drm_fd = -1; + av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__); + + return rv; +} + +static void drm_vout_deinit(struct AVFormatContext * s) +{ + drm_display_env_t * const de = s->priv_data; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + + de->q_terminate = 1; + sem_post(&de->q_sem_in); + pthread_join(de->q_thread, NULL); + sem_destroy(&de->q_sem_in); + sem_destroy(&de->q_sem_out); + + for (unsigned int i = 0; i != AUX_SIZE; ++i) + da_uninit(de, de->aux + i); + + av_frame_free(&de->q_next); + + if (de->drm_fd >= 0) { + close(de->drm_fd); + de->drm_fd = -1; 
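// Note on do_display() above: drmModeSetPlane() takes its destination (compose)
// rectangle in whole pixels but its source rectangle in 16.16 fixed point, which
// is why the cropped width and height are shifted left by 16 before the call.
// A minimal sketch of that conversion, assuming plain pixel dimensions as input:
//
//   static inline uint32_t px_to_16_16(uint32_t px) { return px << 16; }
//   /* e.g. a 1920x1080 source rectangle becomes (1920 << 16) x (1080 << 16) */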
+ } + + av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); +} + + +#define OFFSET(x) offsetof(drm_display_env_t, x) +static const AVOption options[] = { + { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { NULL } +}; + +static const AVClass drm_vout_class = { + .class_name = "drm vid outdev", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, + .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, +}; + +FFOutputFormat ff_vout_drm_muxer = { + .p = { + .name = "vout_drm", + .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), + .audio_codec = AV_CODEC_ID_NONE, + .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, + .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, + .priv_class = &drm_vout_class, + }, + .priv_data_size = sizeof(drm_display_env_t), + .write_header = drm_vout_write_header, + .write_packet = drm_vout_write_packet, + .write_uncoded_frame = drm_vout_write_frame, + .write_trailer = drm_vout_write_trailer, + .control_message = drm_vout_control_message, + .init = drm_vout_init, + .deinit = drm_vout_deinit, +}; + -- 2.43.0 From e31a78ac841536ddcc59296072748fbb62decda1 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 28 Apr 2021 11:34:18 +0100 Subject: [PATCH 014/157] Add vout_egl --- configure | 6 + libavdevice/Makefile | 1 + libavdevice/alldevices.c | 1 + libavdevice/egl_vout.c | 811 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 819 insertions(+) create mode 100644 libavdevice/egl_vout.c diff --git a/configure b/configure index fb72aa89a6..a4ffd87976 100755 --- a/configure +++ b/configure @@ -347,6 +347,7 @@ External library support: --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] --enable-sand enable sand video formats [rpi] --enable-vout-drm enable the vout_drm module - for internal testing only [no] + --enable-vout-egl enable the vout_egl module - for internal testing only [no] --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] --disable-nvenc disable Nvidia video encoding code [autodetect] --enable-omx enable OpenMAX IL code [no] @@ -1818,6 +1819,7 @@ EXTERNAL_LIBRARY_LIST=" libdav1d libdc1394 libdrm + epoxy libflite libfontconfig libfreetype @@ -1942,6 +1944,7 @@ FEATURE_LIST=" static swscale_alpha vout_drm + vout_egl " # this list should be kept in linking order @@ -3565,6 +3568,8 @@ v4l2_outdev_deps="libdrm" v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_outdev_suggest="libv4l2" vout_drm_outdev_deps="libdrm vout_drm" +vout_egl_outdev_deps="xlib" +vout_egl_outdev_select="epoxy" vfwcap_indev_deps="vfw32 vfwcap_defines" xcbgrab_indev_deps="libxcb" xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" @@ -6596,6 +6601,7 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion +enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && warn "using libfdk without pkg-config"; } } diff --git a/libavdevice/Makefile b/libavdevice/Makefile index 36aac30186..0989cb895f 100644 --- a/libavdevice/Makefile +++ 
b/libavdevice/Makefile @@ -49,6 +49,7 @@ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o +OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o OBJS-$(CONFIG_XV_OUTDEV) += xv.o diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c index e2a8669f27..ffb410b92d 100644 --- a/libavdevice/alldevices.c +++ b/libavdevice/alldevices.c @@ -53,6 +53,7 @@ extern const AVInputFormat ff_v4l2_demuxer; extern const FFOutputFormat ff_v4l2_muxer; extern const AVInputFormat ff_vfwcap_demuxer; extern const FFOutputFormat ff_vout_drm_muxer; +extern const FFOutputFormat ff_vout_egl_muxer; extern const AVInputFormat ff_xcbgrab_demuxer; extern const FFOutputFormat ff_xv_muxer; diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c new file mode 100644 index 0000000000..7b9c610ace --- /dev/null +++ b/libavdevice/egl_vout.c @@ -0,0 +1,811 @@ +/* + * Copyright (c) 2020 John Cox for Raspberry Pi Trading + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +// *** This module is a work in progress and its utility is strictly +// limited to testing. +// Amongst other issues it doesn't wait for the pic to be displayed before +// returning the buffer so flikering does occur. + +#include +#include + +#include "libavutil/opt.h" +#include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" +#include "libavutil/imgutils.h" +#include "libavutil/hwcontext_drm.h" +#include "libavformat/mux.h" +#include "avdevice.h" + +#include "pthread.h" +#include +#include +#include + +#include +#include + +#include "libavutil/rpi_sand_fns.h" + +#define TRACE_ALL 0 + +struct egl_setup { + int conId; + + Display *dpy; + EGLDisplay egl_dpy; + EGLContext ctx; + EGLSurface surf; + Window win; + + uint32_t crtcId; + int crtcIdx; + uint32_t planeId; + struct { + int x, y, width, height; + } compose; +}; + +typedef struct egl_aux_s { + int fd; + GLuint texture; + +} egl_aux_t; + +typedef struct egl_display_env_s +{ + AVClass *class; + + struct egl_setup setup; + enum AVPixelFormat avfmt; + + int show_all; + int window_width, window_height; + int window_x, window_y; + int fullscreen; + + egl_aux_t aux[32]; + + pthread_t q_thread; + pthread_mutex_t q_lock; + sem_t display_start_sem; + sem_t q_sem; + int q_terminate; + AVFrame * q_this; + AVFrame * q_next; + +} egl_display_env_t; + + +/** + * Remove window border/decorations. 
+ */ +static void +no_border( Display *dpy, Window w) +{ + static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); + static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; + + typedef struct + { + unsigned long flags; + unsigned long functions; + unsigned long decorations; + long inputMode; + unsigned long status; + } PropMotifWmHints; + + PropMotifWmHints motif_hints; + Atom prop, proptype; + unsigned long flags = 0; + + /* setup the property */ + motif_hints.flags = MWM_HINTS_DECORATIONS; + motif_hints.decorations = flags; + + /* get the atom for the property */ + prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); + if (!prop) { + /* something went wrong! */ + return; + } + + /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ + proptype = prop; + + XChangeProperty( dpy, w, /* display, window */ + prop, proptype, /* property, type */ + 32, /* format: 32-bit datums */ + PropModeReplace, /* mode */ + (unsigned char *) &motif_hints, /* data */ + PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ + ); +} + + +/* + * Create an RGB, double-buffered window. + * Return the window and context handles. + */ +static int +make_window(struct AVFormatContext * const s, + egl_display_env_t * const de, + Display *dpy, EGLDisplay egl_dpy, const char *name, + Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) +{ + int scrnum = DefaultScreen( dpy ); + XSetWindowAttributes attr; + unsigned long mask; + Window root = RootWindow( dpy, scrnum ); + Window win; + EGLContext ctx; + const int fullscreen = de->fullscreen; + EGLConfig config; + int x = de->window_x; + int y = de->window_y; + int width = de->window_width ? de->window_width : 1280; + int height = de->window_height ? de->window_height : 720; + + + if (fullscreen) { + int scrnum = DefaultScreen(dpy); + + x = 0; y = 0; + width = DisplayWidth(dpy, scrnum); + height = DisplayHeight(dpy, scrnum); + } + + { + EGLint num_configs; + static const EGLint attribs[] = { + EGL_RED_SIZE, 1, + EGL_GREEN_SIZE, 1, + EGL_BLUE_SIZE, 1, + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL_NONE + }; + + if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { + av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); + return -1; + } + } + + { + EGLint vid; + if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { + av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); + return -1; + } + + { + XVisualInfo visTemplate = { + .visualid = vid, + }; + int num_visuals; + XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, + &visTemplate, &num_visuals); + + /* window attributes */ + attr.background_pixel = 0; + attr.border_pixel = 0; + attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); + attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; + /* XXX this is a bad way to get a borderless window! 
*/ + mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; + + win = XCreateWindow( dpy, root, x, y, width, height, + 0, visinfo->depth, InputOutput, + visinfo->visual, mask, &attr ); + XFree(visinfo); + } + } + + if (fullscreen) + no_border(dpy, win); + + /* set hints and properties */ + { + XSizeHints sizehints; + sizehints.x = x; + sizehints.y = y; + sizehints.width = width; + sizehints.height = height; + sizehints.flags = USSize | USPosition; + XSetNormalHints(dpy, win, &sizehints); + XSetStandardProperties(dpy, win, name, name, + None, (char **)NULL, 0, &sizehints); + } + + eglBindAPI(EGL_OPENGL_ES_API); + + { + static const EGLint ctx_attribs[] = { + EGL_CONTEXT_CLIENT_VERSION, 2, + EGL_NONE + }; + ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); + if (!ctx) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); + return -1; + } + } + + + XMapWindow(dpy, win); + + { + EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); + if (!surf) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); + return -1; + } + + if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); + return -1; + } + + *winRet = win; + *ctxRet = ctx; + *surfRet = surf; + } + + return 0; +} + +static GLint +compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) +{ + GLuint s = glCreateShader(target); + + if (s == 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); + return 0; + } + + glShaderSource(s, 1, (const GLchar **) &source, NULL); + glCompileShader(s); + + { + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + + if (!ok) { + GLchar *info; + GLint size; + + glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); + info = malloc(size); + + glGetShaderInfoLog(s, size, NULL, info); + av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); + + return 0; + } + } + + return s; +} + +static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) +{ + GLuint prog = glCreateProgram(); + + if (prog == 0) { + av_log(s, AV_LOG_ERROR, "Failed to create program\n"); + return 0; + } + + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glLinkProgram(prog); + + { + GLint ok; + glGetProgramiv(prog, GL_LINK_STATUS, &ok); + if (!ok) { + /* Some drivers return a size of 1 for an empty log. This is the size + * of a log that contains only a terminating NUL character. + */ + GLint size; + GLchar *info = NULL; + glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); + if (size > 1) { + info = malloc(size); + glGetProgramInfoLog(prog, size, NULL, info); + } + + av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", + (info != NULL) ? 
info : ""); + return 0; + } + } + + return prog; +} + +static int +gl_setup(struct AVFormatContext * const s) +{ + const char *vs = + "attribute vec4 pos;\n" + "varying vec2 texcoord;\n" + "\n" + "void main() {\n" + " gl_Position = pos;\n" + " texcoord.x = (pos.x + 1.0) / 2.0;\n" + " texcoord.y = (-pos.y + 1.0) / 2.0;\n" + "}\n"; + const char *fs = + "#extension GL_OES_EGL_image_external : enable\n" + "precision mediump float;\n" + "uniform samplerExternalOES s;\n" + "varying vec2 texcoord;\n" + "void main() {\n" + " gl_FragColor = texture2D(s, texcoord);\n" + "}\n"; + + GLuint vs_s; + GLuint fs_s; + GLuint prog; + + if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || + !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || + !(prog = link_program(s, vs_s, fs_s))) + return -1; + + glUseProgram(prog); + + { + static const float verts[] = { + -1, -1, + 1, -1, + 1, 1, + -1, 1, + }; + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); + } + + glEnableVertexAttribArray(0); + return 0; +} + +static int egl_vout_write_trailer(AVFormatContext *s) +{ +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + + return 0; +} + +static int egl_vout_write_header(AVFormatContext *s) +{ + const AVCodecParameters * const par = s->streams[0]->codecpar; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + if ( s->nb_streams > 1 + || par->codec_type != AVMEDIA_TYPE_VIDEO + || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { + av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); + return AVERROR(EINVAL); + } + + return 0; +} + + +static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) +{ + const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; + egl_aux_t * da = NULL; + unsigned int i; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); +#endif + + for (i = 0; i != 32; ++i) { + if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { + da = de->aux + i; + break; + } + } + + if (da == NULL) { + av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); + return AVERROR(EINVAL); + } + + if (da->texture == 0) { + EGLint attribs[50]; + EGLint * a = attribs; + int i, j; + static const EGLint anames[] = { + EGL_DMA_BUF_PLANE0_FD_EXT, + EGL_DMA_BUF_PLANE0_OFFSET_EXT, + EGL_DMA_BUF_PLANE0_PITCH_EXT, + EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE1_FD_EXT, + EGL_DMA_BUF_PLANE1_OFFSET_EXT, + EGL_DMA_BUF_PLANE1_PITCH_EXT, + EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE2_FD_EXT, + EGL_DMA_BUF_PLANE2_OFFSET_EXT, + EGL_DMA_BUF_PLANE2_PITCH_EXT, + EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, + }; + const EGLint * b = anames; + + *a++ = EGL_WIDTH; + *a++ = av_frame_cropped_width(frame); + *a++ = EGL_HEIGHT; + *a++ = av_frame_cropped_height(frame); + *a++ = EGL_LINUX_DRM_FOURCC_EXT; + *a++ = desc->layers[0].format; + + for (i = 0; i < desc->nb_layers; ++i) { + for (j = 0; j < desc->layers[i].nb_planes; ++j) { + const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; + const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; + *a++ = *b++; + *a++ = obj->fd; + *a++ = *b++; + *a++ = p->offset; + *a++ = *b++; + *a++ = p->pitch; + if (obj->format_modifier == 0) { + b += 2; + } + else { + *a++ = *b++; + *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); + *a++ = *b++; + *a++ = (EGLint)(obj->format_modifier >> 32); + } + } + } + + *a = 
EGL_NONE; + +#if TRACE_ALL + for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { + av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); + } +#endif + { + const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, + EGL_NO_CONTEXT, + EGL_LINUX_DMA_BUF_EXT, + NULL, attribs); + if (!image) { + av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); + return -1; + } + + glGenTextures(1, &da->texture); + glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); + glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); + + eglDestroyImageKHR(de->setup.egl_dpy, image); + } + + da->fd = desc->objects[0].fd; + +#if 0 + av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," + " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", + av_frame_cropped_width(frame), + av_frame_cropped_height(frame), + desc->layers[0].format, + bo_plane_handles[0], + bo_plane_handles[1], + bo_plane_handles[2], + bo_plane_handles[3], + pitches[0], + pitches[1], + pitches[2], + pitches[3], + offsets[0], + offsets[1], + offsets[2], + offsets[3], + (long long)modifiers[0], + (long long)modifiers[1], + (long long)modifiers[2], + (long long)modifiers[3] + ); +#endif + } + + glClearColor(0.5, 0.5, 0.5, 0.5); + glClear(GL_COLOR_BUFFER_BIT); + + glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); + + glDeleteTextures(1, &da->texture); + da->texture = 0; + da->fd = -1; + + return 0; +} + +static void * display_thread(void * v) +{ + AVFormatContext * const s = v; + egl_display_env_t * const de = s->priv_data; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); +#endif + { + EGLint egl_major, egl_minor; + + de->setup.dpy = XOpenDisplay(NULL); + if (!de->setup.dpy) { + av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); + goto fail; + } + + de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); + if (!de->setup.egl_dpy) { + av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); + goto fail; + } + + if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { + av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); + goto fail; + } + + av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); + + if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { + av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); + goto fail; + } + } + + if (!de->window_width || !de->window_height) { + de->window_width = 1280; + de->window_height = 720; + } + if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", + &de->setup.win, &de->setup.ctx, &de->setup.surf)) { + av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); + goto fail; + } + + if (gl_setup(s)) { + av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); + goto fail; + } + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__); +#endif + sem_post(&de->display_start_sem); + + for (;;) { + AVFrame * frame; + + while (sem_wait(&de->q_sem) != 0) { + av_assert0(errno == EINTR); + } + + if (de->q_terminate) + break; + + pthread_mutex_lock(&de->q_lock); + frame = de->q_next; + de->q_next = NULL; + pthread_mutex_unlock(&de->q_lock); + + do_display(s, de, frame); + + av_frame_free(&de->q_this); + de->q_this = frame; + } + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); +#endif + + return NULL; + 
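// The attribute list assembled in do_display() above follows the
// EGL_EXT_image_dma_buf_import extension; for a single-plane buffer with no
// format modifier it reduces to roughly the sketch below (the format, fd, stride
// and dimensions are placeholders, not values taken from this module):
//
//   const EGLint attribs[] = {
//       EGL_WIDTH, width, EGL_HEIGHT, height,
//       EGL_LINUX_DRM_FOURCC_EXT, DRM_FORMAT_XRGB8888,
//       EGL_DMA_BUF_PLANE0_FD_EXT, dmabuf_fd,
//       EGL_DMA_BUF_PLANE0_OFFSET_EXT, 0,
//       EGL_DMA_BUF_PLANE0_PITCH_EXT, stride,
//       EGL_NONE
//   };
//   EGLImage image = eglCreateImageKHR(egl_dpy, EGL_NO_CONTEXT,
//                                      EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
//   glBindTexture(GL_TEXTURE_EXTERNAL_OES, tex);            /* tex from glGenTextures() */
//   glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);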
+fail: +#if TRACE_ALL + av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__); +#endif + de->q_terminate = 1; + sem_post(&de->display_start_sem); + + return NULL; +} + +static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) +{ + const AVFrame * const src_frame = (AVFrame *)pkt->data; + AVFrame * frame; + egl_display_env_t * const de = s->priv_data; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + + if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { + frame = av_frame_alloc(); + av_frame_ref(frame, src_frame); + } + else if (src_frame->format == AV_PIX_FMT_VAAPI) { + frame = av_frame_alloc(); + frame->format = AV_PIX_FMT_DRM_PRIME; + if (av_hwframe_map(frame, src_frame, 0) != 0) + { + av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); + av_frame_free(&frame); + return AVERROR(EINVAL); + } + } + else { + av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); + return AVERROR(EINVAL); + } + + // Really hacky sync + while (de->show_all && de->q_next) { + usleep(3000); + } + + pthread_mutex_lock(&de->q_lock); + { + AVFrame * const t = de->q_next; + de->q_next = frame; + frame = t; + } + pthread_mutex_unlock(&de->q_lock); + + if (frame == NULL) + sem_post(&de->q_sem); + else + av_frame_free(&frame); + + return 0; +} + +static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, + unsigned flags) +{ + av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); + return AVERROR_PATCHWELCOME; +} + +static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) +{ +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); +#endif + switch(type) { + case AV_APP_TO_DEV_WINDOW_REPAINT: + return 0; + default: + break; + } + return AVERROR(ENOSYS); +} + +// deinit is called if init fails so no need to clean up explicity here +static int egl_vout_init(struct AVFormatContext * s) +{ + egl_display_env_t * const de = s->priv_data; + unsigned int i; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + + de->setup = (struct egl_setup){0}; + + for (i = 0; i != 32; ++i) { + de->aux[i].fd = -1; + } + + de->q_terminate = 0; + pthread_mutex_init(&de->q_lock, NULL); + sem_init(&de->q_sem, 0, 0); + sem_init(&de->display_start_sem, 0, 0); + av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); + + sem_wait(&de->display_start_sem); + if (de->q_terminate) { + av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); + return -1; + } + + av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); + + return 0; +} + +static void egl_vout_deinit(struct AVFormatContext * s) +{ + egl_display_env_t * const de = s->priv_data; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + + de->q_terminate = 1; + sem_post(&de->q_sem); + pthread_join(de->q_thread, NULL); + sem_destroy(&de->q_sem); + pthread_mutex_destroy(&de->q_lock); + + av_frame_free(&de->q_next); + av_frame_free(&de->q_this); + + av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); +} + +#define OFFSET(x) offsetof(egl_display_env_t, x) +static const AVOption options[] = { + { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM 
}, + { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, + { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { NULL } + +}; + +static const AVClass egl_vout_class = { + .class_name = "egl vid outdev", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, + .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, +}; + +FFOutputFormat ff_vout_egl_muxer = { + .p = { + .name = "vout_egl", + .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"), + .audio_codec = AV_CODEC_ID_NONE, + .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, + .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, + .priv_class = &egl_vout_class, + }, + .priv_data_size = sizeof(egl_display_env_t), + .write_header = egl_vout_write_header, + .write_packet = egl_vout_write_packet, + .write_uncoded_frame = egl_vout_write_frame, + .write_trailer = egl_vout_write_trailer, + .control_message = egl_vout_control_message, + .init = egl_vout_init, + .deinit = egl_vout_deinit, +}; + -- 2.43.0 From 1de210e3ecf697bd3d15b277352691aac2527ed5 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 28 Apr 2021 12:51:22 +0100 Subject: [PATCH 015/157] V4L2 stateful rework --- libavcodec/Makefile | 3 +- libavcodec/v4l2_buffers.c | 556 +++++++++++++++++++++++++++----------- libavcodec/v4l2_buffers.h | 28 +- libavcodec/v4l2_context.c | 536 +++++++++++++++++++++++++++--------- libavcodec/v4l2_context.h | 20 +- libavcodec/v4l2_m2m.c | 20 +- libavcodec/v4l2_m2m.h | 31 +++ libavcodec/v4l2_m2m_dec.c | 446 ++++++++++++++++++++++++++---- 8 files changed, 1286 insertions(+), 354 deletions(-) diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 2d440b5648..e1aa0ba014 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -169,7 +169,8 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o OBJS-$(CONFIG_VP3DSP) += vp3dsp.o OBJS-$(CONFIG_VP56DSP) += vp56dsp.o OBJS-$(CONFIG_VP8DSP) += vp8dsp.o -OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o +OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ + weak_link.o OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ v4l2_req_devscan.o weak_link.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 3f5471067a..a003934ca1 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include #include #include #include @@ -29,12 +30,14 @@ #include #include "libavcodec/avcodec.h" #include "libavutil/pixdesc.h" +#include "libavutil/hwcontext.h" #include "v4l2_context.h" #include "v4l2_buffers.h" #include "v4l2_m2m.h" +#include "weak_link.h" #define USEC_PER_SEC 1000000 -static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; +static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) { @@ -51,34 +54,44 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) { V4L2m2mContext *s = buf_to_m2mctx(avbuf); - - if (s->avctx->pkt_timebase.num) - return s->avctx->pkt_timebase; - return s->avctx->time_base; + const AVRational tb = s->avctx->pkt_timebase.num ? + s->avctx->pkt_timebase : + s->avctx->time_base; + return tb.num && tb.den ? 
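        /*
         * Fallback order: pkt_timebase when set, else time_base, else the
         * fixed 1/1000000 V4L2 timebase.  As a worked example of what
         * v4l2_set_pts()/v4l2_get_pts() below do with it (assuming a 1/90000
         * source timebase): pts 180000 -> av_rescale_q(180000, {1,90000},
         * {1,1000000}) = 2000000us -> tv_sec 2, tv_usec 0, with the reverse
         * mapping applied on dequeue.
         */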
tb : v4l2_timebase; } -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) +static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) { - int64_t v4l2_pts; - - if (pts == AV_NOPTS_VALUE) - pts = 0; - /* convert pts to v4l2 timebase */ - v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); + const int64_t v4l2_pts = + no_rescale ? pts : + pts == AV_NOPTS_VALUE ? 0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; } -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) +static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) { - int64_t v4l2_pts; - /* convert pts back to encoder timebase */ - v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + + const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + avbuf->buf.timestamp.tv_usec; - return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); + return + no_rescale ? v4l2_pts : + v4l2_pts == 0 ? AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +} + +static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { + out->planes[plane].bytesused = bytesused; + out->planes[plane].length = length; + } else { + out->buf.bytesused = bytesused; + out->buf.length = length; + } } static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) @@ -209,68 +222,143 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) return AVCOL_TRC_UNSPECIFIED; } -static void v4l2_free_buffer(void *opaque, uint8_t *unused) +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) { - V4L2Buffer* avbuf = opaque; - V4L2m2mContext *s = buf_to_m2mctx(avbuf); + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; - if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { - atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; - if (s->reinit) { - if (!atomic_load(&s->refcount)) - sem_post(&s->refsync); - } else { - if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) { - /* no need to queue more buffers to the driver */ - avbuf->status = V4L2BUF_AVAILABLE; - } - else if (avbuf->context->streamon) - ff_v4l2_buffer_enqueue(avbuf); - } + layer = &drm_desc->layers[0]; + layer->nb_planes = avbuf->num_planes; + + for (int i = 0; i < avbuf->num_planes; i++) { + layer->planes[i].object_index = i; + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } + + switch (avbuf->context->av_pix_fmt) { + case AV_PIX_FMT_YUYV422: + + layer->format = DRM_FORMAT_YUYV; + layer->nb_planes = 1; + + break; + + case AV_PIX_FMT_NV12: + case AV_PIX_FMT_NV21: + + layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? 
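        /*
         * Worked example for the single-object case handled just below,
         * assuming a 1920x1080 NV12 capture buffer with bytesperline == 1920
         * and fmt.pix.height == 1080 (real drivers may pad both, in which
         * case the padded values apply): the chroma plane reuses object 0
         * with offset = 1920 * 1080 = 2073600 and pitch = 1920, i.e. the
         * interleaved CbCr rows start immediately after the luma plane.
         */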
+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 2; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + avbuf->context->format.fmt.pix.height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; + break; + + case AV_PIX_FMT_YUV420P: + + layer->format = DRM_FORMAT_YUV420; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 3; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + avbuf->context->format.fmt.pix.height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; + + layer->planes[2].object_index = 0; + layer->planes[2].offset = layer->planes[1].offset + + ((avbuf->plane_info[0].bytesperline * + avbuf->context->format.fmt.pix.height) >> 2); + layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; + break; - av_buffer_unref(&avbuf->context_ref); + default: + drm_desc->nb_layers = 0; + break; } + + return (uint8_t *) drm_desc; } -static int v4l2_buf_increase_ref(V4L2Buffer *in) +static void v4l2_free_bufref(void *opaque, uint8_t *data) { - V4L2m2mContext *s = buf_to_m2mctx(in); + AVBufferRef * bufref = (AVBufferRef *)data; + V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; + struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); - if (in->context_ref) - atomic_fetch_add(&in->context_refcount, 1); - else { - in->context_ref = av_buffer_ref(s->self_ref); - if (!in->context_ref) - return AVERROR(ENOMEM); + if (ctx != NULL) { + // Buffer still attached to context + V4L2m2mContext *s = buf_to_m2mctx(avbuf); - in->context_refcount = 1; - } + ff_mutex_lock(&ctx->lock); - in->status = V4L2BUF_RET_USER; - atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); + avbuf->status = V4L2BUF_AVAILABLE; - return 0; + if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); + /* no need to queue more buffers to the driver */ + } + else if (ctx->streamon) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name); + avbuf->buf.timestamp.tv_sec = 0; + avbuf->buf.timestamp.tv_usec = 0; + ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER + } + else { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name); + } + + ff_mutex_unlock(&ctx->lock); + } + + ff_weak_link_unlock(avbuf->context_wl); + av_buffer_unref(&bufref); } -static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) { - int ret; + struct v4l2_exportbuffer expbuf; + int i, ret; - if (plane >= in->num_planes) - return AVERROR(EINVAL); + for (i = 0; i < avbuf->num_planes; i++) { + memset(&expbuf, 0, sizeof(expbuf)); - /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ - *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, - in->plane_info[plane].length, v4l2_free_buffer, in, 0); - if (!*buf) - return AVERROR(ENOMEM); + expbuf.index = avbuf->buf.index; + expbuf.type = avbuf->buf.type; + expbuf.plane = i; - ret = v4l2_buf_increase_ref(in); - if (ret) - av_buffer_unref(buf); + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); + if (ret < 0) + return AVERROR(errno); - return ret; + if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { + /* drm frame */ + avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; + avbuf->drm_frame.objects[i].fd = 
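        /*
         * VIDIOC_EXPBUF hands back a dmabuf fd for each queried plane; those
         * fds are what populate drm_frame.objects[], always tagged here with
         * the DRM_FORMAT_MOD_LINEAR modifier, and they are only closed again
         * in v4l2_buffer_buffer_free() when the buffer itself is torn down.
         */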
expbuf.fd; + avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + } else { + /* drm frame */ + avbuf->drm_frame.objects[0].size = avbuf->buf.length; + avbuf->drm_frame.objects[0].fd = expbuf.fd; + avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + } + } + + return 0; } static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) @@ -285,30 +373,50 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); - if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { - out->planes[plane].bytesused = bytesused; - out->planes[plane].length = length; - } else { - out->buf.bytesused = bytesused; - out->buf.length = length; - } + set_buf_length(out, plane, bytesused, length); return 0; } +static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) +{ + AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]); + AVBufferRef * newbuf; + + if (!bufref) + return NULL; + + newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0); + if (newbuf == NULL) + av_buffer_unref(&bufref); + + avbuf->status = V4L2BUF_RET_USER; + return newbuf; +} + static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) { - int i, ret; + int i; frame->format = avbuf->context->av_pix_fmt; - for (i = 0; i < avbuf->num_planes; i++) { - ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); - if (ret) - return ret; + frame->buf[0] = wrap_avbuf(avbuf); + if (frame->buf[0] == NULL) + return AVERROR(ENOMEM); + + if (buf_to_m2mctx(avbuf)->output_drm) { + /* 1. get references to the actual data */ + frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); + frame->format = AV_PIX_FMT_DRM_PRIME; + frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); + return 0; + } + + /* 1. get references to the actual data */ + for (i = 0; i < avbuf->num_planes; i++) { + frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset; frame->linesize[i] = avbuf->plane_info[i].bytesperline; - frame->data[i] = frame->buf[i]->data; } /* fixup special cases */ @@ -337,68 +445,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) return 0; } +static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) +{ + if (dst_stride == src_stride && w + 32 >= dst_stride) { + memcpy(dst, src, dst_stride * h); + } + else { + while (--h >= 0) { + memcpy(dst, src, w); + dst += dst_stride; + src += src_stride; + } + } +} + +static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) +{ + return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); +} + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { - int i, ret; - struct v4l2_format fmt = out->context->format; - int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? - fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; - int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? 
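    /*
     * Note on ownership with the wrap_avbuf() scheme above: every frame or
     * packet handed to the caller owns a small AVBufferRef whose payload is
     * itself a reference to the context's bufrefs[] entry.  When the caller
     * releases it, v4l2_free_bufref() runs, takes the context weak link and
     * either requeues the V4L2 buffer (stream still on) or just marks it
     * V4L2BUF_AVAILABLE, replacing the per-plane context_refcount/refsync
     * machinery removed elsewhere in this patch.
     */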
- fmt.fmt.pix_mp.height : fmt.fmt.pix.height; - int is_planar_format = 0; - - switch (pixel_format) { - case V4L2_PIX_FMT_YUV420M: - case V4L2_PIX_FMT_YVU420M: -#ifdef V4L2_PIX_FMT_YUV422M - case V4L2_PIX_FMT_YUV422M: -#endif -#ifdef V4L2_PIX_FMT_YVU422M - case V4L2_PIX_FMT_YVU422M: -#endif -#ifdef V4L2_PIX_FMT_YUV444M - case V4L2_PIX_FMT_YUV444M: -#endif -#ifdef V4L2_PIX_FMT_YVU444M - case V4L2_PIX_FMT_YVU444M: -#endif - case V4L2_PIX_FMT_NV12M: - case V4L2_PIX_FMT_NV21M: - case V4L2_PIX_FMT_NV12MT_16X16: - case V4L2_PIX_FMT_NV12MT: - case V4L2_PIX_FMT_NV16M: - case V4L2_PIX_FMT_NV61M: - is_planar_format = 1; - } - - if (!is_planar_format) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); - int planes_nb = 0; - int offset = 0; - - for (i = 0; i < desc->nb_components; i++) - planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); - - for (i = 0; i < planes_nb; i++) { - int size, h = height; - if (i == 1 || i == 2) { + int i; + int num_planes = 0; + int pel_strides[4] = {0}; + + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + + if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { + av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); + return -1; + } + + for (i = 0; i != desc->nb_components; ++i) { + if (desc->comp[i].plane >= num_planes) + num_planes = desc->comp[i].plane + 1; + pel_strides[desc->comp[i].plane] = desc->comp[i].step; + } + + if (out->num_planes > 1) { + if (num_planes != out->num_planes) { + av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes); + return -1; + } + for (i = 0; i != num_planes; ++i) { + int w = frame->width; + int h = frame->height; + if (is_chroma(desc, i, num_planes)) { + w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); } - size = frame->linesize[i] * h; - ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset); - if (ret) - return ret; - offset += size; + + cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, + frame->data[i], frame->linesize[i], + w * pel_strides[i], h); + set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); } - return 0; } + else + { + unsigned int offset = 0; + + for (i = 0; i != num_planes; ++i) { + int w = frame->width; + int h = frame->height; + int dst_stride = out->plane_info[0].bytesperline; + uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; + + if (is_chroma(desc, i, num_planes)) { + // Is chroma + dst_stride >>= desc->log2_chroma_w; + offset += dst_stride * (out->context->height >> desc->log2_chroma_h); + w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); + h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); + } + else { + // Is luma or alpha + offset += dst_stride * out->context->height; + } + if (offset > out->plane_info[0].length) { + av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); + return -1; + } - for (i = 0; i < out->num_planes; i++) { - ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0); - if (ret) - return ret; + cpy_2d(dst, dst_stride, + frame->data[i], frame->linesize[i], + w * pel_strides[i], h); + } + set_buf_length(out, 0, offset, out->plane_info[0].length); } - return 0; } @@ -410,14 +545,15 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { - v4l2_set_pts(out, frame->pts); + v4l2_set_pts(out, frame->pts, 0); 
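    /*
     * Worked example of the plane sizing done in v4l2_buffer_swframe_to_buf()
     * above, assuming a 1920x1080 NV12 frame and bytesperline == 1920: the
     * luma plane copies 1920 bytes x 1080 rows, the chroma plane copies
     * (1920 >> 1) pels * step 2 = 1920 bytes x 540 rows, and since source and
     * destination strides match (1920 + 32 >= 1920) cpy_2d() collapses each
     * copy into a single memcpy of stride * rows.
     */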
return v4l2_buffer_swframe_to_buf(frame, out); } -int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) +int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) { int ret; + V4L2Context * const ctx = avbuf->context; av_frame_unref(frame); @@ -432,13 +568,22 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) frame->colorspace = v4l2_get_color_space(avbuf); frame->color_range = v4l2_get_color_range(avbuf); frame->color_trc = v4l2_get_color_trc(avbuf); - frame->pts = v4l2_get_pts(avbuf); + frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); frame->pkt_dts = AV_NOPTS_VALUE; /* these values are updated also during re-init in v4l2_process_driver_event */ - frame->height = avbuf->context->height; - frame->width = avbuf->context->width; - frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio; + frame->height = ctx->height; + frame->width = ctx->width; + frame->sample_aspect_ratio = ctx->sample_aspect_ratio; + + if (ctx->selection.height && ctx->selection.width) { + frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0; + frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0; + frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? + frame->width - (ctx->selection.left + ctx->selection.width) : 0; + frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? + frame->height - (ctx->selection.top + ctx->selection.height) : 0; + } /* 3. report errors upstream */ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { @@ -451,15 +596,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) { - int ret; - av_packet_unref(pkt); - ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); - if (ret) - return ret; + + pkt->buf = wrap_avbuf(avbuf); + if (pkt->buf == NULL) + return AVERROR(ENOMEM); pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
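    /*
     * Worked example of the crop fields set above from the driver's compose
     * rectangle: a coded frame of 1920x1088 with a 1920x1080 selection at
     * (0,0) gives crop_left = crop_top = crop_right = 0 and
     * crop_bottom = 1088 - 1080 = 8, so callers see the intended 1920x1080
     * picture without another copy.
     */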
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; + pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) pkt->flags |= AV_PKT_FLAG_KEY; @@ -469,20 +613,27 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) pkt->flags |= AV_PKT_FLAG_CORRUPT; } - pkt->dts = pkt->pts = v4l2_get_pts(avbuf); + pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); return 0; } -int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + const void *extdata, size_t extlen, int no_rescale_pts) { int ret; - ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0); + if (extlen) { + ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); + if (ret) + return ret; + } + + ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); if (ret) return ret; - v4l2_set_pts(out, pkt->pts); + v4l2_set_pts(out, pkt->pts, no_rescale_pts); if (pkt->flags & AV_PKT_FLAG_KEY) out->flags = V4L2_BUF_FLAG_KEYFRAME; @@ -490,15 +641,61 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) return 0; } -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +{ + return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); +} + + +static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) +{ + V4L2Buffer * const avbuf = (V4L2Buffer *)data; + int i; + + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { + struct V4L2Plane_info *p = avbuf->plane_info + i; + if (p->mm_addr != NULL) + munmap(p->mm_addr, p->length); + } + + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + if (avbuf->drm_frame.objects[i].fd != -1) + close(avbuf->drm_frame.objects[i].fd); + } + + ff_weak_link_unref(&avbuf->context_wl); + + av_free(avbuf); +} + + +int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) { - V4L2Context *ctx = avbuf->context; int ret, i; + V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); + AVBufferRef * bufref; + + *pbufref = NULL; + if (avbuf == NULL) + return AVERROR(ENOMEM); + + bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); + if (bufref == NULL) { + av_free(avbuf); + return AVERROR(ENOMEM); + } + avbuf->context = ctx; avbuf->buf.memory = V4L2_MEMORY_MMAP; avbuf->buf.type = ctx->type; avbuf->buf.index = index; + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + avbuf->drm_frame.objects[i].fd = -1; + } + + avbuf->context_wl = ff_weak_link_ref(ctx->wl_master); + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.length = VIDEO_MAX_PLANES; avbuf->buf.m.planes = avbuf->planes; @@ -506,7 +703,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); if (ret < 0) - return AVERROR(errno); + goto fail; if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->num_planes = 0; @@ -526,25 +723,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + + if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || + !buf_to_m2mctx(avbuf)->output_drm) { + 
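            /* i.e. mmap everything except DRM-exported CAPTURE buffers: with
             * output_drm set the capture planes are only ever handed out as
             * dmabuf fds (see v4l2_buffer_export_drm()), so a CPU mapping
             * would go unused. */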
avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + } } else { avbuf->plane_info[i].length = avbuf->buf.length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + + if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || + !buf_to_m2mctx(avbuf)->output_drm) { + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + } } - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) - return AVERROR(ENOMEM); + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { + avbuf->plane_info[i].mm_addr = NULL; + ret = AVERROR(ENOMEM); + goto fail; + } } avbuf->status = V4L2BUF_AVAILABLE; - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - return 0; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; @@ -554,7 +759,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.length = avbuf->planes[0].length; } - return ff_v4l2_buffer_enqueue(avbuf); + if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { + if (buf_to_m2mctx(avbuf)->output_drm) { + ret = v4l2_buffer_export_drm(avbuf); + if (ret) + goto fail; + } + } + + *pbufref = bufref; + return 0; + +fail: + av_buffer_unref(&bufref); + return ret; } int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) @@ -563,9 +781,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) avbuf->buf.flags = avbuf->flags; + if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, + avbuf->context->q_count); + } + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); - if (ret < 0) - return AVERROR(errno); + if (ret < 0) { + int err = errno; + av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n", + avbuf->context->name, avbuf->buf.index, + err, strerror(err)); + return AVERROR(err); + } + + ++avbuf->context->q_count; + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, + avbuf->context->q_count); avbuf->status = V4L2BUF_IN_DRIVER; diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 3d2ff1b9a5..111526aee3 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -28,27 +28,37 @@ #include #include +#include "avcodec.h" #include "libavutil/buffer.h" #include "libavutil/frame.h" +#include "libavutil/hwcontext_drm.h" #include "packet.h" enum V4L2Buffer_status { V4L2BUF_AVAILABLE, V4L2BUF_IN_DRIVER, + V4L2BUF_IN_USE, V4L2BUF_RET_USER, }; /** * V4L2Buffer (wrapper for v4l2_buffer management) */ +struct V4L2Context; +struct ff_weak_link_client; + typedef struct V4L2Buffer { - /* each buffer needs to have a reference to its context */ + /* each buffer needs to have a reference to its context + * The pointer is good enough for most operation but once the buffer has + * been passed to the user the buffer may become orphaned so for free ops + * the weak link must be used to ensure that the context is actually + * there + */ struct V4L2Context *context; + struct ff_weak_link_client 
*context_wl; - /* This object is refcounted per-plane, so we need to keep track - * of how many context-refs we are holding. */ - AVBufferRef *context_ref; - atomic_uint context_refcount; + /* DRM descriptor */ + AVDRMFrameDescriptor drm_frame; /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { @@ -73,11 +83,12 @@ typedef struct V4L2Buffer { * * @param[in] frame The AVFRame to push the information to * @param[in] buf The V4L2Buffer to get the information from + * @param[in] no_rescale_pts If non-zero do not rescale PTS * * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, * AVERROR(ENOMEM) if the AVBufferRef can't be created. */ -int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); +int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); /** * Extracts the data from a V4L2Buffer to an AVPacket @@ -101,6 +112,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); */ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); +int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + const void *extdata, size_t extlen, int no_rescale_pts); + /** * Extracts the data from an AVFrame to a V4L2Buffer * @@ -119,7 +133,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); +int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); /** * Enqueues a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index a40be94690..be76068af3 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ #include #include #include +#include "libavutil/avassert.h" #include "libavcodec/avcodec.h" #include "decode.h" #include "v4l2_buffers.h" #include "v4l2_fmt.h" #include "v4l2_m2m.h" +#include "weak_link.h" struct v4l2_format_update { uint32_t v4l2_fmt; @@ -153,21 +155,99 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd } } -static int v4l2_start_decode(V4L2Context *ctx) +static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r) { - struct v4l2_decoder_cmd cmd = { - .cmd = V4L2_DEC_CMD_START, - .flags = 0, + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + struct v4l2_selection selection = { + .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, + .target = V4L2_SEL_TGT_COMPOSE }; - int ret; - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd); - if (ret) + memset(r, 0, sizeof(*r)); + if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) return AVERROR(errno); + *r = selection.r; return 0; } +static int do_source_change(V4L2m2mContext * const s) +{ + AVCodecContext *const avctx = s->avctx; + + int ret; + int reinit; + int full_reinit; + struct v4l2_format cap_fmt = s->capture.format; + + s->resize_pending = 0; + s->capture.done = 0; + + ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); + return 0; + } + + s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); + + get_default_selection(&s->capture, &s->capture.selection); + + reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); + if (reinit) { + s->capture.height = v4l2_get_height(&cap_fmt); + s->capture.width = v4l2_get_width(&cap_fmt); + } + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + + av_log(avctx, AV_LOG_DEBUG, "Source 
change: SAR: %d/%d, crop %dx%d @ %d,%d\n", + s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, + s->capture.selection.width, s->capture.selection.height, + s->capture.selection.left, s->capture.selection.top); + + s->reinit = 1; + + if (reinit) { + if (avctx) + ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); + if (ret < 0) + av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); + + ret = ff_v4l2_m2m_codec_reinit(s); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); + return AVERROR(EINVAL); + } + goto reinit_run; + } + + /* Buffers are OK so just stream off to ack */ + av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); + + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); + s->draining = 0; + + /* reinit executed */ +reinit_run: + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); + return 1; +} + +static int ctx_done(V4L2Context * const ctx) +{ + int rv = 0; + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + + ctx->done = 1; + + if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) + rv = do_source_change(s); + + return rv; +} + /** * handle resolution change event and end of stream event * returns 1 if reinit was successful, negative if it failed @@ -175,8 +255,7 @@ static int v4l2_start_decode(V4L2Context *ctx) */ static int v4l2_handle_event(V4L2Context *ctx) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); - struct v4l2_format cap_fmt = s->capture.format; + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); struct v4l2_event evt = { 0 }; int ret; @@ -186,44 +265,22 @@ static int v4l2_handle_event(V4L2Context *ctx) return 0; } + av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); + if (evt.type == V4L2_EVENT_EOS) { - ctx->done = 1; +// ctx->done = 1; + av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); return 0; } if (evt.type != V4L2_EVENT_SOURCE_CHANGE) return 0; - ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); - if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); - return 0; - } - - if (v4l2_resolution_changed(&s->capture, &cap_fmt)) { - s->capture.height = v4l2_get_height(&cap_fmt); - s->capture.width = v4l2_get_width(&cap_fmt); - s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - } else { - v4l2_start_decode(ctx); + s->resize_pending = 1; + if (!ctx->done) return 0; - } - - s->reinit = 1; - - if (s->avctx) - ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); - if (ret < 0) - av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); - - ret = ff_v4l2_m2m_codec_reinit(s); - if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); - return AVERROR(EINVAL); - } - /* reinit executed */ - return 1; + return do_source_change(s); } static int v4l2_stop_decode(V4L2Context *ctx) @@ -266,8 +323,26 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } +static int count_in_driver(const V4L2Context * const ctx) +{ + int i; + int n = 0; + + if (!ctx->bufrefs) + return -1; + + for (i = 0; i < ctx->num_buffers; ++i) { + V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_IN_DRIVER) + ++n; + } + return n; +} + static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) { + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); struct v4l2_plane 
planes[VIDEO_MAX_PLANES]; struct v4l2_buffer buf = { 0 }; V4L2Buffer *avbuf; @@ -276,50 +351,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) .fd = ctx_to_m2mctx(ctx)->fd, }; int i, ret; + int no_rx_means_done = 0; - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { + if (is_capture && ctx->bufrefs) { for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) + avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_IN_DRIVER) break; } if (i == ctx->num_buffers) - av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " + av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " "userspace. Increase num_capture_buffers " "to prevent device deadlock or dropped " - "packets/frames.\n"); + "packets/frames.\n", i); } +#if 0 + // I think this is true but pointless + // we will get some other form of EOF signal + /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { + if (is_capture && ctx_to_m2mctx(ctx)->draining) { for (i = 0; i < ctx->num_buffers; i++) { /* capture buffer initialization happens during decode hence * detection happens at runtime */ - if (!ctx->buffers) + if (!ctx->bufrefs) break; - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) + avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_IN_DRIVER) goto start; } ctx->done = 1; return NULL; } +#endif start: - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - pfd.events = POLLOUT | POLLWRNORM; - else { + if (is_capture) { /* no need to listen to requests for more input while draining */ if (ctx_to_m2mctx(ctx)->draining) pfd.events = POLLIN | POLLRDNORM | POLLPRI; + } else { + pfd.events = POLLOUT | POLLWRNORM; } + no_rx_means_done = s->resize_pending && is_capture; for (;;) { - ret = poll(&pfd, 1, timeout); + // If we have a resize pending then all buffers should be Qed + // With a resize pending we should be in drain but evidence suggests + // that not all decoders do this so poll to clear + int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout; + const int e = pfd.events; + + ret = poll(&pfd, 1, t2); + if (ret > 0) break; - if (errno == EINTR) - continue; + + if (ret < 0) { + int err = errno; + if (err == EINTR) + continue; + av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", + err, strerror(err), + e, count_in_driver(ctx)); + return NULL; + } + + // ret == 0 (timeout) + if (no_rx_means_done) { + av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); + ret = ctx_done(ctx); + if (ret > 0) + goto start; + } + if (timeout == -1) + av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; return NULL; } @@ -329,7 +438,8 @@ start: no need to raise a warning */ if (timeout == 0) { for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) + avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status != V4L2BUF_AVAILABLE) av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); } } @@ -347,22 +457,25 @@ start: ctx->done = 1; return NULL; } - if (ret) { - /* if re-init was successful drop the buffer (if there was one) - * since we had to reconfigure capture (unmap all buffers) - */ - return NULL; - } + if (ret > 0) + goto start; } /* 2. 
dequeue the buffer */ if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { - if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { + if (is_capture) { /* there is a capture buffer ready */ if (pfd.revents & (POLLIN | POLLRDNORM)) goto dequeue; + // CAPTURE Q drained + if (no_rx_means_done) { + if (ctx_done(ctx) > 0) + goto start; + return NULL; + } + /* the driver is ready to accept more input; instead of waiting for the capture * buffer to complete we return NULL so input can proceed (we are single threaded) */ @@ -380,37 +493,58 @@ dequeue: buf.m.planes = planes; } - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); - if (ret) { - if (errno != EAGAIN) { - ctx->done = 1; - if (errno != EPIPE) + while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { + const int err = errno; + if (err == EINTR) + continue; + if (err != EAGAIN) { + // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST + if (err != EPIPE || !is_capture) av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", - ctx->name, av_err2str(AVERROR(errno))); + ctx->name, av_err2str(AVERROR(err))); + if (ctx_done(ctx) > 0) + goto start; } return NULL; } + --ctx->q_count; + av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", + ctx->name, buf.index, + buf.timestamp.tv_sec, buf.timestamp.tv_usec, + ctx->q_count, ++ctx->dq_count); - if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { + avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; + avbuf->status = V4L2BUF_AVAILABLE; + avbuf->buf = buf; + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buf.m.planes = avbuf->planes; + } + + if (ctx_to_m2mctx(ctx)->draining && is_capture) { int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? buf.m.planes[0].bytesused : buf.bytesused; if (bytesused == 0) { - ctx->done = 1; + av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); + + // Must reQ so we don't leak + // May not matter if the next thing we do is release all the + // buffers but better to be tidy. 
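            // For reference, this path effectively treats three things as the
            // end-of-stream signal while draining: a 0-byte CAPTURE buffer
            // (handled here), V4L2_BUF_FLAG_LAST on a dequeued buffer (just
            // below), and EPIPE from VIDIOC_DQBUF (handled above).  All of
            // them funnel into ctx_done(), which also runs the deferred
            // source-change handling when resize_pending is set.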
+ ff_v4l2_buffer_enqueue(avbuf); + + if (ctx_done(ctx) > 0) + goto start; return NULL; } #ifdef V4L2_BUF_FLAG_LAST - if (buf.flags & V4L2_BUF_FLAG_LAST) - ctx->done = 1; + if (buf.flags & V4L2_BUF_FLAG_LAST) { + av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); + avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer + ctx_done(ctx); + } #endif } - avbuf = &ctx->buffers[buf.index]; - avbuf->status = V4L2BUF_AVAILABLE; - avbuf->buf = buf; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; - } return avbuf; } @@ -429,8 +563,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) } for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) - return &ctx->buffers[i]; + V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_AVAILABLE) + return avbuf; } return NULL; @@ -438,25 +573,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { - struct v4l2_requestbuffers req = { - .memory = V4L2_MEMORY_MMAP, - .type = ctx->type, - .count = 0, /* 0 -> unmaps buffers from the driver */ - }; - int i, j; + int i; + int ret = 0; + const int fd = ctx_to_m2mctx(ctx)->fd; - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer *buffer = &ctx->buffers[i]; + // Orphan any buffers in the wild + ff_weak_link_break(&ctx->wl_master); + + if (ctx->bufrefs) { + for (i = 0; i < ctx->num_buffers; i++) + av_buffer_unref(ctx->bufrefs + i); + } + + if (fd != -1) { + struct v4l2_requestbuffers req = { + .memory = V4L2_MEMORY_MMAP, + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; + + while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { + if (errno == EINTR) + continue; + + ret = AVERROR(errno); - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + + if (ctx_to_m2mctx(ctx)->output_drm) + av_log(logger(ctx), AV_LOG_ERROR, + "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" + "for all buffers: \n" + " 1. drmModeRmFB(..)\n" + " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); } } + ctx->q_count = 0; - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); + return ret; } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) @@ -485,6 +640,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { + V4L2m2mContext* s = ctx_to_m2mctx(ctx); + V4L2m2mPriv *priv = s->avctx->priv_data; enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; @@ -503,6 +660,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); + if (priv->pix_fmt != AV_PIX_FMT_NONE) { + if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { + fdesc.index++; + continue; + } + } + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ @@ -555,18 +719,73 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ + +static void flush_all_buffers_status(V4L2Context* const ctx) +{ + int i; + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) + buf->status = V4L2BUF_AVAILABLE; + } + ctx->q_count = 0; +} + +static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) +{ + int i; + int rv; + + if (!ctx->bufrefs) { + rv = ff_v4l2_context_init(ctx); + if (rv) { + av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); + return rv; + } + } + + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_AVAILABLE) { + rv = ff_v4l2_buffer_enqueue(buf); + if (rv < 0) + return rv; + } + } + return 0; +} + int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) { int type = ctx->type; int ret; + AVCodecContext * const avctx = logger(ctx); + + ff_mutex_lock(&ctx->lock); + + if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) + stuff_all_buffers(avctx, ctx); ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); - if (ret < 0) - return AVERROR(errno); + if (ret < 0) { + const int err = errno; + av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); + ret = AVERROR(err); + } + else + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); - ctx->streamon = (cmd == VIDIOC_STREAMON); + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } - return 0; + ff_mutex_unlock(&ctx->lock); + + return ret; } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) @@ -594,7 +813,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) return ff_v4l2_buffer_enqueue(avbuf); } -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + const void * extdata, size_t extlen, int no_rescale_pts) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; @@ -602,8 +822,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!pkt->size) { ret = v4l2_stop_decode(ctx); + // Log but otherwise ignore stop failure if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); + av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); s->draining = 1; return 0; } @@ -612,14 +833,14 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); - ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); + ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); if (ret) return ret; return ff_v4l2_buffer_enqueue(avbuf); } -int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) +int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) { V4L2Buffer *avbuf; @@ -636,7 +857,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) return AVERROR(EAGAIN); } - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); + return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); } int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) @@ -695,54 +916,57 @@ void ff_v4l2_context_release(V4L2Context* ctx) { int ret; - if (!ctx->buffers) + if (!ctx->bufrefs) return; ret = v4l2_release_buffers(ctx); if (ret) av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); - av_freep(&ctx->buffers); + av_freep(&ctx->bufrefs); + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); } -int ff_v4l2_context_init(V4L2Context* ctx) + +static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); struct v4l2_requestbuffers req; - int ret, i; - - if (!v4l2_type_supported(ctx)) { - av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); - return AVERROR_PATCHWELCOME; - } - - ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); - if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); + int ret; + int i; memset(&req, 0, sizeof(req)); - req.count = ctx->num_buffers; + req.count = req_buffers; req.memory = V4L2_MEMORY_MMAP; req.type = ctx->type; - ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); - if (ret < 0) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno)); - return AVERROR(errno); + while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { + if (errno != EINTR) { + ret = AVERROR(errno); + av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret)); + return ret; + } } ctx->num_buffers = req.count; - ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); - if (!ctx->buffers) { + ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs)); + if (!ctx->bufrefs) { av_log(logger(ctx), 
AV_LOG_ERROR, "%s malloc enomem\n", ctx->name); - return AVERROR(ENOMEM); + goto fail_release; } - for (i = 0; i < req.count; i++) { - ctx->buffers[i].context = ctx; - ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); - if (ret < 0) { + ctx->wl_master = ff_weak_link_new(ctx); + if (!ctx->wl_master) { + ret = AVERROR(ENOMEM); + goto fail_release; + } + + for (i = 0; i < ctx->num_buffers; i++) { + ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); + if (ret) { av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); - goto error; + goto fail_release; } } @@ -756,10 +980,62 @@ int ff_v4l2_context_init(V4L2Context* ctx) return 0; -error: +fail_release: v4l2_release_buffers(ctx); + av_freep(&ctx->bufrefs); + return ret; +} + +int ff_v4l2_context_init(V4L2Context* ctx) +{ + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + int ret; + + // It is not valid to reinit a context without a previous release + av_assert0(ctx->bufrefs == NULL); + + if (!v4l2_type_supported(ctx)) { + av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); + return AVERROR_PATCHWELCOME; + } + + ff_mutex_init(&ctx->lock, NULL); - av_freep(&ctx->buffers); + if (s->output_drm) { + AVHWFramesContext *hwframes; + + ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref); + if (!ctx->frames_ref) { + ret = AVERROR(ENOMEM); + goto fail_unlock; + } + + hwframes = (AVHWFramesContext*)ctx->frames_ref->data; + hwframes->format = AV_PIX_FMT_DRM_PRIME; + hwframes->sw_format = ctx->av_pix_fmt; + hwframes->width = ctx->width; + hwframes->height = ctx->height; + ret = av_hwframe_ctx_init(ctx->frames_ref); + if (ret < 0) + goto fail_unref_hwframes; + } + + ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); + if (ret) { + ret = AVERROR(errno); + av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret)); + goto fail_unref_hwframes; + } + + ret = create_buffers(ctx, ctx->num_buffers); + if (ret < 0) + goto fail_unref_hwframes; + + return 0; +fail_unref_hwframes: + av_buffer_unref(&ctx->frames_ref); +fail_unlock: + ff_mutex_destroy(&ctx->lock); return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 6f7460c89a..59009d11d1 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -32,6 +32,8 @@ #include "libavutil/rational.h" #include "codec_id.h" #include "packet.h" +#include "libavutil/buffer.h" +#include "libavutil/thread.h" #include "v4l2_buffers.h" typedef struct V4L2Context { @@ -71,11 +73,12 @@ typedef struct V4L2Context { */ int width, height; AVRational sample_aspect_ratio; + struct v4l2_rect selection; /** - * Indexed array of V4L2Buffers + * Indexed array of pointers to V4L2Buffers */ - V4L2Buffer *buffers; + AVBufferRef **bufrefs; /** * Readonly after init. @@ -93,6 +96,12 @@ typedef struct V4L2Context { */ int done; + AVBufferRef *frames_ref; + int q_count; + int dq_count; + struct ff_weak_link_master *wl_master; + + AVMutex lock; } V4L2Context; /** @@ -157,9 +166,12 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) + * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as + * timestamp directly) + * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. 
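 *
 * With no_rescale_pts non-zero the raw V4L2 timestamp (tv_sec * 1000000 +
 * tv_usec) is returned untouched; with 0 it is rescaled back into
 * pkt_timebase (or time_base), e.g. a buffer stamped 2.000000s becomes
 * pts 2000000 in the first case and 180000 with a 1/90000 pkt_timebase.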
*/ -int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); +int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); /** * Enqueues a buffer to a V4L2Context from an AVPacket @@ -171,7 +183,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. */ -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); /** * Enqueues a buffer to a V4L2Context from an AVFrame diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index 602efb7a16..516e6d9858 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -216,13 +216,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); /* 2. unmap the capture buffers (v4l2 and ffmpeg): - * we must wait for all references to be released before being allowed - * to queue new buffers. */ - av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n"); - if (atomic_load(&s->refcount)) - while(sem_wait(&s->refsync) == -1 && errno == EINTR); - ff_v4l2_context_release(&s->capture); /* 3. get the new capture format */ @@ -259,6 +253,8 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) av_frame_free(&s->frame); av_packet_unref(&s->buf_pkt); + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); + av_free(s); } @@ -270,6 +266,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) if (!s) return 0; + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); + + if (av_codec_is_decoder(s->avctx->codec)) + av_packet_unref(&s->buf_pkt); + if (s->fd >= 0) { ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); if (ret) @@ -282,7 +283,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) ff_v4l2_context_release(&s->output); + close(s->fd); + s->fd = -1; + s->self_ref = NULL; + // This is only called on avctx close so after this point we don't have that + // Crash sooner if we find we are using it (can still log with avctx = NULL) + s->avctx = NULL; + priv->context = NULL; av_buffer_unref(&priv->context_ref); return 0; diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 04d86d7b92..24a9c94864 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ #include #include "libavcodec/avcodec.h" +#include "libavutil/pixfmt.h" #include "v4l2_context.h" #define container_of(ptr, type, member) ({ \ @@ -40,6 +41,17 @@ { "num_output_buffers", "Number of buffers in the output context",\ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } +#define FF_V4L2_M2M_TRACK_SIZE 128 +typedef struct V4L2m2mTrackEl { + int discard; // If we see this buffer its been flushed, so discard + int pkt_size; + int64_t pts; + int64_t reordered_opaque; + int64_t pkt_pos; + int64_t pkt_duration; + int64_t track_pts; +} V4L2m2mTrackEl; + typedef struct V4L2m2mContext { char devname[PATH_MAX]; int fd; @@ -53,6 +65,7 @@ typedef struct V4L2m2mContext { sem_t refsync; atomic_uint refcount; int reinit; + int resize_pending; /* null frame/packet received */ int draining; @@ -66,6 +79,23 @@ typedef struct V4L2m2mContext { /* reference back to V4L2m2mPriv */ void *priv; + + AVBufferRef *device_ref; + + /* generate DRM frames */ + int output_drm; + + /* Frame tracking */ + int64_t last_pkt_dts; + int64_t 
last_opaque; + unsigned int track_no; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + + /* req pkt */ + int req_pkt; + + /* Ext data sent */ + int extdata_sent; } V4L2m2mContext; typedef struct V4L2m2mPriv { @@ -76,6 +106,7 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; + enum AVPixelFormat pix_fmt; } V4L2m2mPriv; /** diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 4944d08511..7f6033ac2c 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ #include #include + +#include "libavutil/avassert.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_drm.h" #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" #include "libavutil/opt.h" @@ -30,26 +34,51 @@ #include "codec_internal.h" #include "libavcodec/decode.h" +#include "libavcodec/hwaccels.h" +#include "libavcodec/internal.h" +#include "libavcodec/hwconfig.h" + #include "v4l2_context.h" #include "v4l2_m2m.h" #include "v4l2_fmt.h" +static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) +{ + int ret; + struct v4l2_decoder_cmd cmd = { + .cmd = V4L2_DEC_CMD_START, + .flags = 0, + }; + + if (s->output.streamon) + return 0; + + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); + + if (!s->capture.streamon || ret < 0) + return ret; + + ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); + else + av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); + + return ret; +} + static int v4l2_try_start(AVCodecContext *avctx) { V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; V4L2Context *const capture = &s->capture; - V4L2Context *const output = &s->output; struct v4l2_selection selection = { 0 }; int ret; /* 1. start the output process */ - if (!output->streamon) { - ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); - if (ret < 0) { - av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); - return ret; - } - } + if ((ret = check_output_streamon(avctx, s)) != 0) + return ret; if (capture->streamon) return 0; @@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext *avctx) } /* 2.1 update the AVCodecContext */ - avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); - capture->av_pix_fmt = avctx->pix_fmt; + capture->av_pix_fmt = + ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); + if (s->output_drm) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + avctx->sw_pix_fmt = capture->av_pix_fmt; + } + else + avctx->pix_fmt = capture->av_pix_fmt; /* 3. 
set the crop parameters */ +#if 1 + selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + selection.target = V4L2_SEL_TGT_CROP_DEFAULT; + ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); + av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); +#else selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; selection.r.height = avctx->coded_height; selection.r.width = avctx->coded_width; + av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); - if (!ret) { + av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); + if (1) { ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); if (ret) { av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); @@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext *avctx) capture->width = selection.r.width; } } - - /* 4. init the capture context now that we have the capture format */ - if (!capture->buffers) { - ret = ff_v4l2_context_init(capture); - if (ret) { - av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); - return AVERROR(ENOMEM); - } - } +#endif /* 5. start the capture process */ ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); @@ -133,50 +168,287 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } -static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) +{ + return (int64_t)n; +} + +static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) +{ + return (unsigned int)pts; +} + +// FFmpeg requires us to propagate a number of vars from the coded pkt into +// the decoded frame. The only thing that tracks like that in V4L2 stateful +// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no +// guarantees about PTS being unique or specified for every frame so replace +// the supplied PTS with a simple incrementing number and keep a circular +// buffer of all the things we want preserved (including the original PTS) +// indexed by the tracking no. 
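A worked number makes the ring-buffer indexing described above concrete: with FF_V4L2_M2M_TRACK_SIZE = 128, the 300th coded packet is given track_pts 300 and its metadata lands in slot 300 % 128 = 44; when the driver later returns a capture buffer whose timestamp decodes back to 300, that same slot yields the original PTS, duration and position. A minimal standalone sketch of the arithmetic (simplified names, illustrative values, not the patch code itself):

    /* Sketch: recover per-packet metadata from the timestamp handed back */
    #include <stdio.h>
    #include <inttypes.h>

    #define TRACK_SIZE 128                    /* mirrors FF_V4L2_M2M_TRACK_SIZE */

    typedef struct { int64_t orig_pts, track_pts; } TrackEl;

    int main(void)
    {
        TrackEl ring[TRACK_SIZE] = {{0}};
        unsigned int track_no = 300;          /* the 300th coded packet */

        /* on the way in: stash the real PTS, send the track number as the timestamp */
        ring[track_no % TRACK_SIZE] = (TrackEl){ .orig_pts = 90000, .track_pts = track_no };

        /* on the way out: the driver's timestamp is the track number again */
        const int64_t returned_ts = 300;
        const TrackEl *t = &ring[returned_ts % TRACK_SIZE];   /* slot 300 % 128 == 44 */
        if (t->track_pts == returned_ts)
            printf("recovered original pts %" PRId64 "\n", t->orig_pts);
        return 0;
    }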
+static void +xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) +{ + int64_t track_pts; + + // Avoid 0 + if (++s->track_no == 0) + s->track_no = 1; + + track_pts = track_to_pts(avctx, s->track_no); + + av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); + s->last_pkt_dts = avpkt->dts; + s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, + .pkt_size = avpkt->size, + .pts = avpkt->pts, + .reordered_opaque = avctx->reordered_opaque, + .pkt_pos = avpkt->pos, + .pkt_duration = avpkt->duration, + .track_pts = track_pts + }; + avpkt->pts = track_pts; +} + +// Returns -1 if we should discard the frame +static int +xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) +{ + unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; + const V4L2m2mTrackEl *const t = s->track_els + n; + if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) + { + av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + frame->pts = AV_NOPTS_VALUE; + frame->pkt_dts = s->last_pkt_dts; + frame->reordered_opaque = s->last_opaque; + frame->pkt_pos = -1; + frame->pkt_duration = 0; + frame->pkt_size = -1; + } + else if (!t->discard) + { + frame->pts = t->pts; + frame->pkt_dts = s->last_pkt_dts; + frame->reordered_opaque = t->reordered_opaque; + frame->pkt_pos = t->pkt_pos; + frame->pkt_duration = t->pkt_duration; + frame->pkt_size = t->pkt_size; + + s->last_opaque = s->track_els[n].reordered_opaque; + s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS + } + else + { + av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + return -1; + } + + frame->best_effort_timestamp = frame->pts; + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); + return 0; +} + +static inline int stream_started(const V4L2m2mContext * const s) { + return s->capture.streamon && s->output.streamon; +} + +#define NQ_OK 0 +#define NQ_Q_FULL 1 +#define NQ_SRC_EMPTY 2 +#define NQ_DRAINING 3 +#define NQ_DEAD 4 + +#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) + +// AVERROR_EOF Flushing an already flushed stream +// -ve Error (all errors except EOF are unexpected) +// NQ_OK (0) OK +// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) +// NQ_SRC_EMPTY Src empty (do not retry) +// NQ_DRAINING At EOS, dQ dest until EOS there too +// NQ_DEAD Not running (do not retry, do not attempt capture dQ) + +static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; - V4L2Context *const output = &s->output; int ret; + // If we don't already have a coded packet - get a new one + // We will already have a coded pkt if the output Q was full last time we + // tried to Q it if (!s->buf_pkt.size) { ret = ff_decode_get_packet(avctx, &s->buf_pkt); + + if (ret == AVERROR(EAGAIN)) { + if (!stream_started(s)) { + av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); + return NQ_DEAD; + } + return NQ_SRC_EMPTY; + } + + if (ret == AVERROR_EOF) { + // EOF - enter drain mode + av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", + ret, s->buf_pkt.size, stream_started(s), s->draining); + if (!stream_started(s)) { + av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); + s->draining = 1; + s->capture.done = 1; + return AVERROR_EOF; + } + + if (!s->draining) { + // Calling enqueue with an empty pkt starts drain + av_assert0(s->buf_pkt.size == 0); + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); + return ret; + } + } + return NQ_DRAINING; + } + if (ret < 0) { - if (ret == AVERROR(EAGAIN)) - return ff_v4l2_context_dequeue_frame(capture, frame, 0); - else if (ret != AVERROR_EOF) - return ret; + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); + return ret; } + + xlat_pts_in(avctx, s, &s->buf_pkt); } - if (s->draining) - goto dequeue; + if ((ret = check_output_streamon(avctx, s)) != 0) + return ret; - ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); - if (ret < 0 && ret != AVERROR(EAGAIN)) - goto fail; + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, + avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, + 1); - /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ - if (ret != AVERROR(EAGAIN)) + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet + ret = NQ_Q_FULL; + } + else { + // In all other cases we are done with this packet av_packet_unref(&s->buf_pkt); + s->extdata_sent = 1; - if (!s->draining) { - ret = v4l2_try_start(avctx); if (ret) { - /* cant recover */ - if (ret != AVERROR(ENOMEM)) - ret = 0; - goto fail; + av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); + return ret; + } + } + + // Start if we haven't + { + const int ret2 = v4l2_try_start(avctx); + if (ret2) { + av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); + ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : NQ_DEAD; + } + } + + return ret; +} + +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; + int src_rv; + int dst_rv = 1; // Non-zero (done), non-negative (error) number + + do { + src_rv = try_enqueue_src(avctx, s); + + // If we got a frame last time and we have nothing to enqueue then + // return now. rv will be AVERROR(EAGAIN) indicating that we want more input + // This should mean that once decode starts we enter a stable state where + // we alternately ask for input and produce output + if (s->req_pkt && src_rv == NQ_SRC_EMPTY) + break; + + if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { + av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); + src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue + } + + // Try to get a new frame if + // (a) we haven't already got one AND + // (b) enqueue returned a status indicating that decode should be attempted + if (dst_rv != 0 && TRY_DQ(src_rv)) { + do { + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref + // when discarding + // This returns AVERROR(EAGAIN) if there isn't a frame ready yet + // but there is room in the input Q + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); + + if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", + s->draining, s->capture.done); + else if (dst_rv && dst_rv != AVERROR(EAGAIN)) + av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", + s->draining, s->capture.done, dst_rv); + + // Go again if we got a frame that we need to discard + } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); + } + + // Continue trying to enqueue packets if either + // (a) we succeeded last time OR + // (b) enqueue failed due to input Q full AND there is now room + } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); + + // Ensure that the frame contains nothing if we aren't returning a frame + // (might happen when discarding) + if (dst_rv) + av_frame_unref(frame); + + // If we got a frame this time ask for a pkt next time + s->req_pkt = (dst_rv == 0); + +#if 0 + if (dst_rv == 0) + { + static int z = 0; + if (++z > 50) { + av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); + ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + return -1; } } +#endif + + return dst_rv == 0 ? 0 : + src_rv < 0 ? src_rv : + dst_rv < 0 ? 
dst_rv : + AVERROR(EAGAIN); +} + +#if 0 +#include +static int64_t us_time(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; +} -dequeue: - return ff_v4l2_context_dequeue_frame(capture, frame, -1); -fail: - av_packet_unref(&s->buf_pkt); +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + int ret; + const int64_t now = us_time(); + int64_t done; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ret = v4l2_receive_frame2(avctx, frame); + done = us_time(); + av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); return ret; } +#endif static av_cold int v4l2_decode_init(AVCodecContext *avctx) { @@ -185,6 +457,9 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) V4L2m2mPriv *priv = avctx->priv_data; int ret; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; @@ -205,6 +480,28 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; + /* the client requests the codec to generate DRM frames: + * - data[0] will therefore point to the returned AVDRMFrameDescriptor + * check the ff_v4l2_buffer_to_avframe conversion function. + * - the DRM frame format is passed in the DRM frame descriptor layer. + * check the v4l2_get_drm_frame function. + */ + switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { + default: + s->output_drm = 1; + break; + } + + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); + if (!s->device_ref) { + ret = AVERROR(ENOMEM); + return ret; + } + + ret = av_hwdevice_ctx_init(s->device_ref); + if (ret < 0) + return ret; + s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); if (ret) { @@ -217,7 +514,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) static av_cold int v4l2_decode_close(AVCodecContext *avctx) { - return ff_v4l2_m2m_codec_end(avctx->priv_data); + int rv; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + rv = ff_v4l2_m2m_codec_end(avctx->priv_data); + av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv); + return rv; +} + +static void v4l2_decode_flush(AVCodecContext *avctx) +{ + // An alternatve and more drastic form of flush is to simply do this: + // v4l2_decode_close(avctx); + // v4l2_decode_init(avctx); + // The downside is that this keeps a decoder open until all the frames + // associated with it have been returned. This is a bit wasteful on + // possibly limited h/w resources and fails on a Pi for this reason unless + // more GPU mem is allocated than is the default. 
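The in-place flush below leans on the tracking table shared with xlat_pts_out(): every element is marked discard = 1, so any frame the hardware still emits after the flush resolves to a flagged slot and is dropped rather than surfacing with stale timestamps, while packets queued after the flush re-initialise their slot with discard = 0 in xlat_pts_in() and so pass through normally. In miniature, purely as an illustration of that interaction (not the patch code):

    /* Sketch: flush marks every slot; the dequeue path then drops flagged frames */
    #include <stdio.h>

    #define TRACK_SIZE 128

    struct track_el { int discard; };
    static struct track_el track_els[TRACK_SIZE];

    static void flush_marks_all(void)            /* what v4l2_decode_flush() does */
    {
        for (int i = 0; i != TRACK_SIZE; ++i)
            track_els[i].discard = 1;
    }

    static int frame_wanted(unsigned int slot)   /* what xlat_pts_out() checks */
    {
        return !track_els[slot % TRACK_SIZE].discard;
    }

    int main(void)
    {
        flush_marks_all();
        printf("pre-flush frame kept? %d\n", frame_wanted(44));   /* 0: dropped */
        track_els[44].discard = 0;      /* as xlat_pts_in() does for a new packet */
        printf("post-flush frame kept? %d\n", frame_wanted(44));  /* 1: kept */
        return 0;
    }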
+ + V4L2m2mPriv * const priv = avctx->priv_data; + V4L2m2mContext * const s = priv->context; + V4L2Context * const output = &s->output; + V4L2Context * const capture = &s->capture; + int ret, i; + + av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); + + // Reflushing everything is benign, quick and avoids having to worry about + // states like EOS processing so don't try to optimize out (having got it + // wrong once) + + ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); + + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear + for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) + s->track_els[i].discard = 1; + + // resend extradata + s->extdata_sent = 0; + // clear EOS status vars + s->draining = 0; + output->done = 0; + capture->done = 0; + + // Stream on will occur when we actually submit a new frame + av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); } #define OFFSET(x) offsetof(V4L2m2mPriv, x) @@ -227,9 +570,15 @@ static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, + { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, { NULL}, }; +static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + HW_CONFIG_INTERNAL(DRM_PRIME), + NULL +}; + #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ @@ -250,11 +599,16 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \ .close = v4l2_decode_close, \ + .flush = v4l2_decode_flush, \ .bsfs = bsf_name, \ .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE | \ FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ .p.wrapper_name = "v4l2m2m", \ + .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ + AV_PIX_FMT_NV12, \ + AV_PIX_FMT_NONE}, \ + .hw_configs = v4l2_m2m_hw_configs, \ } M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb"); -- 2.43.0 From 1c741b868d12290f38c3c9b319d1375929e13e28 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 18:46:21 +0100 Subject: [PATCH 016/157] Fix crash in hw_device_default_name if type not found (NONE) --- fftools/ffmpeg_hw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c index 88fa782470..740a5e7153 100644 --- a/fftools/ffmpeg_hw.c +++ b/fftools/ffmpeg_hw.c @@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type) char *name; size_t index_pos; int index, index_limit = 1000; + if (!type_name) + return NULL; index_pos = strlen(type_name); name = av_malloc(index_pos + 4); if (!name) -- 2.43.0 From 40c28cb25239f476787cb798b903ac552837d88c Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 18:59:18 +0100 Subject: [PATCH 017/157] Allow v4l2m2m to select non-drm_prime output formats --- libavcodec/v4l2_buffers.c | 2 +- libavcodec/v4l2_m2m_dec.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 
a003934ca1..1ca1128db6 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -524,7 +524,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) offset += dst_stride * out->context->height; } if (offset > out->plane_info[0].length) { - av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); + av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); return -1; } diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 7f6033ac2c..a4b5a4e7e9 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -455,10 +455,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) V4L2Context *capture, *output; V4L2m2mContext *s; V4L2m2mPriv *priv = avctx->priv_data; + int gf_pix_fmt; int ret; av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) @@ -486,10 +486,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * - the DRM frame format is passed in the DRM frame descriptor layer. * check the v4l2_get_drm_frame function. */ - switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { - default: + + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); + av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", + avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + + s->output_drm = 0; + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; s->output_drm = 1; - break; } s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); @@ -607,6 +612,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { .p.wrapper_name = "v4l2m2m", \ .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ AV_PIX_FMT_NV12, \ + AV_PIX_FMT_YUV420P, \ AV_PIX_FMT_NONE}, \ .hw_configs = v4l2_m2m_hw_configs, \ } -- 2.43.0 From b831aabd77bd809202b2bcf85802ef55bae95f36 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 18:59:38 +0100 Subject: [PATCH 018/157] Fix YUV420P output from v4l2m2m Also put get_width get_height inlines in header as they are generally useful. 
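The YUV420P fix in this patch derives all three plane pointers from a single contiguous capture buffer, using only linesize[0] and the format height returned by ff_v4l2_get_format_height(). With assumed numbers (a 1920x1088 buffer with bytesperline 1920, illustrative rather than taken from any particular driver) the offsets work out as in this sketch:

    /* Sketch: plane offsets in a single-buffer YUV420P capture frame */
    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        const size_t stride = 1920, height = 1088;          /* assumed buffer geometry */

        const size_t y_size = stride * height;              /* 2,088,960 bytes */
        const size_t u_size = (stride / 2) * (height / 2);  /*   522,240 bytes */

        /* data[1] = data[0] + linesize[0] * height                              */
        /* data[2] = data[1] + linesize[1] * height / 2, with linesize[1] = stride / 2 */
        printf("U plane offset %zu, V plane offset %zu\n", y_size, y_size + u_size);
        return 0;
    }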
--- libavcodec/v4l2_buffers.c | 12 ++++++------ libavcodec/v4l2_context.c | 22 ++++++---------------- libavcodec/v4l2_m2m.h | 12 ++++++++++++ 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 1ca1128db6..f4c11ca8d0 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -425,17 +425,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) case AV_PIX_FMT_NV21: if (avbuf->num_planes > 1) break; - frame->linesize[1] = avbuf->plane_info[0].bytesperline; - frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; + frame->linesize[1] = frame->linesize[0]; + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); break; case AV_PIX_FMT_YUV420P: if (avbuf->num_planes > 1) break; - frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1; - frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1; - frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; - frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2); + frame->linesize[1] = frame->linesize[0] / 2; + frame->linesize[2] = frame->linesize[1]; + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); + frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; break; default: diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index be76068af3..6fe2586627 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -55,16 +55,6 @@ static inline AVCodecContext *logger(V4L2Context *ctx) return ctx_to_m2mctx(ctx)->avctx; } -static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) -{ - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -} - -static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) -{ - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -} - static AVRational v4l2_get_sar(V4L2Context *ctx) { struct AVRational sar = { 0, 1 }; @@ -96,8 +86,8 @@ static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2 if (ret) av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", ctx->name, - v4l2_get_width(fmt1), v4l2_get_height(fmt1), - v4l2_get_width(fmt2), v4l2_get_height(fmt2)); + ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), + ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); return ret; } @@ -195,8 +185,8 @@ static int do_source_change(V4L2m2mContext * const s) reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); if (reinit) { - s->capture.height = v4l2_get_height(&cap_fmt); - s->capture.width = v4l2_get_width(&cap_fmt); + s->capture.height = ff_v4l2_get_format_height(&cap_fmt); + s->capture.width = ff_v4l2_get_format_width(&cap_fmt); } s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); @@ -973,8 +963,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), req.count, - v4l2_get_width(&ctx->format), - v4l2_get_height(&ctx->format), + ff_v4l2_get_format_width(&ctx->format), + ff_v4l2_get_format_height(&ctx->format), V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 24a9c94864..8f054f2f50 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -160,4 +160,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); + +static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; +} + +static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; +} + + #endif /* AVCODEC_V4L2_M2M_H */ -- 2.43.0 From 4328b57fba21b6418afb21ae29782c316d643adc Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 19:23:44 +0100 Subject: [PATCH 019/157] Report buffer overflows in v4l2m2m --- libavcodec/v4l2_buffers.c | 14 ++++++++++---- libavcodec/v4l2_context.c | 5 ++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index f4c11ca8d0..de31f7ced9 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -364,6 +364,7 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) { unsigned int bytesused, length; + int rv = 0; if (plane >= out->num_planes) return AVERROR(EINVAL); @@ -371,11 +372,16 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i length = out->plane_info[plane].length; bytesused = FFMIN(size+offset, length); - memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); + if (size > length - offset) { + size = length - offset; + rv = AVERROR(ENOMEM); + } + + memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); set_buf_length(out, plane, bytesused, length); - return 0; + return rv; } static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) @@ -630,7 +636,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, } ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); - if (ret) + if (ret && ret != AVERROR(ENOMEM)) return ret; v4l2_set_pts(out, pkt->pts, no_rescale_pts); @@ -638,7 +644,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, if (pkt->flags & AV_PKT_FLAG_KEY) out->flags = V4L2_BUF_FLAG_KEYFRAME; - return 0; + return ret; } int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 6fe2586627..81aced0c2b 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -824,7 +824,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, return AVERROR(EAGAIN); ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); - if (ret) + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + 
__func__, pkt->size, avbuf->planes[0].length); + else if (ret) return ret; return ff_v4l2_buffer_enqueue(avbuf); -- 2.43.0 From 3404c8657375fd998099c6f4c5cb37cb7159d1df Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 14 Jun 2021 11:55:16 +0100 Subject: [PATCH 020/157] Increase V4L2 H264 stateful coded buffer size Try to set a min size of frame size / 2 for bitbuffers passed to V4l2. This fixes a few streams that have large I-frames. You would hope Annex-A gave useful minCR so an appropriate size could be calculated but it doesn't really. It gives good guidance for bits required over time but the instantaneous limits are very weak so it is possible that even this won't be enough. The correct long term solution would be to have resizable dmabufs but that is a greter rewrite than seems sensible now. --- libavcodec/v4l2_context.c | 24 +++++++++++++++++++++++- libavcodec/v4l2_context.h | 6 ++++++ libavcodec/v4l2_m2m_dec.c | 24 ++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 81aced0c2b..a17ae027a6 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -902,7 +902,29 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + int ret; + + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + if (ret != 0) + return ret; + + // Check returned size against min size and if smaller have another go + // Only worry about plane[0] as this is meant to enforce limits for + // encoded streams where we might know a bit more about the shape + // than the driver + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { + if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage) + return 0; + ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size; + } + else { + if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage) + return 0; + ctx->format.fmt.pix.sizeimage = ctx->min_buf_size; + } + + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + return ret; } void ff_v4l2_context_release(V4L2Context* ctx) diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 59009d11d1..37b0431400 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -75,6 +75,12 @@ typedef struct V4L2Context { AVRational sample_aspect_ratio; struct v4l2_rect selection; + /** + * If the default size of buffer is less than this then try to + * set to this. + */ + uint32_t min_buf_size; + /** * Indexed array of pointers to V4L2Buffers */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index a4b5a4e7e9..1851acbc93 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -450,6 +450,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } #endif +static uint32_t max_coded_size(const AVCodecContext * const avctx) +{ + uint32_t wxh = avctx->coded_width * avctx->coded_height; + uint32_t size; + + // Currently the only thing we try to set our own limits for is H264 + if (avctx->codec_id != AV_CODEC_ID_H264) + return 0; + + size = wxh * 3 / 2; + // H.264 Annex A table A-1 gives minCR which is either 2 or 4 + // unfortunately that doesn't yield an actually useful limit + // and it should be noted that frame 0 is special cased to allow + // a bigger number which really isn't helpful for us. 
So just pick + // frame_size / 2 + size /= 2; + // Add 64k to allow for any overheads and/or encoder hopefulness + // with small WxH + return size + (1 << 16); +} + static av_cold int v4l2_decode_init(AVCodecContext *avctx) { V4L2Context *capture, *output; @@ -460,6 +481,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; @@ -476,9 +498,11 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; /* the client requests the codec to generate DRM frames: * - data[0] will therefore point to the returned AVDRMFrameDescriptor -- 2.43.0 From 487bf42ffa2df39e2076bb294336fec2b8a01540 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 28 Jun 2021 12:13:35 +0100 Subject: [PATCH 021/157] Fix raw video s.t. it respects any remaining cropping This fixes the long standing CONFWIN_A conformance test failure for drm. --- libavcodec/rawenc.c | 32 ++++++++--- libavutil/hwcontext_drm.c | 112 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 130 insertions(+), 14 deletions(-) diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c index 594a77c42a..8ca0379e12 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c @@ -124,32 +124,41 @@ static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - const AVFrame *frame, int *got_packet) + const AVFrame *src_frame, int *got_packet) { int ret; + AVFrame * frame = NULL; #if CONFIG_SAND - if (av_rpi_is_sand_frame(frame)) { - ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : - av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : - av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1; + if (av_rpi_is_sand_frame(src_frame)) { + ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) : + av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) : + av_rpi_is_sand30_frame(src_frame) ? 
raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1; *got_packet = (ret == 0); return ret; } #endif + if ((frame = av_frame_clone(src_frame)) == NULL) { + ret = AVERROR(ENOMEM); + goto fail; + } + + if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0) + goto fail; + ret = av_image_get_buffer_size(frame->format, frame->width, frame->height, 1); if (ret < 0) - return ret; + goto fail; if ((ret = ff_get_encode_buffer(avctx, pkt, ret, 0)) < 0) - return ret; + goto fail; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, (const uint8_t **)frame->data, frame->linesize, frame->format, frame->width, frame->height, 1)) < 0) - return ret; + goto fail; if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && frame->format == AV_PIX_FMT_YUYV422) { @@ -165,8 +174,15 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16); } } + pkt->flags |= AV_PKT_FLAG_KEY; + av_frame_free(&frame); *got_packet = 1; return 0; + +fail: + av_frame_free(&frame); + *got_packet = 0; + return ret; } const FFCodec ff_rawvideo_encoder = { diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c index 7a9fdbd263..baf18920fa 100644 --- a/libavutil/hwcontext_drm.c +++ b/libavutil/hwcontext_drm.c @@ -21,6 +21,7 @@ #include #include #include +#include /* This was introduced in version 4.6. And may not exist all without an * optional package. So to prevent a hard dependency on needing the Linux @@ -31,6 +32,7 @@ #endif #include +#include #include #include "avassert.h" @@ -38,7 +40,9 @@ #include "hwcontext_drm.h" #include "hwcontext_internal.h" #include "imgutils.h" - +#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif static void drm_device_free(AVHWDeviceContext *hwdev) { @@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, AVDRMDeviceContext *hwctx = hwdev->hwctx; drmVersionPtr version; + if (device == NULL) { + hwctx->fd = -1; + return 0; + } + hwctx->fd = open(device, O_RDWR); if (hwctx->fd < 0) return AVERROR(errno); @@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc, if (flags & AV_HWFRAME_MAP_WRITE) mmap_prot |= PROT_WRITE; + if (dst->format == AV_PIX_FMT_NONE) + dst->format = hwfc->sw_format; #if HAVE_LINUX_DMA_BUF_H if (flags & AV_HWFRAME_MAP_READ) map->sync_flags |= DMA_BUF_SYNC_READ; @@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, dst->width = src->width; dst->height = src->height; + dst->crop_top = src->crop_top; + dst->crop_bottom = src->crop_bottom; + dst->crop_left = src->crop_left; + dst->crop_right = src->crop_right; + +#if CONFIG_SAND + // Rework for sand frames + if (av_rpi_is_sand_frame(dst)) { + // As it stands the sand formats hold stride2 in linesize[3] + // linesize[0] & [1] contain stride1 which is always 128 for everything we do + // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] + dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); + dst->linesize[0] = 128; + dst->linesize[1] = 128; + // *** Are we sure src->height is actually what we want ??? + } +#endif err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &drm_unmap_frame, map); @@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, if (!pix_fmts) return AVERROR(ENOMEM); - pix_fmts[0] = ctx->sw_format; + // **** Offer native sand too ???? + pix_fmts[0] = +#if CONFIG_SAND + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? 
+ AV_PIX_FMT_YUV420P : + ctx->sw_format == AV_PIX_FMT_RPI4_10 ? + AV_PIX_FMT_YUV420P10LE : +#endif + ctx->sw_format; pix_fmts[1] = AV_PIX_FMT_NONE; *formats = pix_fmts; @@ -231,18 +267,79 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, map = av_frame_alloc(); if (!map) return AVERROR(ENOMEM); - map->format = dst->format; + // Map to default + map->format = AV_PIX_FMT_NONE; err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); if (err) goto fail; - map->width = dst->width; - map->height = dst->height; +#if 0 + av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, + hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, + map->width, map->height, + map->linesize[0], + map->linesize[1], + map->linesize[2], + map->linesize[3], + dst->width, dst->height, + dst->linesize[0], + dst->linesize[1], + dst->linesize[2]); +#endif +#if CONFIG_SAND + if (av_rpi_is_sand_frame(map)) { + // Preserve crop - later ffmpeg code assumes that we have in that it + // overwrites any crop that we create with the old values + const unsigned int w = FFMIN(dst->width, map->width); + const unsigned int h = FFMIN(dst->height, map->height); + + if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { + av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], + map->data[0], + 128, stride2, + 0, 0, w, h); + av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + map->data[1], + 128, stride2, + 0, 0, w / 2, h / 2); + } + else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { + av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], + map->data[0], + 128, stride2, + 0, 0, w, h); + av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + map->data[1], + 128, stride2, + 0, 0, w / 2, h / 2); + } + else + { + av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); + err = AVERROR(EINVAL); + goto fail; + } + + dst->width = w; + dst->height = h; + } + else +#endif + { + // Kludge mapped h/w s.t. 
frame_copy works + map->width = dst->width; + map->height = dst->height; + err = av_frame_copy(dst, map); + } - err = av_frame_copy(dst, map); if (err) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); goto fail; + } err = 0; fail: @@ -257,7 +354,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, int err; if (src->width > hwfc->width || src->height > hwfc->height) + { + av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); return AVERROR(EINVAL); + } map = av_frame_alloc(); if (!map) -- 2.43.0 From 02bd1f549d2f97bd5efa634adb8729a0b9f4bc8d Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 13 Aug 2021 15:38:28 +0100 Subject: [PATCH 022/157] Set frame interlace from V4L2 buffer field --- libavcodec/v4l2_buffers.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index de31f7ced9..97b8eb1db3 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -222,6 +222,16 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) return AVCOL_TRC_UNSPECIFIED; } +static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) +{ + return V4L2_FIELD_IS_INTERLACED(buf->buf.field); +} + +static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) +{ + return buf->buf.field == V4L2_FIELD_INTERLACED_TB; +} + static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) { AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; @@ -576,6 +586,8 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc frame->color_trc = v4l2_get_color_trc(avbuf); frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); frame->pkt_dts = AV_NOPTS_VALUE; + frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); + frame->top_field_first = v4l2_buf_is_top_first(avbuf); /* these values are updated also during re-init in v4l2_process_driver_event */ frame->height = ctx->height; -- 2.43.0 From 5b0827580d7b01660f97d16485a14894d9246a85 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 13 Aug 2021 16:11:53 +0100 Subject: [PATCH 023/157] Fix V4L2 stateful to avoid crash if flush before start --- libavcodec/v4l2_context.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index a17ae027a6..eb901e8fab 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -713,6 +713,10 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) static void flush_all_buffers_status(V4L2Context* const ctx) { int i; + + if (!ctx->bufrefs) + return; + for (i = 0; i < ctx->num_buffers; ++i) { struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; if (buf->status == V4L2BUF_IN_DRIVER) -- 2.43.0 From 11eadd825a741b02845ed651a478575df2dbe029 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 9 Sep 2021 17:44:13 +0100 Subject: [PATCH 024/157] Copy properties from frame to v4l2 buffer Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that ff_v4l2_buffer_buf_to_avframe copies --- libavcodec/v4l2_buffers.c | 126 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 97b8eb1db3..126d2a17f4 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -128,6 +128,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) return AVCOL_PRI_UNSPECIFIED; } +static void v4l2_set_color(V4L2Buffer *buf, + const enum 
AVColorPrimaries avcp, + const enum AVColorSpace avcs, + const enum AVColorTransferCharacteristic avxc) +{ + enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; + enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; + enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; + + switch (avcp) { + case AVCOL_PRI_BT709: + cs = V4L2_COLORSPACE_REC709; + ycbcr = V4L2_YCBCR_ENC_709; + break; + case AVCOL_PRI_BT470M: + cs = V4L2_COLORSPACE_470_SYSTEM_M; + ycbcr = V4L2_YCBCR_ENC_601; + break; + case AVCOL_PRI_BT470BG: + cs = V4L2_COLORSPACE_470_SYSTEM_BG; + break; + case AVCOL_PRI_SMPTE170M: + cs = V4L2_COLORSPACE_SMPTE170M; + break; + case AVCOL_PRI_SMPTE240M: + cs = V4L2_COLORSPACE_SMPTE240M; + break; + case AVCOL_PRI_BT2020: + cs = V4L2_COLORSPACE_BT2020; + break; + case AVCOL_PRI_SMPTE428: + case AVCOL_PRI_SMPTE431: + case AVCOL_PRI_SMPTE432: + case AVCOL_PRI_EBU3213: + case AVCOL_PRI_RESERVED: + case AVCOL_PRI_FILM: + case AVCOL_PRI_UNSPECIFIED: + default: + break; + } + + switch (avcs) { + case AVCOL_SPC_RGB: + cs = V4L2_COLORSPACE_SRGB; + break; + case AVCOL_SPC_BT709: + cs = V4L2_COLORSPACE_REC709; + break; + case AVCOL_SPC_FCC: + cs = V4L2_COLORSPACE_470_SYSTEM_M; + break; + case AVCOL_SPC_BT470BG: + cs = V4L2_COLORSPACE_470_SYSTEM_BG; + break; + case AVCOL_SPC_SMPTE170M: + cs = V4L2_COLORSPACE_SMPTE170M; + break; + case AVCOL_SPC_SMPTE240M: + cs = V4L2_COLORSPACE_SMPTE240M; + break; + case AVCOL_SPC_BT2020_CL: + cs = V4L2_COLORSPACE_BT2020; + ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; + break; + case AVCOL_SPC_BT2020_NCL: + cs = V4L2_COLORSPACE_BT2020; + break; + default: + break; + } + + switch (xfer) { + case AVCOL_TRC_BT709: + xfer = V4L2_XFER_FUNC_709; + break; + case AVCOL_TRC_IEC61966_2_1: + xfer = V4L2_XFER_FUNC_SRGB; + break; + case AVCOL_TRC_SMPTE240M: + xfer = V4L2_XFER_FUNC_SMPTE240M; + break; + case AVCOL_TRC_SMPTE2084: + xfer = V4L2_XFER_FUNC_SMPTE2084; + break; + default: + break; + } + + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { + buf->context->format.fmt.pix_mp.colorspace = cs; + buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; + buf->context->format.fmt.pix_mp.xfer_func = xfer; + } else { + buf->context->format.fmt.pix.colorspace = cs; + buf->context->format.fmt.pix.ycbcr_enc = ycbcr; + buf->context->format.fmt.pix.xfer_func = xfer; + } +} + static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) { enum v4l2_quantization qt; @@ -146,6 +245,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) return AVCOL_RANGE_UNSPECIFIED; } +static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) +{ + const enum v4l2_quantization q = + avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : + avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : + V4L2_QUANTIZATION_DEFAULT; + + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { + buf->context->format.fmt.pix_mp.quantization = q; + } else { + buf->context->format.fmt.pix.quantization = q; + } +} + static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) { enum v4l2_ycbcr_encoding ycbcr; @@ -232,6 +345,12 @@ static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) return buf->buf.field == V4L2_FIELD_INTERLACED_TB; } +static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) +{ + buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : + is_tff ? 
V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; +} + static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) { AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; @@ -561,7 +680,14 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { + out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); + // Beware that colour info is held in format rather than the actual + // v4l2 buffer struct so this may not be as useful as you might hope + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars v4l2_set_pts(out, frame->pts, 0); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); return v4l2_buffer_swframe_to_buf(frame, out); } -- 2.43.0 From 00b41c1078ba91f2a50eca6ae031753be7f23a3f Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 16:49:01 +0000 Subject: [PATCH 025/157] ffmpeg: Do not inc DTS on no decode output V4L2 H264 decode has long latency and sometimes spits out a long stream of output without input. In this case incrementing DTS is wrong. There may be cases where the condition as written is correct so only "fix" in the cases which cause problems --- fftools/ffmpeg.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 7194630162..04bea4ef4f 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -2612,7 +2612,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo case AVMEDIA_TYPE_VIDEO: ret = decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt, &decode_failed); - if (!repeating || !pkt || got_output) { + // Pi: Do not inc dts if no_cvt_hw set + // V4L2 H264 decode has long latency and sometimes spits out a long + // stream of output without input. In this case incrementing DTS is wrong. + // There may be cases where the condition as written is correct so only + // "fix" in the cases which cause problems + if (!repeating || !pkt || (got_output && !no_cvt_hw)) { if (pkt && pkt->duration) { duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { -- 2.43.0 From 1ca93f947fba143df718f4a4f743cadde2664ee6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 17:32:59 +0000 Subject: [PATCH 026/157] v4l2_m2m_dec: Adjust timebase if H264 Adjust AVCodecContext time_base if H264 in the same way that the software decoder does. 
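The adjustment amounts to one halving of the tick duration: a stream that arrives with time_base = 1/25 and ticks_per_frame = 1 leaves init with time_base = 1/50 and ticks_per_frame = 2, so a frame still spans 2 x 1/50 = 1/25 s but field-based timing can now be expressed, matching the native H.264 software decoder. A worked sketch with those assumed input values (the patch additionally falls back to halving time_base.num when doubling the denominator would overflow INT_MAX):

    /* Sketch: the H264 timebase adjustment with assumed input values */
    #include <stdio.h>

    int main(void)
    {
        int tb_num = 1, tb_den = 25, ticks_per_frame = 1;   /* assumed 25 fps input */

        if (ticks_per_frame == 1)
            tb_den *= 2;                /* 1/25 -> 1/50: one tick per field */
        ticks_per_frame = 2;

        /* frame duration is still ticks_per_frame * time_base = 2/50 = 1/25 s */
        printf("time_base=%d/%d ticks_per_frame=%d\n", tb_num, tb_den, ticks_per_frame);
        return 0;
    }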
--- libavcodec/v4l2_m2m_dec.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 1851acbc93..aa1e5c1597 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -481,6 +481,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + if (avctx->codec_id == AV_CODEC_ID_H264) { + if (avctx->ticks_per_frame == 1) { + if(avctx->time_base.den < INT_MAX/2) { + avctx->time_base.den *= 2; + } else + avctx->time_base.num /= 2; + } + avctx->ticks_per_frame = 2; + } + av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) -- 2.43.0 From 0dd3294ecaa3a233747005174362074f2d90fa88 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 17:38:27 +0000 Subject: [PATCH 027/157] v4l2_m2m_dec: Produce best guess PTSs if none supplied Filter scheduling gets confused by missing PTSs and makes poor guesses more often than not. Try to generate plausible timestamps where we are missing them. --- libavcodec/v4l2_m2m.h | 12 ++++++++ libavcodec/v4l2_m2m_dec.c | 64 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 8f054f2f50..82feb0afdb 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -52,6 +52,16 @@ typedef struct V4L2m2mTrackEl { int64_t track_pts; } V4L2m2mTrackEl; +typedef struct pts_stats_s +{ + void * logctx; + const char * name; // For debug + unsigned int last_count; + unsigned int last_interval; + int64_t last_pts; + int64_t guess; +} pts_stats_t; + typedef struct V4L2m2mContext { char devname[PATH_MAX]; int fd; @@ -91,6 +101,8 @@ typedef struct V4L2m2mContext { unsigned int track_no; V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + pts_stats_t pts_stat; + /* req pkt */ int req_pkt; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index aa1e5c1597..a5a2afbd27 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -42,6 +42,62 @@ #include "v4l2_m2m.h" #include "v4l2_fmt.h" +// Pick 64 for max last count - that is >1sec at 60fps +#define STATS_LAST_COUNT_MAX 64 +#define STATS_INTERVAL_MAX (1 << 30) + +static int64_t pts_stats_guess(const pts_stats_t * const stats) +{ + if (stats->last_pts == AV_NOPTS_VALUE || + stats->last_interval == 0 || + stats->last_count >= STATS_LAST_COUNT_MAX) + return AV_NOPTS_VALUE; + return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; +} + +static void pts_stats_add(pts_stats_t * const stats, int64_t pts) +{ + if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { + if (stats->last_count < STATS_LAST_COUNT_MAX) + ++stats->last_count; + return; + } + + if (stats->last_pts != AV_NOPTS_VALUE) { + const int64_t interval = pts - stats->last_pts; + + if (interval < 0 || interval >= STATS_INTERVAL_MAX || + stats->last_count >= STATS_LAST_COUNT_MAX) { + if (stats->last_interval != 0) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", + __func__, stats->name, interval, stats->last_count); + stats->last_interval = 0; + } + else { + const int64_t frame_time = interval / (int64_t)stats->last_count; + + if (frame_time != stats->last_interval) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", + __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); + stats->last_interval = frame_time; + } + 
} + + stats->last_pts = pts; + stats->last_count = 1; +} + +static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) +{ + *stats = (pts_stats_t){ + .logctx = logctx, + .name = name, + .last_count = 1, + .last_interval = 0, + .last_pts = AV_NOPTS_VALUE + }; +} + static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) { int ret; @@ -244,9 +300,11 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons return -1; } - frame->best_effort_timestamp = frame->pts; + pts_stats_add(&s->pts_stat, frame->pts); + + frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat); frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? - av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); + av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); return 0; } @@ -496,6 +554,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) if (ret < 0) return ret; + pts_stats_init(&s->pts_stat, avctx, "decoder"); + capture = &s->capture; output = &s->output; -- 2.43.0 From bb0923ae0415c7f5209987e24439ae3deb7ea144 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 17:59:27 +0000 Subject: [PATCH 028/157] v4l2_m2m_dec: Try harder to get an initial frame If the input Q is full then wait on a short timeout for a capture frame rather than stuffing yet still another frame into the input if we could do that first. This attempts to restrict the sometimes daft initial buffering that ends up confusing the rest of the system. --- libavcodec/v4l2_context.c | 2 +- libavcodec/v4l2_m2m_dec.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index eb901e8fab..ee5dc7b8d4 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -381,7 +381,7 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) start: if (is_capture) { /* no need to listen to requests for more input while draining */ - if (ctx_to_m2mctx(ctx)->draining) + if (ctx_to_m2mctx(ctx)->draining || timeout > 0) pfd.events = POLLIN | POLLRDNORM | POLLPRI; } else { pfd.events = POLLOUT | POLLWRNORM; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index a5a2afbd27..b49f470c0a 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -442,7 +442,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) // when discarding // This returns AVERROR(EAGAIN) if there isn't a frame ready yet // but there is room in the input Q - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1); if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -- 2.43.0 From 5ded6fb6234deecd979bb0e251bb9cbd13952936 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 18:04:56 +0000 Subject: [PATCH 029/157] Add a V4L2 M2M deinterlace filter Add a V4L2 deinterlace filter that will accept DRMPRIME frames. Multiple people have contributed to this: Jernej Skrabec Alex Bee popcornmix John Cox There is an unknown delay through the filter of typically one or three fields which translates to 1 or 2 frames. Frames that are delayed are lost at end of stream as the V4L2 filter has no flush control. 
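Following the usual ff_vf_<name> registration convention, the entry added to allfilters.c should expose this filter to users under the name "deinterlace_v4l2m2m", so it can be named in a filter graph like any other libavfilter deinterlacer once DRM_PRIME frames are being produced upstream.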
--- libavcodec/v4l2_context.c | 4 +- libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/vf_deinterlace_v4l2m2m.c | 1269 ++++++++++++++++++++++++++ 4 files changed, 1273 insertions(+), 2 deletions(-) create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index ee5dc7b8d4..440dfaaba5 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -498,10 +498,10 @@ dequeue: return NULL; } --ctx->q_count; - av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", + av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n", ctx->name, buf.index, buf.timestamp.tv_sec, buf.timestamp.tv_usec, - ctx->q_count, ++ctx->dq_count); + ctx->q_count, ++ctx->dq_count, buf.field); avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; avbuf->status = V4L2BUF_AVAILABLE; diff --git a/libavfilter/Makefile b/libavfilter/Makefile index c14fc995a0..0e7b5856bd 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -262,6 +262,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_vpp_qsv.o OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o +OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index b990a00152..357ff61ca8 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -248,6 +248,7 @@ extern const AVFilter ff_vf_derain; extern const AVFilter ff_vf_deshake; extern const AVFilter ff_vf_deshake_opencl; extern const AVFilter ff_vf_despill; +extern const AVFilter ff_vf_deinterlace_v4l2m2m; extern const AVFilter ff_vf_detelecine; extern const AVFilter ff_vf_dilation; extern const AVFilter ff_vf_dilation_opencl; diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c new file mode 100644 index 0000000000..1a933b7e0a --- /dev/null +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -0,0 +1,1269 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * deinterlace video filter - V4L2 M2M + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_drm.h" +#include "libavutil/internal.h" +#include "libavutil/mathematics.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/time.h" + +#define FF_INTERNAL_FIELDS 1 +#include "framequeue.h" +#include "filters.h" +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +typedef struct V4L2Queue V4L2Queue; +typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; + +typedef struct V4L2PlaneInfo { + int bytesperline; + size_t length; +} V4L2PlaneInfo; + +typedef struct V4L2Buffer { + int enqueued; + int reenqueue; + int fd; + struct v4l2_buffer buffer; + AVFrame frame; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + int num_planes; + V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; + AVDRMFrameDescriptor drm_frame; + V4L2Queue *q; +} V4L2Buffer; + +typedef struct V4L2Queue { + struct v4l2_format format; + int num_buffers; + V4L2Buffer *buffers; + DeintV4L2M2MContextShared *ctx; +} V4L2Queue; + +typedef struct pts_stats_s +{ + void * logctx; + const char * name; // For debug + unsigned int last_count; + unsigned int last_interval; + int64_t last_pts; +} pts_stats_t; + +#define PTS_TRACK_SIZE 32 +typedef struct pts_track_el_s +{ + uint32_t n; + unsigned int interval; + AVFrame * props; +} pts_track_el_t; + +typedef struct pts_track_s +{ + uint32_t n; + uint32_t last_n; + int got_2; + void * logctx; + pts_stats_t stats; + pts_track_el_t a[PTS_TRACK_SIZE]; +} pts_track_t; + +typedef struct DeintV4L2M2MContextShared { + void * logctx; // For logging - will be NULL when done + + int fd; + int done; + int width; + int height; + int orig_width; + int orig_height; + atomic_uint refcount; + + AVBufferRef *hw_frames_ctx; + + unsigned int field_order; + + pts_track_t track; + + V4L2Queue output; + V4L2Queue capture; +} DeintV4L2M2MContextShared; + +typedef struct DeintV4L2M2MContext { + const AVClass *class; + + DeintV4L2M2MContextShared *shared; +} DeintV4L2M2MContext; + +static unsigned int pts_stats_interval(const pts_stats_t * const stats) +{ + return stats->last_interval; +} + +// Pick 64 for max last count - that is >1sec at 60fps +#define STATS_LAST_COUNT_MAX 64 +#define STATS_INTERVAL_MAX (1 << 30) +static void pts_stats_add(pts_stats_t * const stats, int64_t pts) +{ + if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { + if (stats->last_count < STATS_LAST_COUNT_MAX) + ++stats->last_count; + return; + } + + if (stats->last_pts != AV_NOPTS_VALUE) { + const int64_t interval = pts - stats->last_pts; + + if (interval < 0 || interval >= STATS_INTERVAL_MAX || + stats->last_count >= STATS_LAST_COUNT_MAX) { + if (stats->last_interval != 0) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", + __func__, stats->name, interval, stats->last_count); + stats->last_interval = 0; + } + else { + const int64_t frame_time = interval / (int64_t)stats->last_count; + + if (frame_time != stats->last_interval) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New 
interval: %u->%" PRId64 "/%d=%" PRId64 "\n", + __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); + stats->last_interval = frame_time; + } + } + + stats->last_pts = pts; + stats->last_count = 1; +} + +static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) +{ + *stats = (pts_stats_t){ + .logctx = logctx, + .name = name, + .last_count = 1, + .last_interval = 0, + .last_pts = AV_NOPTS_VALUE + }; +} + +static inline uint32_t pts_track_next_n(pts_track_t * const trk) +{ + if (++trk->n == 0) + trk->n = 1; + return trk->n; +} + +static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst) +{ + uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); + pts_track_el_t * t; + + // As a first guess assume that n==0 means last frame + if (n == 0) { + n = trk->last_n; + if (n == 0) + goto fail; + } + + t = trk->a + (n & (PTS_TRACK_SIZE - 1)); + + if (t->n != n) { + av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); + goto fail; + } + + // 1st frame is simple - just believe it + if (n != trk->last_n) { + trk->last_n = n; + trk->got_2 = 0; + return av_frame_copy_props(dst, t->props); + } + + // Only believe in a single interpolated frame + if (trk->got_2) + goto fail; + trk->got_2 = 1; + + av_frame_copy_props(dst, t->props); + + + // If we can't guess - don't + if (t->interval == 0) { + dst->best_effort_timestamp = AV_NOPTS_VALUE; + dst->pts = AV_NOPTS_VALUE; + dst->pkt_dts = AV_NOPTS_VALUE; + } + else { + if (dst->best_effort_timestamp != AV_NOPTS_VALUE) + dst->best_effort_timestamp += t->interval / 2; + if (dst->pts != AV_NOPTS_VALUE) + dst->pts += t->interval / 2; + if (dst->pkt_dts != AV_NOPTS_VALUE) + dst->pkt_dts += t->interval / 2; + } + + return 0; + +fail: + trk->last_n = 0; + trk->got_2 = 0; + dst->pts = AV_NOPTS_VALUE; + dst->pkt_dts = AV_NOPTS_VALUE; + return 0; +} + +static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) +{ + const uint32_t n = pts_track_next_n(trk); + pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); + + pts_stats_add(&trk->stats, src->pts); + + t->n = n; + t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last + av_frame_unref(t->props); + av_frame_copy_props(t->props, src); + + // We now know what the previous interval was, rather than having to guess, + // so set it. There is a better than decent chance that this is before + // we use it. 
+ if (t->interval != 0) { + pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); + prev_t->interval = t->interval; + } + + // In case deinterlace interpolates frames use every other usec + return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; +} + +static void pts_track_uninit(pts_track_t * const trk) +{ + unsigned int i; + for (i = 0; i != PTS_TRACK_SIZE; ++i) { + trk->a[i].n = 0; + av_frame_free(&trk->a[i].props); + } +} + +static int pts_track_init(pts_track_t * const trk, void *logctx) +{ + unsigned int i; + trk->n = 1; + pts_stats_init(&trk->stats, logctx, "track"); + for (i = 0; i != PTS_TRACK_SIZE; ++i) { + trk->a[i].n = 0; + if ((trk->a[i].props = av_frame_alloc()) == NULL) { + pts_track_uninit(trk); + return AVERROR(ENOMEM); + } + } + return 0; +} + +static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) +{ + struct v4l2_capability cap; + int ret; + + memset(&cap, 0, sizeof(cap)); + ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); + if (ret < 0) + return ret; + + if (!(cap.capabilities & V4L2_CAP_STREAMING)) + return AVERROR(EINVAL); + + if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { + ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; + + return 0; + } + + if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { + ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + + return 0; + } + + return AVERROR(EINVAL); +} + +static int deint_v4l2m2m_try_format(V4L2Queue *queue) +{ + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + int ret, field; + + ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); + + if (V4L2_TYPE_IS_OUTPUT(fmt->type)) + field = V4L2_FIELD_INTERLACED_TB; + else + field = V4L2_FIELD_NONE; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; + fmt->fmt.pix_mp.field = field; + fmt->fmt.pix_mp.width = ctx->width; + fmt->fmt.pix_mp.height = ctx->height; + } else { + fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; + fmt->fmt.pix.field = field; + fmt->fmt.pix.width = ctx->width; + fmt->fmt.pix.height = ctx->height; + } + + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, + fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, + fmt->fmt.pix_mp.pixelformat, + fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + + ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); + if (ret) + return AVERROR(EINVAL); + + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, + fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, + fmt->fmt.pix_mp.pixelformat, + fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || + fmt->fmt.pix_mp.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + + return AVERROR(EINVAL); + } + } else { + if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || + fmt->fmt.pix.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + + return AVERROR(EINVAL); + } + } + + return 0; +} + +static int 
deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) +{ + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + int ret; + + struct v4l2_selection sel = { + .type = fmt->type, + .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, + }; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.field = field; + fmt->fmt.pix_mp.width = width; + fmt->fmt.pix_mp.height = ysize / pitch; + fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; + fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); + } else { + fmt->fmt.pix.field = field; + fmt->fmt.pix.width = width; + fmt->fmt.pix.height = height; + fmt->fmt.pix.sizeimage = 0; + fmt->fmt.pix.bytesperline = 0; + } + + ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); + + ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); + + sel.r.width = width; + sel.r.height = height; + sel.r.left = 0; + sel.r.top = 0; + sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, + sel.flags = V4L2_SEL_FLAG_LE; + + ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); + + return ret; +} + +static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) +{ + int ret; + + ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); + if (ctx->fd < 0) + return AVERROR(errno); + + ret = deint_v4l2m2m_prepare_context(ctx); + if (ret) + goto fail; + + ret = deint_v4l2m2m_try_format(&ctx->capture); + if (ret) + goto fail; + + ret = deint_v4l2m2m_try_format(&ctx->output); + if (ret) + goto fail; + + return 0; + +fail: + close(ctx->fd); + ctx->fd = -1; + + return ret; +} + +static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) +{ + int ret = AVERROR(EINVAL); + struct dirent *entry; + char node[PATH_MAX]; + DIR *dirp; + + dirp = opendir("/dev"); + if (!dirp) + return AVERROR(errno); + + for (entry = readdir(dirp); entry; entry = readdir(dirp)) { + + if (strncmp(entry->d_name, "video", 5)) + continue; + + snprintf(node, sizeof(node), "/dev/%s", entry->d_name); + av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); + ret = deint_v4l2m2m_probe_device(ctx, node); + if (!ret) + break; + } + + closedir(dirp); + + if (ret) { + av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); + ctx->fd = -1; + + return ret; + } + + av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); + + return 0; +} + +static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) +{ + int ret; + + ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); + if (ret < 0) + return AVERROR(errno); + + buf->enqueued = 1; + + return 0; +} + +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) +{ + struct v4l2_exportbuffer expbuf; + int i, ret; + + for (i = 0; i < avbuf->num_planes; i++) { + memset(&expbuf, 0, sizeof(expbuf)); + + expbuf.index = avbuf->buffer.index; + expbuf.type = avbuf->buffer.type; + expbuf.plane = i; + + ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); + if (ret < 0) + return AVERROR(errno); + + avbuf->fd = expbuf.fd; + + if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { + /* drm frame */ + avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; + avbuf->drm_frame.objects[i].fd = expbuf.fd; + 
avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + } else { + /* drm frame */ + avbuf->drm_frame.objects[0].size = avbuf->buffer.length; + avbuf->drm_frame.objects[0].fd = expbuf.fd; + avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + } + } + + return 0; +} + +static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) +{ + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + struct v4l2_requestbuffers req; + int ret, i, j, multiplanar; + uint32_t memory; + + memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? + V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + + multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); + + memset(&req, 0, sizeof(req)); + req.count = queue->num_buffers; + req.memory = memory; + req.type = fmt->type; + + ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); + if (ret < 0) { + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); + + return AVERROR(errno); + } + + queue->num_buffers = req.count; + queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); + if (!queue->buffers) { + av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); + + return AVERROR(ENOMEM); + } + + for (i = 0; i < queue->num_buffers; i++) { + V4L2Buffer *buf = &queue->buffers[i]; + + buf->enqueued = 0; + buf->fd = -1; + buf->q = queue; + + buf->buffer.type = fmt->type; + buf->buffer.memory = memory; + buf->buffer.index = i; + + if (multiplanar) { + buf->buffer.length = VIDEO_MAX_PLANES; + buf->buffer.m.planes = buf->planes; + } + + ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); + if (ret < 0) { + ret = AVERROR(errno); + + goto fail; + } + + if (multiplanar) + buf->num_planes = buf->buffer.length; + else + buf->num_planes = 1; + + for (j = 0; j < buf->num_planes; j++) { + V4L2PlaneInfo *info = &buf->plane_info[j]; + + if (multiplanar) { + info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; + info->length = buf->buffer.m.planes[j].length; + } else { + info->bytesperline = fmt->fmt.pix.bytesperline; + info->length = buf->buffer.length; + } + } + + if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { + ret = deint_v4l2m2m_enqueue_buffer(buf); + if (ret) + goto fail; + + ret = v4l2_buffer_export_drm(buf); + if (ret) + goto fail; + } + } + + return 0; + +fail: + for (i = 0; i < queue->num_buffers; i++) + if (queue->buffers[i].fd >= 0) + close(queue->buffers[i].fd); + av_free(queue->buffers); + queue->buffers = NULL; + + return ret; +} + +static int deint_v4l2m2m_streamon(V4L2Queue *queue) +{ + DeintV4L2M2MContextShared * const ctx = queue->ctx; + int type = queue->format.type; + int ret; + + ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); + if (ret < 0) + return AVERROR(errno); + + return 0; +} + +static int deint_v4l2m2m_streamoff(V4L2Queue *queue) +{ + DeintV4L2M2MContextShared * const ctx = queue->ctx; + int type = queue->format.type; + int ret; + + ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); + if (ret < 0) + return AVERROR(errno); + + return 0; +} + +// timeout in ms +static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) +{ + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + DeintV4L2M2MContextShared *ctx = queue->ctx; + struct v4l2_buffer buf = { 0 }; + V4L2Buffer* avbuf = NULL; + struct pollfd pfd; + short events; + int ret; + + if 
(V4L2_TYPE_IS_OUTPUT(queue->format.type)) + events = POLLOUT | POLLWRNORM; + else + events = POLLIN | POLLRDNORM; + + pfd.events = events; + pfd.fd = ctx->fd; + + for (;;) { + ret = poll(&pfd, 1, timeout); + if (ret > 0) + break; + if (errno == EINTR) + continue; + return NULL; + } + + if (pfd.revents & POLLERR) + return NULL; + + if (pfd.revents & events) { + memset(&buf, 0, sizeof(buf)); + buf.memory = V4L2_MEMORY_MMAP; + buf.type = queue->format.type; + if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { + memset(planes, 0, sizeof(planes)); + buf.length = VIDEO_MAX_PLANES; + buf.m.planes = planes; + } + + ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); + if (ret) { + if (errno != EAGAIN) + av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", + av_err2str(AVERROR(errno))); + return NULL; + } + + avbuf = &queue->buffers[buf.index]; + avbuf->enqueued = 0; + avbuf->buffer = buf; + if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buffer.m.planes = avbuf->planes; + } + return avbuf; + } + + return NULL; +} + +static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) +{ + int i; + V4L2Buffer *buf = NULL; + + for (i = 0; i < queue->num_buffers; i++) + if (!queue->buffers[i].enqueued) { + buf = &queue->buffers[i]; + break; + } + return buf; +} + +static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) +{ + int i; + V4L2Buffer *buf = NULL; + + if (!queue || !queue->buffers) + return; + for (i = 0; i < queue->num_buffers; i++) { + buf = &queue->buffers[i]; + if (queue->buffers[i].enqueued) + av_frame_unref(&buf->frame); + } +} + +static void recycle_q(V4L2Queue * const queue) +{ + V4L2Buffer* avbuf; + while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { + av_frame_unref(&avbuf->frame); + } +} + +static int count_enqueued(V4L2Queue *queue) +{ + int i; + int n = 0; + + if (queue->buffers == NULL) + return 0; + + for (i = 0; i < queue->num_buffers; i++) + if (queue->buffers[i].enqueued) + ++n; + return n; +} + +static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) +{ + DeintV4L2M2MContextShared *const ctx = queue->ctx; + AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; + V4L2Buffer *buf; + int i; + + if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) + recycle_q(queue); + + buf = deint_v4l2m2m_find_free_buf(queue); + if (!buf) { + av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); + return AVERROR(EAGAIN); + } + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) + for (i = 0; i < drm_desc->nb_objects; i++) + buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; + else + buf->buffer.m.fd = drm_desc->objects[0].fd; + + buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE : + frame->top_field_first ? 
V4L2_FIELD_INTERLACED_TB : + V4L2_FIELD_INTERLACED_BT; + + if (ctx->field_order != buf->buffer.field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); + ctx->field_order = buf->buffer.field; + } + + buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); + + buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; + + av_frame_move_ref(&buf->frame, frame); + + return deint_v4l2m2m_enqueue_buffer(buf); +} + +static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) +{ + if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { + V4L2Queue *capture = &ctx->capture; + V4L2Queue *output = &ctx->output; + int i; + + av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); + + if (ctx->fd >= 0) { + deint_v4l2m2m_streamoff(capture); + deint_v4l2m2m_streamoff(output); + } + + if (capture->buffers) + for (i = 0; i < capture->num_buffers; i++) { + capture->buffers[i].q = NULL; + if (capture->buffers[i].fd >= 0) + close(capture->buffers[i].fd); + } + + deint_v4l2m2m_unref_queued(output); + + av_buffer_unref(&ctx->hw_frames_ctx); + + if (capture->buffers) + av_free(capture->buffers); + + if (output->buffers) + av_free(output->buffers); + + if (ctx->fd >= 0) { + close(ctx->fd); + ctx->fd = -1; + } + + av_free(ctx); + } +} + +static void v4l2_free_buffer(void *opaque, uint8_t *unused) +{ + V4L2Buffer *buf = opaque; + DeintV4L2M2MContextShared *ctx = buf->q->ctx; + + if (!ctx->done) + deint_v4l2m2m_enqueue_buffer(buf); + + deint_v4l2m2m_destroy_context(ctx); +} + +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) +{ + int av_pix_fmt = AV_PIX_FMT_YUV420P; + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; + + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; + + layer = &drm_desc->layers[0]; + layer->nb_planes = avbuf->num_planes; + + for (int i = 0; i < avbuf->num_planes; i++) { + layer->planes[i].object_index = i; + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } + + switch (av_pix_fmt) { + case AV_PIX_FMT_YUYV422: + + layer->format = DRM_FORMAT_YUYV; + layer->nb_planes = 1; + + break; + + case AV_PIX_FMT_NV12: + case AV_PIX_FMT_NV21: + + layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? 
+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 2; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; + break; + + case AV_PIX_FMT_YUV420P: + + layer->format = DRM_FORMAT_YUV420; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 3; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; + + layer->planes[2].object_index = 0; + layer->planes[2].offset = layer->planes[1].offset + + ((avbuf->plane_info[0].bytesperline * + height) >> 2); + layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; + break; + + default: + drm_desc->nb_layers = 0; + break; + } + + return (uint8_t *) drm_desc; +} + +// timeout in ms +static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) +{ + DeintV4L2M2MContextShared *ctx = queue->ctx; + V4L2Buffer* avbuf; + + av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); + + avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); + if (!avbuf) { + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); + return AVERROR(EAGAIN); + } + + // Fill in PTS and anciliary info from src frame + // we will want to overwrite some fields as only the pts/dts + // fields are updated with new timing in this fn + pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); + + frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, + sizeof(avbuf->drm_frame), v4l2_free_buffer, + avbuf, AV_BUFFER_FLAG_READONLY); + if (!frame->buf[0]) { + av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); + return AVERROR(ENOMEM); + } + + atomic_fetch_add(&ctx->refcount, 1); + + frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); + frame->format = AV_PIX_FMT_DRM_PRIME; + if (ctx->hw_frames_ctx) + frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); + frame->height = ctx->height; + frame->width = ctx->width; + + // Not interlaced now + frame->interlaced_frame = 0; + frame->top_field_first = 0; + // Pkt duration halved + frame->pkt_duration /= 2; + + if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { + av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); + frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; + } + + av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); + return 0; +} + +static int deint_v4l2m2m_config_props(AVFilterLink *outlink) +{ + AVFilterLink *inlink = outlink->src->inputs[0]; + AVFilterContext *avctx = outlink->src; + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + int ret; + + ctx->height = avctx->inputs[0]->h; + ctx->width = avctx->inputs[0]->w; + + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); + + outlink->time_base = inlink->time_base; + outlink->w = inlink->w; + outlink->h = inlink->h; + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + outlink->format = inlink->format; + outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate + + ret = deint_v4l2m2m_find_device(ctx); + if (ret) + return ret; + + if (inlink->hw_frames_ctx) { + ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); + if (!ctx->hw_frames_ctx) + return AVERROR(ENOMEM); + } + return 0; +} + +static int 
deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *avctx = link->dst; + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + V4L2Queue *capture = &ctx->capture; + V4L2Queue *output = &ctx->output; + int ret; + + av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", + __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); + av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, + avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); + + if (ctx->field_order == V4L2_FIELD_ANY) { + AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; + ctx->orig_width = drm_desc->layers[0].planes[0].pitch; + ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; + + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, + drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); + + if (in->top_field_first) + ctx->field_order = V4L2_FIELD_INTERLACED_TB; + else + ctx->field_order = V4L2_FIELD_INTERLACED_BT; + + ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + if (ret) + return ret; + + ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + if (ret) + return ret; + + ret = deint_v4l2m2m_allocate_buffers(capture); + if (ret) + return ret; + + ret = deint_v4l2m2m_streamon(capture); + if (ret) + return ret; + + ret = deint_v4l2m2m_allocate_buffers(output); + if (ret) + return ret; + + ret = deint_v4l2m2m_streamon(output); + if (ret) + return ret; + } + + ret = deint_v4l2m2m_enqueue_frame(output, in); + + av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); + return ret; +} + +static int deint_v4l2m2m_activate(AVFilterContext *avctx) +{ + DeintV4L2M2MContext * const priv = avctx->priv; + DeintV4L2M2MContextShared *const s = priv->shared; + AVFilterLink * const outlink = avctx->outputs[0]; + AVFilterLink * const inlink = avctx->inputs[0]; + int n = 0; + int cn = 99; + int instatus = 0; + int64_t inpts = 0; + int did_something = 0; + + av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); + + FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); + + ff_inlink_acknowledge_status(inlink, &instatus, &inpts); + + if (!ff_outlink_frame_wanted(outlink)) { + av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); + } + else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! + { + AVFrame * frame = av_frame_alloc(); + int rv; + +again: + recycle_q(&s->output); + n = count_enqueued(&s->output); + + if (frame == NULL) { + av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); + return AVERROR(ENOMEM); + } + + rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 
300 : 0); + if (rv != 0) { + av_frame_free(&frame); + if (rv != AVERROR(EAGAIN)) { + av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); + return rv; + } + } + else { + frame->interlaced_frame = 0; + // frame is always consumed by filter_frame - even on error despite + // a somewhat confusing comment in the header + rv = ff_filter_frame(outlink, frame); + + if (instatus != 0) { + av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); + goto again; + } + + av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); + did_something = 1; + } + + cn = count_enqueued(&s->capture); + } + + if (instatus != 0) { + ff_outlink_set_status(outlink, instatus, inpts); + av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); + return 0; + } + + { + AVFrame * frame; + int rv; + + recycle_q(&s->output); + n = count_enqueued(&s->output); + + while (n < 6) { + if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { + av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); + return rv; + } + + if (frame == NULL) { + av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); + break; + } + + deint_v4l2m2m_filter_frame(inlink, frame); + av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); + ++n; + } + } + + if (n < 6) { + ff_inlink_request_frame(inlink); + did_something = 1; + av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); + } + + if (n > 4 && ff_outlink_frame_wanted(outlink)) { + ff_filter_set_ready(avctx, 1); + did_something = 1; + av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); + } + + av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); + return did_something ? 0 : FFERROR_NOT_READY; +} + +static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) +{ + DeintV4L2M2MContext * const priv = avctx->priv; + DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); + + if (!ctx) { + av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); + return AVERROR(ENOMEM); + } + priv->shared = ctx; + ctx->logctx = priv; + ctx->fd = -1; + ctx->output.ctx = ctx; + ctx->output.num_buffers = 8; + ctx->capture.ctx = ctx; + ctx->capture.num_buffers = 12; + ctx->done = 0; + ctx->field_order = V4L2_FIELD_ANY; + + pts_track_init(&ctx->track, priv); + + atomic_init(&ctx->refcount, 1); + + return 0; +} + +static void deint_v4l2m2m_uninit(AVFilterContext *avctx) +{ + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + + ctx->done = 1; + ctx->logctx = NULL; // Log to NULL works, log to missing crashes + pts_track_uninit(&ctx->track); + deint_v4l2m2m_destroy_context(ctx); +} + +static const AVOption deinterlace_v4l2m2m_options[] = { + { NULL }, +}; + +AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); + +static const AVFilterPad deint_v4l2m2m_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, +}; + +static const AVFilterPad deint_v4l2m2m_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = deint_v4l2m2m_config_props, + }, +}; + +AVFilter ff_vf_deinterlace_v4l2m2m = { + .name = "deinterlace_v4l2m2m", + .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), + .priv_size = sizeof(DeintV4L2M2MContext), + .init = &deint_v4l2m2m_init, + .uninit = &deint_v4l2m2m_uninit, + FILTER_INPUTS(deint_v4l2m2m_inputs), + FILTER_OUTPUTS(deint_v4l2m2m_outputs), + FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME), + .priv_class = &deinterlace_v4l2m2m_class, + .activate = 
deint_v4l2m2m_activate, +}; -- 2.43.0 From 0d48f082740534fbf216aa2ee4b910a590f0af7b Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 2 Dec 2021 17:49:55 +0000 Subject: [PATCH 030/157] Put no_pts_rescale in context which makes more sense than an arg --- libavcodec/v4l2_buffers.c | 28 ++++++++++++++-------------- libavcodec/v4l2_buffers.h | 5 ++--- libavcodec/v4l2_context.c | 8 ++++---- libavcodec/v4l2_context.h | 13 +++++++++---- libavcodec/v4l2_m2m_dec.c | 9 +++++---- 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 126d2a17f4..22da6bd722 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -39,7 +39,7 @@ #define USEC_PER_SEC 1000000 static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; -static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) +static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) { return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? container_of(buf->context, V4L2m2mContext, output) : @@ -51,34 +51,34 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) return buf_to_m2mctx(buf)->avctx; } -static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) +static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) { - V4L2m2mContext *s = buf_to_m2mctx(avbuf); + const V4L2m2mContext *s = buf_to_m2mctx(avbuf); const AVRational tb = s->avctx->pkt_timebase.num ? s->avctx->pkt_timebase : s->avctx->time_base; return tb.num && tb.den ? tb : v4l2_timebase; } -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) +static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) { /* convert pts to v4l2 timebase */ const int64_t v4l2_pts = - no_rescale ? pts : + out->context->no_pts_rescale ? pts : pts == AV_NOPTS_VALUE ? 0 : av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; } -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) +static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) { /* convert pts back to encoder timebase */ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + avbuf->buf.timestamp.tv_usec; return - no_rescale ? v4l2_pts : + avbuf->context->no_pts_rescale ? v4l2_pts : v4l2_pts == 0 ? 
AV_NOPTS_VALUE : av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); } @@ -686,13 +686,13 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); v4l2_set_color_range(out, frame->color_range); // PTS & interlace are buffer vars - v4l2_set_pts(out, frame->pts, 0); + v4l2_set_pts(out, frame->pts); v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); return v4l2_buffer_swframe_to_buf(frame, out); } -int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) +int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) { int ret; V4L2Context * const ctx = avbuf->context; @@ -710,7 +710,7 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc frame->colorspace = v4l2_get_color_space(avbuf); frame->color_range = v4l2_get_color_range(avbuf); frame->color_trc = v4l2_get_color_trc(avbuf); - frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); + frame->pts = v4l2_get_pts(avbuf); frame->pkt_dts = AV_NOPTS_VALUE; frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); frame->top_field_first = v4l2_buf_is_top_first(avbuf); @@ -757,13 +757,13 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) pkt->flags |= AV_PKT_FLAG_CORRUPT; } - pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); + pkt->dts = pkt->pts = v4l2_get_pts(avbuf); return 0; } int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - const void *extdata, size_t extlen, int no_rescale_pts) + const void *extdata, size_t extlen) { int ret; @@ -777,7 +777,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, if (ret && ret != AVERROR(ENOMEM)) return ret; - v4l2_set_pts(out, pkt->pts, no_rescale_pts); + v4l2_set_pts(out, pkt->pts); if (pkt->flags & AV_PKT_FLAG_KEY) out->flags = V4L2_BUF_FLAG_KEYFRAME; @@ -787,7 +787,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) { - return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); + return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); } diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 111526aee3..641e0e147b 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -83,12 +83,11 @@ typedef struct V4L2Buffer { * * @param[in] frame The AVFRame to push the information to * @param[in] buf The V4L2Buffer to get the information from - * @param[in] no_rescale_pts If non-zero do not rescale PTS * * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, * AVERROR(ENOMEM) if the AVBufferRef can't be created. 
*/ -int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); +int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); /** * Extracts the data from a V4L2Buffer to an AVPacket @@ -113,7 +112,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - const void *extdata, size_t extlen, int no_rescale_pts); + const void *extdata, size_t extlen); /** * Extracts the data from an AVFrame to a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 440dfaaba5..64540a37b3 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -808,7 +808,7 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) } int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, - const void * extdata, size_t extlen, int no_rescale_pts) + const void * extdata, size_t extlen) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; @@ -827,7 +827,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, if (!avbuf) return AVERROR(EAGAIN); - ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); + ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); if (ret == AVERROR(ENOMEM)) av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", __func__, pkt->size, avbuf->planes[0].length); @@ -837,7 +837,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, return ff_v4l2_buffer_enqueue(avbuf); } -int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) +int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; @@ -854,7 +854,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, return AVERROR(EAGAIN); } - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); + return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); } int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 37b0431400..4cc164886c 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -102,6 +102,13 @@ typedef struct V4L2Context { */ int done; + /** + * PTS rescale not wanted + * If the PTS is just a dummy frame count then rescale is + * actively harmful + */ + int no_pts_rescale; + AVBufferRef *frames_ref; int q_count; int dq_count; @@ -172,12 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) - * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as - * timestamp directly) * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. */ -int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); +int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); /** * Enqueues a buffer to a V4L2Context from an AVPacket @@ -189,7 +194,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int * @param[in] pkt A pointer to an AVPacket. 
* @return 0 in case of success, a negative error otherwise. */ -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); /** * Enqueues a buffer to a V4L2Context from an AVFrame diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index b49f470c0a..36754b314a 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -360,7 +360,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const if (!s->draining) { // Calling enqueue with an empty pkt starts drain av_assert0(s->buf_pkt.size == 0); - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); if (ret) { av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); return ret; @@ -381,8 +381,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const return ret; ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, - avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, - 1); + avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size); if (ret == AVERROR(EAGAIN)) { // Out of input buffers - keep packet @@ -442,7 +441,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) // when discarding // This returns AVERROR(EAGAIN) if there isn't a frame ready yet // but there is room in the input Q - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1); + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1); if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", @@ -569,10 +568,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; output->min_buf_size = max_coded_size(avctx); + output->no_pts_rescale = 1; capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; capture->min_buf_size = 0; + capture->no_pts_rescale = 1; /* the client requests the codec to generate DRM frames: * - data[0] will therefore point to the returned AVDRMFrameDescriptor -- 2.43.0 From 2ca5bbbee693408c3363b30d2c5cd0d6c8ac33e4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 8 Dec 2021 15:00:37 +0000 Subject: [PATCH 031/157] Use bitbuf min size for all streams --- libavcodec/v4l2_m2m_dec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 36754b314a..48a6810d18 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -507,15 +507,12 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } #endif +// This heuristic is for H264 but use for everything static uint32_t max_coded_size(const AVCodecContext * const avctx) { uint32_t wxh = avctx->coded_width * avctx->coded_height; uint32_t size; - // Currently the only thing we try to set our own limits for is H264 - if (avctx->codec_id != AV_CODEC_ID_H264) - return 0; - size = wxh * 3 / 2; // H.264 Annex A table A-1 gives minCR which is either 2 or 4 // unfortunately that doesn't yield an actually useful limit -- 2.43.0 From 28b8c8f7263a012a1a505a0903cd28de5d9d5e7b Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 3 Dec 2021 12:54:18 +0000 
Subject: [PATCH 032/157] Track pending frames in v4l2 stateful Track which frames are pending decode in the v4l2 stateful decoder. This relies on DTS & PTS having some relationship to reality, so any use of this code must cope with the results being wrong. Also moves the xlat state vars out of the main context and into their own structure. --- libavcodec/v4l2_m2m.h | 15 ++++-- libavcodec/v4l2_m2m_dec.c | 100 +++++++++++++++++++++++++++++--------- 2 files changed, 89 insertions(+), 26 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 82feb0afdb..3f86809623 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -44,8 +44,10 @@ #define FF_V4L2_M2M_TRACK_SIZE 128 typedef struct V4L2m2mTrackEl { int discard; // If we see this buffer its been flushed, so discard + int pending; int pkt_size; int64_t pts; + int64_t dts; int64_t reordered_opaque; int64_t pkt_pos; int64_t pkt_duration; @@ -62,6 +64,14 @@ typedef struct pts_stats_s int64_t guess; } pts_stats_t; +typedef struct xlat_track_s { + unsigned int track_no; + int64_t last_pts; + int64_t last_pkt_dts; + int64_t last_opaque; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; +} xlat_track_t; + typedef struct V4L2m2mContext { char devname[PATH_MAX]; int fd; @@ -96,10 +106,7 @@ typedef struct V4L2m2mContext { int output_drm; /* Frame tracking */ - int64_t last_pkt_dts; - int64_t last_opaque; - unsigned int track_no; - V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + xlat_track_t xlat; pts_stats_t pts_stat; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 48a6810d18..d8ebb466cd 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -242,22 +242,24 @@ static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts // buffer of all the things we want preserved (including the original PTS) // indexed by the tracking no. 
static void -xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) +xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) { int64_t track_pts; // Avoid 0 - if (++s->track_no == 0) - s->track_no = 1; + if (++x->track_no == 0) + x->track_no = 1; - track_pts = track_to_pts(avctx, s->track_no); + track_pts = track_to_pts(avctx, x->track_no); - av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); - s->last_pkt_dts = avpkt->dts; - s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); + x->last_pkt_dts = avpkt->dts; + x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ .discard = 0, + .pending = 1, .pkt_size = avpkt->size, .pts = avpkt->pts, + .dts = avpkt->dts, .reordered_opaque = avctx->reordered_opaque, .pkt_pos = avpkt->pos, .pkt_duration = avpkt->duration, @@ -268,31 +270,36 @@ xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *cons // Returns -1 if we should discard the frame static int -xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) +xlat_pts_out(AVCodecContext *const avctx, + xlat_track_t * const x, + pts_stats_t * const ps, + AVFrame *const frame) { unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; - const V4L2m2mTrackEl *const t = s->track_els + n; + V4L2m2mTrackEl *const t = x->track_els + n; if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) { av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); frame->pts = AV_NOPTS_VALUE; - frame->pkt_dts = s->last_pkt_dts; - frame->reordered_opaque = s->last_opaque; + frame->pkt_dts = x->last_pkt_dts; + frame->reordered_opaque = x->last_opaque; frame->pkt_pos = -1; frame->pkt_duration = 0; frame->pkt_size = -1; } else if (!t->discard) { - frame->pts = t->pts; - frame->pkt_dts = s->last_pkt_dts; + frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; + frame->pkt_dts = x->last_pkt_dts; frame->reordered_opaque = t->reordered_opaque; frame->pkt_pos = t->pkt_pos; frame->pkt_duration = t->pkt_duration; frame->pkt_size = t->pkt_size; - s->last_opaque = s->track_els[n].reordered_opaque; - s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS + x->last_opaque = x->track_els[n].reordered_opaque; + if (frame->pts != AV_NOPTS_VALUE) + x->last_pts = frame->pts; + t->pending = 0; } else { @@ -300,14 +307,62 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons return -1; } - pts_stats_add(&s->pts_stat, frame->pts); + pts_stats_add(ps, frame->pts); - frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat); + frame->best_effort_timestamp = pts_stats_guess(ps); frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); return 0; } +static void +xlat_flush(xlat_track_t * const x) +{ + unsigned int i; + for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { + x->track_els[i].pending = 0; + x->track_els[i].discard = 1; + } + x->last_pts = AV_NOPTS_VALUE; +} + +static void +xlat_init(xlat_track_t * const x) +{ + memset(x, 0, sizeof(*x)); + x->last_pts = AV_NOPTS_VALUE; +} + +static int +xlat_pending(const xlat_track_t * const x) +{ + unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; + unsigned int i; + int r = 0; + int64_t now = AV_NOPTS_VALUE; + + for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { + const V4L2m2mTrackEl * const t = x->track_els + n; + + if (!t->pending) + continue; + + if (now == AV_NOPTS_VALUE) + now = t->dts; + + if (t->pts == AV_NOPTS_VALUE || + ((now == AV_NOPTS_VALUE || t->pts <= now) && + (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) + ++r; + } + + // If we never get any ideas about PTS vs DTS allow a lot more buffer + if (now == AV_NOPTS_VALUE) + r -= 16; + + return r; +} + static inline int stream_started(const V4L2m2mContext * const s) { return s->capture.streamon && s->output.streamon; } @@ -374,7 +429,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const return ret; } - xlat_pts_in(avctx, s, &s->buf_pkt); + xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); } if ((ret = check_output_streamon(avctx, s)) != 0) @@ -417,6 +472,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) int dst_rv = 1; // Non-zero (done), non-negative (error) number do { + av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat)); src_rv = try_enqueue_src(avctx, s); // If we got a frame last time and we have nothing to enqueue then @@ -451,7 +507,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) s->draining, s->capture.done, dst_rv); // Go again if we got a frame that we need to discard - } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); + } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); } // Continue trying to enqueue packets if either @@ -550,6 +606,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) if (ret < 0) return ret; + xlat_init(&s->xlat); pts_stats_init(&s->pts_stat, avctx, "decoder"); capture = &s->capture; @@ -632,7 +689,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx) V4L2m2mContext * const s = priv->context; V4L2Context * const output = &s->output; V4L2Context * const capture = &s->capture; - int ret, i; + int ret; av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); @@ -646,8 +703,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx) // V4L2 makes no guarantees about whether decoded frames are flushed or not // so mark all frames we are tracking to be discarded if they appear - for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) - s->track_els[i].discard = 1; + xlat_flush(&s->xlat); // resend extradata s->extdata_sent = 0; -- 2.43.0 From e193fa8e20e5dead50050ce5fc430975be8996f7 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 17:58:21 +0000 Subject: [PATCH 033/157] Use pending tracking to reduce v4l2 latency If there are more than 5 pending decodes outstanding then add a small timeout to the capture poll to reduce the rate at which frames are added. 
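Illustrative restatement (not part of the patch) of the capture-dequeue timeout choice, using the same pending threshold and 5 ms poll as the diff below:

/* Simplified sketch of the heuristic: with more than 5 decodes pending,
 * poll the capture queue briefly instead of blocking, so output gets
 * drained before yet more input is queued. */
static int capture_dequeue_timeout(int pending_decodes)
{
    const int prefer_dq = pending_decodes > 5;  /* threshold used below */
    return prefer_dq ? 5 : -1;  /* 5 ms poll, or -1 to block as before */
}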
--- libavcodec/v4l2_m2m_dec.c | 58 ++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index d8ebb466cd..7e7e4729d0 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -370,16 +370,19 @@ static inline int stream_started(const V4L2m2mContext * const s) { #define NQ_OK 0 #define NQ_Q_FULL 1 #define NQ_SRC_EMPTY 2 -#define NQ_DRAINING 3 -#define NQ_DEAD 4 +#define NQ_NONE 3 +#define NQ_DRAINING 4 +#define NQ_DEAD 5 #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) +#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) // AVERROR_EOF Flushing an already flushed stream // -ve Error (all errors except EOF are unexpected) // NQ_OK (0) OK // NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) // NQ_SRC_EMPTY Src empty (do not retry) +// NQ_NONE Enqueue not attempted // NQ_DRAINING At EOS, dQ dest until EOS there too // NQ_DEAD Not running (do not retry, do not attempt capture dQ) @@ -468,23 +471,28 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) { V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; - int src_rv; + int src_rv = NQ_NONE; int dst_rv = 1; // Non-zero (done), non-negative (error) number + unsigned int i = 0; do { - av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat)); - src_rv = try_enqueue_src(avctx, s); - - // If we got a frame last time and we have nothing to enqueue then - // return now. rv will be AVERROR(EAGAIN) indicating that we want more input - // This should mean that once decode starts we enter a stable state where - // we alternately ask for input and produce output - if (s->req_pkt && src_rv == NQ_SRC_EMPTY) - break; - - if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { - av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); - src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue + const int pending = xlat_pending(&s->xlat); + const int prefer_dq = (pending > 5); + + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR + // (b) ... we (think we) do but we've failed to get a frame already OR + // (c) We've dequeued a lot of frames without asking for input + if (!prefer_dq || i != 0 || s->req_pkt > 2) { + src_rv = try_enqueue_src(avctx, s); + + // If we got a frame last time or we've already tried to get a frame and + // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) + // indicating that we want more input. + // This should mean that once decode starts we enter a stable state where + // we alternately ask for input and produce output + if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) + break; } // Try to get a new frame if @@ -495,9 +503,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) // Dequeue frame will unref any previous contents of frame // if it returns success so we don't need an explicit unref // when discarding - // This returns AVERROR(EAGAIN) if there isn't a frame ready yet - // but there is room in the input Q - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1); + // This returns AVERROR(EAGAIN) on timeout or if + // there is room in the input Q and timeout == -1 + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 
5 : -1); if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", @@ -510,10 +518,16 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); } + ++i; + if (i >= 256) { + av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i); + src_rv = AVERROR(EIO); + } + // Continue trying to enqueue packets if either // (a) we succeeded last time OR - // (b) enqueue failed due to input Q full AND there is now room - } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); + // (b) we didn't ret a frame and we can retry the input + } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); // Ensure that the frame contains nothing if we aren't returning a frame // (might happen when discarding) @@ -521,7 +535,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) av_frame_unref(frame); // If we got a frame this time ask for a pkt next time - s->req_pkt = (dst_rv == 0); + s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0; #if 0 if (dst_rv == 0) -- 2.43.0 From c3a89f13c32b99a76d5576a78d2f12727c400b9e Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 12:23:54 +0000 Subject: [PATCH 034/157] Allow logger() to take const ctx --- libavcodec/v4l2_buffers.c | 2 +- libavcodec/v4l2_context.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 22da6bd722..39c0094aec 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -46,7 +46,7 @@ static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) container_of(buf->context, V4L2m2mContext, capture); } -static inline AVCodecContext *logger(V4L2Buffer *buf) +static inline AVCodecContext *logger(const V4L2Buffer * const buf) { return buf_to_m2mctx(buf)->avctx; } diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 64540a37b3..d3df48aed4 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -43,14 +43,14 @@ struct v4l2_format_update { int update_avfmt; }; -static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) +static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) { return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
container_of(ctx, V4L2m2mContext, output) : container_of(ctx, V4L2m2mContext, capture); } -static inline AVCodecContext *logger(V4L2Context *ctx) +static inline AVCodecContext *logger(const V4L2Context *ctx) { return ctx_to_m2mctx(ctx)->avctx; } -- 2.43.0 From dc871df49288903e75b13b1f47c7df21571e9ebb Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 13:00:27 +0000 Subject: [PATCH 035/157] Track numbere of bufs qed with an atomic Safer and faster than counting status --- libavcodec/v4l2_buffers.c | 6 +++--- libavcodec/v4l2_context.c | 3 ++- libavcodec/v4l2_context.h | 3 +-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 39c0094aec..2cf7be6632 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -922,6 +922,7 @@ fail: int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) { int ret; + int qc; avbuf->buf.flags = avbuf->flags; @@ -941,11 +942,10 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) return AVERROR(err); } - ++avbuf->context->q_count; + qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", avbuf->context->name, avbuf->buf.index, - avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, - avbuf->context->q_count); + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); avbuf->status = V4L2BUF_IN_DRIVER; diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index d3df48aed4..268a057e53 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -599,7 +599,7 @@ static int v4l2_release_buffers(V4L2Context* ctx) " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); } } - ctx->q_count = 0; + atomic_store(&ctx->q_count, 0); return ret; } @@ -1019,6 +1019,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) } ff_mutex_init(&ctx->lock, NULL); + atomic_init(&ctx->q_count, 0); if (s->output_drm) { AVHWFramesContext *hwframes; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 4cc164886c..a4176448d5 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -110,8 +110,7 @@ typedef struct V4L2Context { int no_pts_rescale; AVBufferRef *frames_ref; - int q_count; - int dq_count; + atomic_int q_count; struct ff_weak_link_master *wl_master; AVMutex lock; -- 2.43.0 From 3acbf1493d98d0500c3eb264c2a322289a370167 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 9 Dec 2021 12:01:25 +0000 Subject: [PATCH 036/157] Clear pkt_buf on flush --- libavcodec/v4l2_m2m_dec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 7e7e4729d0..09ec496351 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -715,6 +715,9 @@ static void v4l2_decode_flush(AVCodecContext *avctx) if (ret < 0) av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); + // Clear any buffered input packet + av_packet_unref(&s->buf_pkt); + // V4L2 makes no guarantees about whether decoded frames are flushed or not // so mark all frames we are tracking to be discarded if they appear xlat_flush(&s->xlat); -- 2.43.0 From 7cb415333800b10920449a1f7d7896969992d4be Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 12:52:56 +0000 Subject: [PATCH 037/157] Rework v4l2 buffer dequeue --- libavcodec/v4l2_context.c | 543 ++++++++++++++++++-------------------- libavcodec/v4l2_context.h | 2 + libavcodec/v4l2_m2m.c | 1 - libavcodec/v4l2_m2m.h | 16 +- libavcodec/v4l2_m2m_dec.c | 138 
++++------ 5 files changed, 327 insertions(+), 373 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 268a057e53..d765181645 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -73,19 +73,27 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) return sar; } -static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) +static inline int ctx_buffers_alloced(const V4L2Context * const ctx) { - struct v4l2_format *fmt1 = &ctx->format; - int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? - fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || - fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height - : - fmt1->fmt.pix.width != fmt2->fmt.pix.width || - fmt1->fmt.pix.height != fmt2->fmt.pix.height; + return ctx->bufrefs != NULL; +} + +// Width/Height changed or we don't have an alloc in the first place? +static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) +{ + const struct v4l2_format *fmt1 = &ctx->format; + int ret = !ctx_buffers_alloced(ctx) || + (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? + fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || + fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height + : + fmt1->fmt.pix.width != fmt2->fmt.pix.width || + fmt1->fmt.pix.height != fmt2->fmt.pix.height); if (ret) - av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", + av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", ctx->name, + ctx_buffers_alloced(ctx), ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); @@ -167,10 +175,8 @@ static int do_source_change(V4L2m2mContext * const s) int ret; int reinit; - int full_reinit; struct v4l2_format cap_fmt = s->capture.format; - s->resize_pending = 0; s->capture.done = 0; ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); @@ -179,15 +185,21 @@ static int do_source_change(V4L2m2mContext * const s) return 0; } - s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); - get_default_selection(&s->capture, &s->capture.selection); - reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); + reinit = ctx_resolution_changed(&s->capture, &cap_fmt); + s->capture.format = cap_fmt; if (reinit) { s->capture.height = ff_v4l2_get_format_height(&cap_fmt); s->capture.width = ff_v4l2_get_format_width(&cap_fmt); } + + // If we don't support selection (or it is bust) and we obviously have HD then kludge + if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && + (s->capture.height == 1088 && s->capture.width == 1920)) { + s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; + } + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", @@ -195,11 +207,11 @@ static int do_source_change(V4L2m2mContext * const s) s->capture.selection.width, s->capture.selection.height, s->capture.selection.left, s->capture.selection.top); - s->reinit = 1; - if (reinit) { if (avctx) - ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); + ret = ff_set_dimensions(s->avctx, + s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, + s->capture.selection.height != 0 ? 
s->capture.selection.height : s->capture.height); if (ret < 0) av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); @@ -208,11 +220,22 @@ static int do_source_change(V4L2m2mContext * const s) av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); return AVERROR(EINVAL); } + + // Update pixel format - should only actually do something on initial change + s->capture.av_pix_fmt = + ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); + if (s->output_drm) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + avctx->sw_pix_fmt = s->capture.av_pix_fmt; + } + else + avctx->pix_fmt = s->capture.av_pix_fmt; + goto reinit_run; } /* Buffers are OK so just stream off to ack */ - av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); + av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); if (ret) @@ -225,54 +248,6 @@ reinit_run: return 1; } -static int ctx_done(V4L2Context * const ctx) -{ - int rv = 0; - V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - - ctx->done = 1; - - if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) - rv = do_source_change(s); - - return rv; -} - -/** - * handle resolution change event and end of stream event - * returns 1 if reinit was successful, negative if it failed - * returns 0 if reinit was not executed - */ -static int v4l2_handle_event(V4L2Context *ctx) -{ - V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - struct v4l2_event evt = { 0 }; - int ret; - - ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); - if (ret < 0) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); - return 0; - } - - av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); - - if (evt.type == V4L2_EVENT_EOS) { -// ctx->done = 1; - av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); - return 0; - } - - if (evt.type != V4L2_EVENT_SOURCE_CHANGE) - return 0; - - s->resize_pending = 1; - if (!ctx->done) - return 0; - - return do_source_change(s); -} - static int v4l2_stop_decode(V4L2Context *ctx) { struct v4l2_decoder_cmd cmd = { @@ -313,243 +288,252 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } -static int count_in_driver(const V4L2Context * const ctx) +// DQ a buffer +// Amalgamates all the various ways there are of signalling EOS/Event to +// generate a consistant EPIPE. +// +// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) +// +// Returns: +// 0 Success +// AVERROR(EPIPE) Nothing more to read +// * AVERROR(..) 
+ + static int +dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) { - int i; - int n = 0; + V4L2m2mContext * const m = ctx_to_m2mctx(ctx); + AVCodecContext * const avctx = m->avctx; + V4L2Buffer * avbuf; + const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); - if (!ctx->bufrefs) - return -1; - - for (i = 0; i < ctx->num_buffers; ++i) { - V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; - if (avbuf->status == V4L2BUF_IN_DRIVER) - ++n; - } - return n; -} + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; -static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) -{ - V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - struct v4l2_buffer buf = { 0 }; - V4L2Buffer *avbuf; - struct pollfd pfd = { - .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ - .fd = ctx_to_m2mctx(ctx)->fd, + struct v4l2_buffer buf = { + .type = ctx->type, + .memory = V4L2_MEMORY_MMAP, }; - int i, ret; - int no_rx_means_done = 0; - - if (is_capture && ctx->bufrefs) { - for (i = 0; i < ctx->num_buffers; i++) { - avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; - if (avbuf->status == V4L2BUF_IN_DRIVER) - break; - } - if (i == ctx->num_buffers) - av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " - "userspace. Increase num_capture_buffers " - "to prevent device deadlock or dropped " - "packets/frames.\n", i); + + *ppavbuf = NULL; + + if (ctx->flag_last) + return AVERROR(EPIPE); + + if (is_mp) { + buf.length = VIDEO_MAX_PLANES; + buf.m.planes = planes; } -#if 0 - // I think this is true but pointless - // we will get some other form of EOF signal - - /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (is_capture && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { - /* capture buffer initialization happens during decode hence - * detection happens at runtime - */ - if (!ctx->bufrefs) - break; - - avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; - if (avbuf->status == V4L2BUF_IN_DRIVER) - goto start; + while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { + const int err = errno; + av_assert0(AVERROR(err) < 0); + if (err != EINTR) { + av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", + ctx->name, av_err2str(AVERROR(err))); + + if (err == EPIPE) + ctx->flag_last = 1; + + return AVERROR(err); } - ctx->done = 1; - return NULL; } -#endif - -start: - if (is_capture) { - /* no need to listen to requests for more input while draining */ - if (ctx_to_m2mctx(ctx)->draining || timeout > 0) - pfd.events = POLLIN | POLLRDNORM | POLLPRI; - } else { - pfd.events = POLLOUT | POLLWRNORM; + atomic_fetch_sub(&ctx->q_count, 1); + + avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; + avbuf->status = V4L2BUF_AVAILABLE; + avbuf->buf = buf; + if (is_mp) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buf.m.planes = avbuf->planes; } - no_rx_means_done = s->resize_pending && is_capture; - for (;;) { - // If we have a resize pending then all buffers should be Qed - // With a resize pending we should be in drain but evidence suggests - // that not all decoders do this so poll to clear - int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout; - const int e = pfd.events; - - ret = poll(&pfd, 1, t2); + if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { + // Zero length cap buffer return == EOS + if ((is_mp ? 
buf.m.planes[0].bytesused : buf.bytesused) == 0) { + av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); - if (ret > 0) - break; + // Must reQ so we don't leak + // May not matter if the next thing we do is release all the + // buffers but better to be tidy. + ff_v4l2_buffer_enqueue(avbuf); - if (ret < 0) { - int err = errno; - if (err == EINTR) - continue; - av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", - err, strerror(err), - e, count_in_driver(ctx)); - return NULL; + ctx->flag_last = 1; + return AVERROR(EPIPE); } - // ret == 0 (timeout) - if (no_rx_means_done) { - av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); - ret = ctx_done(ctx); - if (ret > 0) - goto start; - } - if (timeout == -1) - av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; - return NULL; +#ifdef V4L2_BUF_FLAG_LAST + // If flag_last set then this contains data but is the last frame + // so remember that but return OK + if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) + ctx->flag_last = 1; +#endif } - /* 0. handle errors */ - if (pfd.revents & POLLERR) { - /* if we are trying to get free buffers but none have been queued yet - no need to raise a warning */ - if (timeout == 0) { - for (i = 0; i < ctx->num_buffers; i++) { - avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; - if (avbuf->status != V4L2BUF_AVAILABLE) - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); - } - } - else - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); + *ppavbuf = avbuf; + return 0; +} - return NULL; - } +/** + * handle resolution change event and end of stream event + * Expects to be called after the stream has stopped + * + * returns 1 if reinit was successful, negative if it failed + * returns 0 if reinit was not executed + */ +static int +get_event(V4L2m2mContext * const m) +{ + AVCodecContext * const avctx = m->avctx; + struct v4l2_event evt = { 0 }; - /* 1. handle resolution changes */ - if (pfd.revents & POLLPRI) { - ret = v4l2_handle_event(ctx); - if (ret < 0) { - /* if re-init failed, abort */ - ctx->done = 1; - return NULL; + while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { + const int rv = AVERROR(errno); + if (rv == AVERROR(EINTR)) + continue; + if (rv == AVERROR(EAGAIN)) { + av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n"); + return AVERROR_EOF; } - if (ret > 0) - goto start; + av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); + return rv; + } + + av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); + + if (evt.type == V4L2_EVENT_EOS) { + av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); + return AVERROR_EOF; } - /* 2. dequeue the buffer */ - if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { + if (evt.type == V4L2_EVENT_SOURCE_CHANGE) + return do_source_change(m); - if (is_capture) { - /* there is a capture buffer ready */ - if (pfd.revents & (POLLIN | POLLRDNORM)) - goto dequeue; + return 0; +} - // CAPTURE Q drained - if (no_rx_means_done) { - if (ctx_done(ctx) > 0) - goto start; - return NULL; - } - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ - if (pfd.revents & (POLLOUT | POLLWRNORM)) - return NULL; +// Get a buffer +// If output then just gets the buffer in the expected way +// If capture then runs the capture state m/c to deal with res change etc. 
+// If return value == 0 then *ppavbuf != NULL + +static int +get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) +{ + V4L2m2mContext * const m = ctx_to_m2mctx(ctx); + AVCodecContext * const avctx = m->avctx; + const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); + + const unsigned int poll_cap = (POLLIN | POLLRDNORM); + const unsigned int poll_out = (POLLOUT | POLLWRNORM); + const unsigned int poll_event = POLLPRI; + + *ppavbuf = NULL; + + for (;;) { + struct pollfd pfd = { + .fd = m->fd, + // If capture && stream not started then assume we are waiting for the initial event + .events = !is_cap ? poll_out : + !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : + poll_event, + }; + int ret; + + if (ctx->done) { + av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); + return AVERROR_EOF; } -dequeue: - memset(&buf, 0, sizeof(buf)); - buf.memory = V4L2_MEMORY_MMAP; - buf.type = ctx->type; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memset(planes, 0, sizeof(planes)); - buf.length = VIDEO_MAX_PLANES; - buf.m.planes = planes; + // If capture && timeout == -1 then also wait for rx buffer free + if (is_cap && timeout == -1 && m->output.streamon && !m->draining) + pfd.events |= poll_out; + + // If nothing Qed all we will get is POLLERR - avoid that + if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); + return AVERROR(EAGAIN); } - while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { - const int err = errno; - if (err == EINTR) + // Timeout kludged s.t. "forever" eventually gives up & produces logging + // If waiting for an event when we have seen a last_frame then we expect + // it to be ready already so force a short timeout + ret = poll(&pfd, 1, + ff_v4l2_ctx_eos(ctx) ? 10 : + timeout == -1 ? 3000 : timeout); + if (ret < 0) { + ret = AVERROR(errno); // Remember errno before logging etc. 
+ av_assert0(ret < 0); + } + + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", + ctx->name, ret, timeout, pfd.events, pfd.revents); + + if (ret < 0) { + if (ret == AVERROR(EINTR)) continue; - if (err != EAGAIN) { - // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST - if (err != EPIPE || !is_capture) - av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", - ctx->name, av_err2str(AVERROR(err))); - if (ctx_done(ctx) > 0) - goto start; + av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); + return ret; + } + + if (ret == 0) { + if (timeout == -1) + av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); + if (ff_v4l2_ctx_eos(ctx)) { + av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); + ret = get_event(m); + if (ret < 0) { + ctx->done = 1; + return ret; + } } - return NULL; + return AVERROR(EAGAIN); } - --ctx->q_count; - av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n", - ctx->name, buf.index, - buf.timestamp.tv_sec, buf.timestamp.tv_usec, - ctx->q_count, ++ctx->dq_count, buf.field); - - avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; - avbuf->status = V4L2BUF_AVAILABLE; - avbuf->buf = buf; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; + + if ((pfd.revents & POLLERR) != 0) { + av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); + return AVERROR_UNKNOWN; } - if (ctx_to_m2mctx(ctx)->draining && is_capture) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? - buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { - av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); + if ((pfd.revents & poll_event) != 0) { + ret = get_event(m); + if (ret < 0) { + ctx->done = 1; + return ret; + } + continue; + } - // Must reQ so we don't leak - // May not matter if the next thing we do is release all the - // buffers but better to be tidy. 
- ff_v4l2_buffer_enqueue(avbuf); + if ((pfd.revents & poll_cap) != 0) { + ret = dq_buf(ctx, ppavbuf); + if (ret == AVERROR(EPIPE)) + continue; + return ret; + } - if (ctx_done(ctx) > 0) - goto start; - return NULL; - } -#ifdef V4L2_BUF_FLAG_LAST - if (buf.flags & V4L2_BUF_FLAG_LAST) { - av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); - avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer - ctx_done(ctx); - } -#endif + if ((pfd.revents & poll_out) != 0) { + if (is_cap) + return AVERROR(EAGAIN); + return dq_buf(ctx, ppavbuf); } - return avbuf; + av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); + return AVERROR_UNKNOWN; } - - return NULL; } static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) { - int timeout = 0; /* return when no more buffers to dequeue */ int i; /* get back as many output buffers as possible */ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { - do { - } while (v4l2_dequeue_v4l2buf(ctx, timeout)); + V4L2Buffer * avbuf; + do { + get_qbuf(ctx, &avbuf, 0); + } while (avbuf); } for (i = 0; i < ctx->num_buffers; i++) { @@ -722,7 +706,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx) if (buf->status == V4L2BUF_IN_DRIVER) buf->status = V4L2BUF_AVAILABLE; } - ctx->q_count = 0; + atomic_store(&ctx->q_count, 0); } static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) @@ -755,6 +739,10 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) int ret; AVCodecContext * const avctx = logger(ctx); + // Avoid doing anything if there is nothing we can do + if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) + return 0; + ff_mutex_lock(&ctx->lock); if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) @@ -777,6 +765,9 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF"); } + // Both stream off & on effectively clear flag_last + ctx->flag_last = 0; + ff_mutex_unlock(&ctx->lock); return ret; @@ -840,19 +831,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; + int rv; - /* - * timeout=-1 blocks until: - * 1. decoded frame available - * 2. an input buffer is ready to be dequeued - */ - avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; - - return AVERROR(EAGAIN); - } + if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) + return rv; return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); } @@ -860,19 +842,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { V4L2Buffer *avbuf; + int rv; - /* - * blocks until: - * 1. encoded packet available - * 2. 
an input buffer ready to be dequeued - */ - avbuf = v4l2_dequeue_v4l2buf(ctx, -1); - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; - - return AVERROR(EAGAIN); - } + if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) + return rv; return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } @@ -956,6 +929,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers int ret; int i; + av_assert0(ctx->bufrefs == NULL); + memset(&req, 0, sizeof(req)); req.count = req_buffers; req.memory = V4L2_MEMORY_MMAP; @@ -1033,8 +1008,8 @@ int ff_v4l2_context_init(V4L2Context* ctx) hwframes = (AVHWFramesContext*)ctx->frames_ref->data; hwframes->format = AV_PIX_FMT_DRM_PRIME; hwframes->sw_format = ctx->av_pix_fmt; - hwframes->width = ctx->width; - hwframes->height = ctx->height; + hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; + hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height; ret = av_hwframe_ctx_init(ctx->frames_ref); if (ret < 0) goto fail_unref_hwframes; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index a4176448d5..565858a1ed 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -102,6 +102,8 @@ typedef struct V4L2Context { */ int done; + int flag_last; + /** * PTS rescale not wanted * If the PTS is just a dummy frame count then rescale is diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index 516e6d9858..e26bd74c3e 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -235,7 +235,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) /* 5. complete reinit */ s->draining = 0; - s->reinit = 0; return 0; } diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 3f86809623..d71f6b721c 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -84,8 +84,6 @@ typedef struct V4L2m2mContext { AVCodecContext *avctx; sem_t refsync; atomic_uint refcount; - int reinit; - int resize_pending; /* null frame/packet received */ int draining; @@ -180,15 +178,25 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); -static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) +static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) { return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; } -static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) +static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) { return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; } +static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; +} + +static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) +{ + return ctx->flag_last; +} + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 09ec496351..e4b6569ba5 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -113,9 +113,6 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co if (ret < 0) av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); - if (!s->capture.streamon || ret < 0) - return ret; - ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); if (ret < 0) av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); @@ -127,69 +124,12 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co static int v4l2_try_start(AVCodecContext *avctx) { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; - struct v4l2_selection selection = { 0 }; + V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; int ret; /* 1. start the output process */ if ((ret = check_output_streamon(avctx, s)) != 0) return ret; - - if (capture->streamon) - return 0; - - /* 2. get the capture format */ - capture->format.type = capture->type; - ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); - return ret; - } - - /* 2.1 update the AVCodecContext */ - capture->av_pix_fmt = - ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); - if (s->output_drm) { - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - avctx->sw_pix_fmt = capture->av_pix_fmt; - } - else - avctx->pix_fmt = capture->av_pix_fmt; - - /* 3. set the crop parameters */ -#if 1 - selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - selection.target = V4L2_SEL_TGT_CROP_DEFAULT; - ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); - av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); -#else - selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - selection.r.height = avctx->coded_height; - selection.r.width = avctx->coded_width; - av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); - ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); - av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); - if (1) { - ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); - } else { - av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); - /* update the size of the resulting frame */ - capture->height = selection.r.height; - capture->width = selection.r.width; - } - } -#endif - - /* 5. 
start the capture process */ - ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); - if (ret) { - av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); - return ret; - } - return 0; } @@ -364,7 +304,7 @@ xlat_pending(const xlat_track_t * const x) } static inline int stream_started(const V4L2m2mContext * const s) { - return s->capture.streamon && s->output.streamon; + return s->output.streamon; } #define NQ_OK 0 @@ -377,6 +317,9 @@ static inline int stream_started(const V4L2m2mContext * const s) { #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) #define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) +// do_not_get If true then no new packet will be got but status will +// be set appropriately + // AVERROR_EOF Flushing an already flushed stream // -ve Error (all errors except EOF are unexpected) // NQ_OK (0) OK @@ -386,14 +329,14 @@ static inline int stream_started(const V4L2m2mContext * const s) { // NQ_DRAINING At EOS, dQ dest until EOS there too // NQ_DEAD Not running (do not retry, do not attempt capture dQ) -static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) +static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) { int ret; // If we don't already have a coded packet - get a new one // We will already have a coded pkt if the output Q was full last time we // tried to Q it - if (!s->buf_pkt.size) { + if (!s->buf_pkt.size && !do_not_get) { ret = ff_decode_get_packet(avctx, &s->buf_pkt); if (ret == AVERROR(EAGAIN)) { @@ -435,6 +378,17 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); } + if (s->draining) { + if (s->buf_pkt.size) { + av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); + av_packet_unref(&s->buf_pkt); + } + return NQ_DRAINING; + } + + if (!s->buf_pkt.size) + return NQ_NONE; + if ((ret = check_output_streamon(avctx, s)) != 0) return ret; @@ -471,7 +425,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) { V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; - int src_rv = NQ_NONE; + int src_rv; int dst_rv = 1; // Non-zero (done), non-negative (error) number unsigned int i = 0; @@ -483,31 +437,40 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) // (a) We don't have a lot of stuff in the buffer already OR // (b) ... we (think we) do but we've failed to get a frame already OR // (c) We've dequeued a lot of frames without asking for input - if (!prefer_dq || i != 0 || s->req_pkt > 2) { - src_rv = try_enqueue_src(avctx, s); - - // If we got a frame last time or we've already tried to get a frame and - // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) - // indicating that we want more input. - // This should mean that once decode starts we enter a stable state where - // we alternately ask for input and produce output - if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) - break; - } + src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); + + // If we got a frame last time or we've already tried to get a frame and + // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) + // indicating that we want more input. 
+ // This should mean that once decode starts we enter a stable state where + // we alternately ask for input and produce output + if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) + break; // Try to get a new frame if // (a) we haven't already got one AND // (b) enqueue returned a status indicating that decode should be attempted if (dst_rv != 0 && TRY_DQ(src_rv)) { + // Pick a timeout depending on state + const int t = + src_rv == NQ_DRAINING ? 300 : + prefer_dq ? 5 : + src_rv == NQ_Q_FULL ? -1 : 0; + do { // Dequeue frame will unref any previous contents of frame // if it returns success so we don't need an explicit unref // when discarding // This returns AVERROR(EAGAIN) on timeout or if // there is room in the input Q and timeout == -1 - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1); + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); - if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); + dst_rv = AVERROR_EOF; + s->capture.done = 1; + } + else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", s->draining, s->capture.done); else if (dst_rv && dst_rv != AVERROR(EAGAIN)) @@ -630,8 +593,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. */ - output->height = capture->height = avctx->coded_height; - output->width = capture->width = avctx->coded_width; +// output->height = capture->height = avctx->coded_height; +// output->width = capture->width = avctx->coded_width; + output->height = capture->height = 0; + output->width = capture->width = 0; output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; @@ -703,7 +668,6 @@ static void v4l2_decode_flush(AVCodecContext *avctx) V4L2m2mContext * const s = priv->context; V4L2Context * const output = &s->output; V4L2Context * const capture = &s->capture; - int ret; av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); @@ -711,13 +675,19 @@ static void v4l2_decode_flush(AVCodecContext *avctx) // states like EOS processing so don't try to optimize out (having got it // wrong once) - ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); - if (ret < 0) - av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); + ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); // Clear any buffered input packet av_packet_unref(&s->buf_pkt); + // Clear a pending EOS + if (ff_v4l2_ctx_eos(capture)) { + // Arguably we could delay this but this is easy and doesn't require + // thought or extra vars + ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); + ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); + } + // V4L2 makes no guarantees about whether decoded frames are flushed or not // so mark all frames we are tracking to be discarded if they appear xlat_flush(&s->xlat); -- 2.43.0 From 89746bf810ecbbc1291db1c396803115071fe238 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 9 Dec 2021 18:51:00 +0000 Subject: [PATCH 038/157] Honor result of ff_get_format if possible --- libavcodec/v4l2_m2m_dec.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index e4b6569ba5..c9655bcc3b 100644 --- 
a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -615,15 +615,19 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * check the v4l2_get_drm_frame function. */ + avctx->sw_pix_fmt = avctx->pix_fmt; gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); - s->output_drm = 0; if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; s->output_drm = 1; } + else { + capture->av_pix_fmt = gf_pix_fmt; + s->output_drm = 0; + } s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); if (!s->device_ref) { -- 2.43.0 From 9ba0604fc2424aadf84b26cc57d176d1a3ea6c3a Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 14 Dec 2021 16:11:10 +0000 Subject: [PATCH 039/157] Add an always-reinit quirk --- libavcodec/v4l2_context.c | 7 +++++-- libavcodec/v4l2_m2m.h | 5 +++++ libavcodec/v4l2_m2m_dec.c | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index d765181645..c11b5e6863 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -188,6 +188,9 @@ static int do_source_change(V4L2m2mContext * const s) get_default_selection(&s->capture, &s->capture.selection); reinit = ctx_resolution_changed(&s->capture, &cap_fmt); + if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) + reinit = 1; + s->capture.format = cap_fmt; if (reinit) { s->capture.height = ff_v4l2_get_format_height(&cap_fmt); @@ -202,10 +205,10 @@ static int do_source_change(V4L2m2mContext * const s) s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", + av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n", s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, s->capture.selection.width, s->capture.selection.height, - s->capture.selection.left, s->capture.selection.top); + s->capture.selection.left, s->capture.selection.top, reinit); if (reinit) { if (avctx) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index d71f6b721c..f1923bb26d 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -113,6 +113,11 @@ typedef struct V4L2m2mContext { /* Ext data sent */ int extdata_sent; + +#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 + /* Quirks */ + unsigned int quirks; + } V4L2m2mContext; typedef struct V4L2m2mPriv { diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index c9655bcc3b..e2b10f5e3a 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -540,6 +540,34 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } #endif +static int +get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ + struct v4l2_capability cap; + + memset(&cap, 0, sizeof(cap)); + while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { + int err = errno; + if (err == EINTR) + continue; + av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); + return AVERROR(err); + } + + // Could be made table driven if we have a few more but right now there + // seems no point + + // Meson (amlogic) always gives a resolution changed event after output + // streamon and userspace must (re)allocate capture buffers and streamon + // capture to clear the event 
even if the capture buffers were the right + // size in the first place. + if (strcmp(cap.driver, "meson-vdec") == 0) + s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS; + + av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); + return 0; +} + // This heuristic is for H264 but use for everything static uint32_t max_coded_size(const AVCodecContext * const avctx) { @@ -646,7 +674,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } - return v4l2_prepare_decoder(s); + if ((ret = v4l2_prepare_decoder(s)) < 0) + return ret; + + return get_quirks(avctx, s); } static av_cold int v4l2_decode_close(AVCodecContext *avctx) -- 2.43.0 From a8f462aa153d463aebcf0b10849627c3bfe38d72 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jan 2022 16:58:31 +0000 Subject: [PATCH 040/157] v4l2_buffers: rework flags for keyframe Previously flags could become confused and keyframe info could be lost. This fixes that and removes the duplicate flags field in V4L2Buffer. --- libavcodec/v4l2_buffers.c | 15 ++++++++++----- libavcodec/v4l2_buffers.h | 1 - libavcodec/v4l2_context.c | 18 +++++++++++++++++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 2cf7be6632..62d1c26053 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -680,7 +680,9 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { - out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); + out->buf.flags = frame->key_frame ? + (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : + (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); // Beware that colour info is held in format rather than the actual // v4l2 buffer struct so this may not be as useful as you might hope v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); @@ -706,6 +708,10 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) /* 2. get frame information */ frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); + frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : + (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : + (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B : + AV_PICTURE_TYPE_NONE; frame->color_primaries = v4l2_get_color_primaries(avbuf); frame->colorspace = v4l2_get_color_space(avbuf); frame->color_range = v4l2_get_color_range(avbuf); @@ -779,8 +785,9 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, v4l2_set_pts(out, pkt->pts); - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; + out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : + (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); return ret; } @@ -924,8 +931,6 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) int ret; int qc; - avbuf->buf.flags = avbuf->flags; - if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", avbuf->context->name, avbuf->buf.index, diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 641e0e147b..3b7ca4d99e 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -73,7 +73,6 @@ typedef struct V4L2Buffer { struct v4l2_buffer buf; struct v4l2_plane planes[VIDEO_MAX_PLANES]; - int flags; enum V4L2Buffer_status status; } V4L2Buffer; diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index c11b5e6863..53b522d43e 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -527,6 +527,22 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout } } +// Clear out flags and timestamps that should should be set by the user +// Returns the passed avbuf +static V4L2Buffer * +clean_v4l2_buffer(V4L2Buffer * const avbuf) +{ + struct v4l2_buffer *const buf = &avbuf->buf; + + buf->flags = 0; + buf->field = V4L2_FIELD_ANY; + buf->timestamp = (struct timeval){0}; + buf->timecode = (struct v4l2_timecode){0}; + buf->sequence = 0; + + return avbuf; +} + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) { int i; @@ -542,7 +558,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) for (i = 0; i < ctx->num_buffers; i++) { V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; if (avbuf->status == V4L2BUF_AVAILABLE) - return avbuf; + return clean_v4l2_buffer(avbuf); } return NULL; -- 2.43.0 From 66385b656c09df4edd944c3a9ad0b22c7fe84827 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 22 Mar 2022 11:44:30 +0000 Subject: [PATCH 041/157] v4l2m2m: Rework decode to wait for missing buffer, add dynamic pending Previously receive_frame exited with EAGAIN if no capture buffer availble in the Q. Now it waits in the hope that another thread will post one. The prefer dQ logic is now dynamic to help with cases where PTS/DTS lies. If it looks like we are never getting a frame then the threshold is increased. It then slowly decays over time to cope with false alarms. 
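For readers unfamiliar with the scheme, the following standalone sketch shows the intended threshold dynamics. It is illustrative commentary only and not part of the diff: the constant names mirror the patch, while the pend_track struct and the update_pending()/main() harness are hypothetical.

    /*
     * Minimal sketch of the dynamic "prefer dequeue" threshold, assuming a
     * 16x fixed-point threshold (pending_hw) that decays by 1/16 of a frame
     * per decoded picture and is raised to the currently pending count after
     * several consecutive empty polls.
     */
    #include <stdio.h>

    #define PENDING_HW_MIN      (3 * 16)  /* floor: prefer dQ once >3 frames pending  */
    #define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
    #define PENDING_N_THRESHOLD 6         /* consecutive misses before raising it     */

    struct pend_track { int pending_hw, pending_n; };

    /* pending = frames currently tracked as in-flight; got_frame/timed_out describe
     * the last capture dequeue attempt. Returns non-zero if the next iteration
     * should try to dequeue a frame in preference to queueing more input. */
    static int update_pending(struct pend_track * const t, const int pending,
                              const int got_frame, const int timed_out)
    {
        const int prefer_dq = pending > t->pending_hw / 16;

        if (got_frame) {                      /* success: decay the threshold slowly  */
            if (--t->pending_hw < PENDING_HW_MIN)
                t->pending_hw = PENDING_HW_MIN;
            t->pending_n = 0;
        }
        else if (timed_out && prefer_dq &&    /* repeated misses: raise the threshold */
                 ++t->pending_n > PENDING_N_THRESHOLD) {
            t->pending_hw = pending * 16 + PENDING_HW_OFFSET;
            t->pending_n = 0;
        }
        return prefer_dq;
    }

    int main(void)
    {
        struct pend_track t = { PENDING_HW_MIN, 0 };
        /* e.g. 8 frames apparently pending but the decoder keeps timing out:
         * after PENDING_N_THRESHOLD misses the bar jumps so we go back to
         * feeding input instead of spinning on the capture queue. */
        for (int i = 0; i < 8; i++) {
            const int prefer_dq = update_pending(&t, 8, 0 /*got_frame*/, 1 /*timed_out*/);
            printf("iter=%d prefer_dq=%d pending_hw=%d\n", i, prefer_dq, t.pending_hw);
        }
        return 0;
    }

The net effect is that the decoder starts preferring capture dequeue once more than three frames are believed pending, raises that bar when polls repeatedly come back empty (e.g. lying PTS/DTS), and relaxes it again by 1/16 of a frame per successfully decoded picture.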
--- libavcodec/v4l2_buffers.c | 6 +++-- libavcodec/v4l2_context.c | 7 +++-- libavcodec/v4l2_context.h | 3 +++ libavcodec/v4l2_m2m.h | 2 ++ libavcodec/v4l2_m2m_dec.c | 57 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 69 insertions(+), 6 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 62d1c26053..8c4f18dbed 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -947,12 +947,14 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) return AVERROR(err); } + // Lock not wanted - if called from buffer free then lock already obtained qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + avbuf->status = V4L2BUF_IN_DRIVER; + pthread_cond_broadcast(&avbuf->context->cond); + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", avbuf->context->name, avbuf->buf.index, avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); - avbuf->status = V4L2BUF_IN_DRIVER; - return 0; } diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 53b522d43e..7ddb759810 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -300,6 +300,7 @@ static int v4l2_stop_encode(V4L2Context *ctx) // Returns: // 0 Success // AVERROR(EPIPE) Nothing more to read +// AVERROR(ENOSPC) No buffers in Q to put result in // * AVERROR(..) static int @@ -457,7 +458,7 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); - return AVERROR(EAGAIN); + return AVERROR(ENOSPC); } // Timeout kludged s.t. "forever" eventually gives up & produces logging @@ -864,7 +865,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) int rv; if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) - return rv; + return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } @@ -938,6 +939,7 @@ void ff_v4l2_context_release(V4L2Context* ctx) av_buffer_unref(&ctx->frames_ref); ff_mutex_destroy(&ctx->lock); + pthread_cond_destroy(&ctx->cond); } @@ -1013,6 +1015,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) } ff_mutex_init(&ctx->lock, NULL); + pthread_cond_init(&ctx->cond, NULL); atomic_init(&ctx->q_count, 0); if (s->output_drm) { diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 565858a1ed..0efff58f18 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -116,6 +116,7 @@ typedef struct V4L2Context { struct ff_weak_link_master *wl_master; AVMutex lock; + pthread_cond_t cond; } V4L2Context; /** @@ -182,6 +183,8 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. 
+ * AVERROR(ENOSPC) if no buffer availible to put + * the frame in */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index f1923bb26d..9a20447030 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -105,6 +105,8 @@ typedef struct V4L2m2mContext { /* Frame tracking */ xlat_track_t xlat; + int pending_hw; + int pending_n; pts_stats_t pts_stat; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index e2b10f5e3a..2e30449dfc 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -251,7 +251,8 @@ xlat_pts_out(AVCodecContext *const avctx, frame->best_effort_timestamp = pts_stats_guess(ps); frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? - av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); + av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", + frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); return 0; } @@ -422,6 +423,36 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const return ret; } +static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) +{ + int rv = 0; + + ff_mutex_lock(&ctx->lock); + + while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { + if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { + rv = AVERROR(errno); + av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); + break; + } + } + + ff_mutex_unlock(&ctx->lock); + return rv; +} + +// Number of frames over what xlat_pending returns that we keep *16 +// This is a min value - if it appears to be too small the threshold should +// adjust dynamically. +#define PENDING_HW_MIN (3 * 16) +// Offset to use when setting dynamically +// Set to %16 == 15 to avoid the threshold changing immediately as we relax +#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) +// Number of consecutive times we've failed to get a frame when we prefer it +// before we increase the prefer threshold (5ms * N = max expected decode +// time) +#define PENDING_N_THRESHOLD 6 + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) { V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; @@ -431,7 +462,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) do { const int pending = xlat_pending(&s->xlat); - const int prefer_dq = (pending > 5); + const int prefer_dq = (pending > s->pending_hw / 16); // Enqueue another pkt for decode if // (a) We don't have a lot of stuff in the buffer already OR @@ -465,6 +496,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) // there is room in the input Q and timeout == -1 dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + // Failure due to no buffer in Q? 
+ if (dst_rv == AVERROR(ENOSPC)) { + // Wait & retry + if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + } + } + + // Adjust dynamic pending threshold + if (dst_rv == 0) { + if (--s->pending_hw < PENDING_HW_MIN) + s->pending_hw = PENDING_HW_MIN; + s->pending_n = 0; + } + else if (dst_rv == AVERROR(EAGAIN)) { + if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { + s->pending_hw = pending * 16 + PENDING_HW_OFFSET; + s->pending_n = 0; + } + } + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); dst_rv = AVERROR_EOF; @@ -613,6 +665,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) xlat_init(&s->xlat); pts_stats_init(&s->pts_stat, avctx, "decoder"); + s->pending_hw = PENDING_HW_MIN; capture = &s->capture; output = &s->output; -- 2.43.0 From 1e7f61a010616b3c43c7b3b0d3d582d62f2dcddd Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 25 Mar 2022 15:37:58 +0000 Subject: [PATCH 042/157] v4l2_m2m2_dec: Avoid loop if unable to resize buffers If source change signals a buffer size that cannot be honored give up rather than looping indefinitely. This happens on Pi if (say) a 2560x1440 h264 stream is presented to the decode. --- libavcodec/v4l2_context.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 7ddb759810..007a58c8f1 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -205,8 +205,9 @@ static int do_source_change(V4L2m2mContext * const s) s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n", + av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, + s->capture.width, s->capture.height, s->capture.selection.width, s->capture.selection.height, s->capture.selection.left, s->capture.selection.top, reinit); @@ -224,9 +225,17 @@ static int do_source_change(V4L2m2mContext * const s) return AVERROR(EINVAL); } + if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || + s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { + av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", + s->capture.width, s->capture.height, + ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); + return AVERROR(EINVAL); + } + // Update pixel format - should only actually do something on initial change s->capture.av_pix_fmt = - ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); + ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); if (s->output_drm) { avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; avctx->sw_pix_fmt = s->capture.av_pix_fmt; -- 2.43.0 From 006f1b584db1beaa850b580ee940214f52498a0d Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 25 Mar 2022 18:14:40 +0000 Subject: [PATCH 043/157] v4l2dec: Improve size/format validation on init --- libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++-- libavcodec/v4l2_request_hevc.c | 11 +++++ 2 files changed, 92 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 2e30449dfc..8dcadf461b 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ 
b/libavcodec/v4l2_m2m_dec.c @@ -592,6 +592,76 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } #endif +static int +check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ + unsigned int i; + const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); + const uint32_t w = avctx->coded_width; + const uint32_t h = avctx->coded_height; + + if (w == 0 || h == 0 || fcc == 0) { + av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); + return 0; + } + + for (i = 0;; ++i) { + struct v4l2_frmsizeenum fs = { + .index = i, + .pixel_format = fcc, + }; + + while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { + const int err = AVERROR(errno); + if (err == AVERROR(EINTR)) + continue; + if (i == 0 && err == AVERROR(ENOTTY)) { + av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); + return 0; + } + if (err != AVERROR(EINVAL)) { + av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); + return err; + } + av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n", + w, h, av_fourcc2str(fcc)); + return err; + } + + switch (fs.type) { + case V4L2_FRMSIZE_TYPE_DISCRETE: + av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i, + fs.discrete.width,fs.discrete.height); + if (w == fs.discrete.width && h == fs.discrete.height) + return 0; + break; + case V4L2_FRMSIZE_TYPE_STEPWISE: + av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, + fs.stepwise.min_width, fs.stepwise.min_height, + fs.stepwise.max_width, fs.stepwise.max_height, + fs.stepwise.step_width,fs.stepwise.step_height); + if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && + h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && + (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && + (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) + return 0; + break; + case V4L2_FRMSIZE_TYPE_CONTINUOUS: + av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, + fs.stepwise.min_width, fs.stepwise.min_height, + fs.stepwise.max_width, fs.stepwise.max_height, + fs.stepwise.step_width,fs.stepwise.step_height); + if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && + h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) + return 0; + break; + default: + av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type); + return AVERROR(EINVAL); + } + } +} + static int get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) { @@ -698,8 +768,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) avctx->sw_pix_fmt = avctx->pix_fmt; gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); - av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", - avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", + avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), + avctx->coded_width, avctx->coded_height, + gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; @@ -730,7 +802,13 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) if ((ret = v4l2_prepare_decoder(s)) < 0) return ret; - return get_quirks(avctx, s); + if ((ret = get_quirks(avctx, s)) != 
0) + return ret; + + if ((ret = check_size(avctx, s)) != 0) + return ret; + + return 0; } static av_cold int v4l2_decode_close(AVCodecContext *avctx) diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index b0a5930844..76ab0916cd 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -147,6 +147,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + // Give up immediately if this is something that we have no code to deal with + if (h->ps.sps->chroma_format_idc != 1) { + av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); + return AVERROR_PATCHWELCOME; + } + if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || + h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { + av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); + return AVERROR_PATCHWELCOME; + } + if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); return (AVERROR(-ret)); -- 2.43.0 From 60623e6a55b0861d9e36c72d0abfd50df6494620 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 13 Apr 2022 16:05:56 +0000 Subject: [PATCH 044/157] v4l2 stateless hevc: Add another API variation for linux 5.18 This is probably going to be a short lived variation and may end up being reverted if no release using it ever ends up in the wild. --- libavcodec/Makefile | 2 +- libavcodec/hevc-ctrls-v3.h | 255 +++++++++++++++++++++++++++++++++ libavcodec/v4l2_req_hevc_v3.c | 3 + libavcodec/v4l2_req_hevc_vx.c | 17 +++ libavcodec/v4l2_req_media.c | 15 +- libavcodec/v4l2_req_media.h | 3 + libavcodec/v4l2_request_hevc.c | 6 +- libavcodec/v4l2_request_hevc.h | 1 + 8 files changed, 295 insertions(+), 7 deletions(-) create mode 100644 libavcodec/hevc-ctrls-v3.h create mode 100644 libavcodec/v4l2_req_hevc_v3.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index e1aa0ba014..2b3c16185d 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ - v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o + v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h new file mode 100644 index 0000000000..4e35bd583d --- /dev/null +++ b/libavcodec/hevc-ctrls-v3.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the HEVC state controls for use with stateless HEVC + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. So keep them private until they are stable and ready to + * become part of the official public API. + */ + +#ifndef _HEVC_CTRLS_H_ +#define _HEVC_CTRLS_H_ + +#include + +/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + +#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) +#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) +#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) +#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) +#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) + +/* enum v4l2_ctrl_type type values */ +#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 +#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 +#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 +#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 +#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 + +enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_mpeg_video_hevc_start_code { + V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, + V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/* The controls are not stable at the moment and will likely be reworked. */ +struct v4l2_ctrl_hevc_sps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) +#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) + +struct v4l2_ctrl_hevc_pps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ + __u8 num_extra_slice_header_bits; + __u8 num_ref_idx_l0_default_active_minus1; + __u8 num_ref_idx_l1_default_active_minus1; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + + __u8 padding[4]; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 flags; + __u8 field_pic; + __u16 pic_order_cnt[2]; + __u8 padding[2]; +}; + +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 padding[6]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __u16 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + __u8 padding[5]; + + __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u64 flags; +}; + +#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 +#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 +#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 + +struct v4l2_ctrl_hevc_decode_params { + __s32 pic_order_cnt_val; + __u8 num_active_dpb_entries; + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 num_poc_st_curr_before; + __u8 num_poc_st_curr_after; + __u8 num_poc_lt_curr; + __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u64 flags; +}; + +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ +#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) +/* + * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - + * the number of data (in bits) to skip in the + * slice segment header. + * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" + * to before syntax element "slice_temporal_mvp_enabled_flag". + * If IDR, the skipped bits are just "pic_output_flag" + * (separate_colour_plane_flag is not supported). + */ +#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) + +#endif diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c new file mode 100644 index 0000000000..dcc8d95632 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v3.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 3 +#include "v4l2_req_hevc_vx.c" + diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 0ae03b10c4..611fa21cc3 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -16,6 +16,8 @@ #elif HEVC_CTRLS_VERSION == 2 #include "hevc-ctrls-v2.h" +#elif HEVC_CTRLS_VERSION == 3 +#include "hevc-ctrls-v3.h" #else #error Unknown HEVC_CTRLS_VERSION #endif @@ -147,6 +149,7 @@ static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_t } } +#if HEVC_CTRLS_VERSION <= 2 static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) { const HEVCFrame *frame; @@ -172,6 +175,7 @@ static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) return 0; } +#endif static unsigned int get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, @@ -247,7 +251,12 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const struct v4l2_hevc_dpb_entry * const entry = entries + n++; entry->timestamp = frame_capture_dpb(frame->frame); +#if HEVC_CTRLS_VERSION <= 2 entry->rps = find_frame_rps_type(h, entry->timestamp); +#else + entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : + V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; +#endif entry->field_pic = frame->frame->interlaced_frame; /* TODO: Interleaved: Get the POC for each field. 
*/ @@ -1011,6 +1020,14 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) }; const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); +#if HEVC_CTRLS_VERSION == 2 + if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) + return AVERROR(EINVAL); +#elif HEVC_CTRLS_VERSION == 3 + if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) + return AVERROR(EINVAL); +#endif + if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); return AVERROR(EINVAL); diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c index eb00ecb406..980b306b8a 100644 --- a/libavcodec/v4l2_req_media.c +++ b/libavcodec/v4l2_req_media.c @@ -604,6 +604,7 @@ struct mediabufs_ctl { struct v4l2_format src_fmt; struct v4l2_format dst_fmt; + struct v4l2_capability capability; }; static int qe_v4l2_queue(struct qent_base *const be, @@ -1498,20 +1499,24 @@ void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) mediabufs_ctl_delete(mbc); } +unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) +{ + return mbc->capability.version; +} + static int set_capabilities(struct mediabufs_ctl *const mbc) { - struct v4l2_capability capability = { 0 }; uint32_t caps; - if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { + if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { int err = errno; request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); return -err; } - caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? - capability.device_caps : - capability.capabilities; + caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? + mbc->capability.device_caps : + mbc->capability.capabilities; if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h index 2f826cfb14..0307a831de 100644 --- a/libavcodec/v4l2_req_media.h +++ b/libavcodec/v4l2_req_media.h @@ -142,6 +142,9 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, struct dmabufs_ctl * const dbsc, unsigned int n); +#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) +unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); + struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char *vpath, struct pollqueue *const pq); void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index 76ab0916cd..20e4e0ab15 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) goto fail4; } - if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { + if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 3); + } + else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); ctx->fns = &V2(ff_v4l2_req_hevc, 2); } diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h index f14f594564..ed48d62e2d 100644 --- a/libavcodec/v4l2_request_hevc.h +++ b/libavcodec/v4l2_request_hevc.h @@ -98,5 +98,6 @@ typedef struct v4l2_req_decode_fns { extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); extern const 
v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); #endif -- 2.43.0 From 5bab9948f119dfcdc5dae19579e922f8da3b8360 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 3 May 2022 12:44:42 +0000 Subject: [PATCH 045/157] Remove V4l2 frame size check for meson-vdec --- libavcodec/v4l2_m2m.h | 3 ++- libavcodec/v4l2_m2m_dec.c | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 9a20447030..6bd5e8eda7 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -116,7 +116,8 @@ typedef struct V4L2m2mContext { /* Ext data sent */ int extdata_sent; -#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 +#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 +#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 /* Quirks */ unsigned int quirks; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 8dcadf461b..888ba67fea 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -604,6 +604,10 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); return 0; } + if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { + av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); + return 0; + } for (i = 0;; ++i) { struct v4l2_frmsizeenum fs = { @@ -623,8 +627,8 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); return err; } - av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n", - w, h, av_fourcc2str(fcc)); + av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", + w, h, av_fourcc2str(fcc), i); return err; } @@ -684,7 +688,7 @@ get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) // capture to clear the event even if the capture buffers were the right // size in the first place. 
if (strcmp(cap.driver, "meson-vdec") == 0) - s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS; + s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); return 0; -- 2.43.0 From 4c8fc3cac18ae9e8f581201620ada2e37791a558 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 23 May 2022 18:05:20 +0100 Subject: [PATCH 046/157] v4l2m2m_dec: Make some error rturns a bit more robust --- libavcodec/v4l2_context.c | 5 ++--- libavcodec/v4l2_m2m_dec.c | 23 ++++++++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 007a58c8f1..b3662aedaa 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -765,7 +765,7 @@ static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) { int type = ctx->type; - int ret; + int ret = 0; AVCodecContext * const avctx = logger(ctx); // Avoid doing anything if there is nothing we can do @@ -777,8 +777,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) stuff_all_buffers(avctx, ctx); - ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); - if (ret < 0) { + if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { const int err = errno; av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 888ba67fea..88a341aae2 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -110,16 +110,21 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co return 0; ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); - if (ret < 0) - av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); - - ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); - if (ret < 0) - av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); - else - av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); + if (ret != 0) { + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); + return ret; + } - return ret; + // STREAMON should do implicit START so this just for those that don't. + // It is optional so don't worry if it fails + if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { + ret = AVERROR(errno); + av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); + } + else { + av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); + } + return 0; } static int v4l2_try_start(AVCodecContext *avctx) -- 2.43.0 From 8d25cb150a32ddd9568171d0d09c187bb705ad59 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 24 May 2022 17:02:58 +0000 Subject: [PATCH 047/157] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA. Should also detect and complain about unexpected streams of empty packets. This functionality untested as I haven't yet found anything that creates NEW_EXTRADATA side data. 
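For reference, side data of this kind would be attached by a demuxer or caller roughly as follows. This is only a sketch for exercising the path above, not part of the patch; attach_new_extradata and its arguments are illustrative names, and only the av_packet_* calls are existing libavcodec API.

#include <string.h>
#include "libavcodec/packet.h"
#include "libavutil/error.h"

// Attach replacement codec extradata to a packet so the decoder picks it
// up via AV_PKT_DATA_NEW_EXTRADATA on the next ff_decode_get_packet().
static int attach_new_extradata(AVPacket *pkt,
                                const uint8_t *extradata, size_t extradata_size)
{
    uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
                                          extradata_size);
    if (!sd)
        return AVERROR(ENOMEM);
    memcpy(sd, extradata, extradata_size);
    return 0;
}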
--- libavcodec/v4l2_m2m.c | 1 + libavcodec/v4l2_m2m.h | 3 +++ libavcodec/v4l2_m2m_dec.c | 49 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index e26bd74c3e..6dd01e2e00 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -251,6 +251,7 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) av_frame_unref(s->frame); av_frame_free(&s->frame); av_packet_unref(&s->buf_pkt); + av_freep(&s->extdata_data); av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 6bd5e8eda7..19d618698d 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -115,6 +115,9 @@ typedef struct V4L2m2mContext { /* Ext data sent */ int extdata_sent; + /* Ext data sent in packet - overrides ctx */ + uint8_t * extdata_data; + size_t extdata_size; #define FF_V4L2_QUIRK_REINIT_ALWAYS 1 #define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 88a341aae2..392a68f0c7 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -343,7 +343,46 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const // We will already have a coded pkt if the output Q was full last time we // tried to Q it if (!s->buf_pkt.size && !do_not_get) { - ret = ff_decode_get_packet(avctx, &s->buf_pkt); + unsigned int i; + + for (i = 0; i < 256; ++i) { + uint8_t * side_data; + size_t side_size; + + ret = ff_decode_get_packet(avctx, &s->buf_pkt); + if (ret != 0) + break; + + // New extradata is the only side-data we undertand + side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); + if (side_data) { + av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); + av_freep(&s->extdata_data); + if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); + return AVERROR(ENOMEM); + } + memcpy(s->extdata_data, side_data, side_size); + s->extdata_size = side_size; + s->extdata_sent = 0; + } + + if (s->buf_pkt.size != 0) + break; + + if (s->buf_pkt.side_data_elems == 0) { + av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); + ret = AVERROR_EOF; + break; + } + + // Retry a side-data only pkt + } + // If i >= 256 something has gone wrong + if (i >= 256) { + av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); + return AVERROR(EIO); + } if (ret == AVERROR(EAGAIN)) { if (!stream_started(s)) { @@ -398,8 +437,12 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const if ((ret = check_output_streamon(avctx, s)) != 0) return ret; - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, - avctx->extradata, s->extdata_sent ? 
0 : avctx->extradata_size); + if (s->extdata_sent) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + else if (s->extdata_data) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); + else + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); if (ret == AVERROR(EAGAIN)) { // Out of input buffers - keep packet -- 2.43.0 From f1e8df2f5830a88636db10d16c6ff7b970d62616 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 24 May 2022 20:02:48 +0000 Subject: [PATCH 048/157] v4l2m2m_dec: Catch repeated Q fulls --- libavcodec/v4l2_m2m_dec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 392a68f0c7..7e17044706 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -504,13 +504,14 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) { V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; - int src_rv; + int src_rv = NQ_OK; int dst_rv = 1; // Non-zero (done), non-negative (error) number unsigned int i = 0; do { const int pending = xlat_pending(&s->xlat); const int prefer_dq = (pending > s->pending_hw / 16); + const int last_src_rv = src_rv; // Enqueue another pkt for decode if // (a) We don't have a lot of stuff in the buffer already OR @@ -526,6 +527,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) break; + if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { + av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); + break; + } + // Try to get a new frame if // (a) we haven't already got one AND // (b) enqueue returned a status indicating that decode should be attempted -- 2.43.0 From 8836d71a373e2c3d110bb799336c74851aa68bf2 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 27 May 2022 09:36:51 +0000 Subject: [PATCH 049/157] hevc: If hwaccel avoid creation of s/w only vars --- libavcodec/hevc_refs.c | 35 +++++++++++++++++++++-------------- libavcodec/hevcdec.c | 42 +++++++++++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c index 811e8feff8..f7cf14eabc 100644 --- a/libavcodec/hevc_refs.c +++ b/libavcodec/hevc_refs.c @@ -98,18 +98,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s) if (!frame->rpl_buf) goto fail; - frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); - if (!frame->tab_mvf_buf) - goto fail; - frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; + if (s->tab_mvf_pool) { + frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); + if (!frame->tab_mvf_buf) + goto fail; + frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; + } - frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); - if (!frame->rpl_tab_buf) - goto fail; - frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; - frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; - for (j = 0; j < frame->ctb_count; j++) - frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; + if (s->rpl_tab_pool) { + frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); + if (!frame->rpl_tab_buf) + goto fail; + frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; + frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; + for (j = 0; j < frame->ctb_count; j++) + 
frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; + } frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); @@ -297,14 +301,17 @@ static int init_slice_rpl(HEVCContext *s) int ctb_count = frame->ctb_count; int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; int i; + RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) return AVERROR_INVALIDDATA; - for (i = ctb_addr_ts; i < ctb_count; i++) - frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + if (frame->rpl_tab) { + for (i = ctb_addr_ts; i < ctb_count; i++) + frame->rpl_tab[i] = tab; + } - frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; + frame->refPicList = tab->refPicList; return 0; } diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c index e892436f94..a2c29a611c 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -536,6 +536,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, if (!sps) return 0; + // If hwaccel then we don't need all the s/w decode helper arrays + if (s->avctx->hwaccel) { + export_stream_params(s, sps); + + s->avctx->pix_fmt = pix_fmt; + s->ps.sps = sps; + s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; + return 0; + } + ret = pic_arrays_init(s, sps); if (ret < 0) goto fail; @@ -2893,11 +2903,13 @@ static int hevc_frame_start(HEVCContext *s) ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); int ret; - memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); - memset(s->vertical_bs, 0, s->bs_width * s->bs_height); - memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); - memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); - memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); + if (s->horizontal_bs) { + memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); + memset(s->vertical_bs, 0, s->bs_width * s->bs_height); + memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); + memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); + memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); + } s->is_decoded = 0; s->first_nal_type = s->nal_unit_type; @@ -3441,15 +3453,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src) dst->needs_fg = 1; } - dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); - if (!dst->tab_mvf_buf) - goto fail; - dst->tab_mvf = src->tab_mvf; + if (src->tab_mvf_buf) { + dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); + if (!dst->tab_mvf_buf) + goto fail; + dst->tab_mvf = src->tab_mvf; + } - dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); - if (!dst->rpl_tab_buf) - goto fail; - dst->rpl_tab = src->rpl_tab; + if (src->rpl_tab_buf) { + dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); + if (!dst->rpl_tab_buf) + goto fail; + dst->rpl_tab = src->rpl_tab; + } dst->rpl_buf = av_buffer_ref(src->rpl_buf); if (!dst->rpl_buf) -- 2.43.0 From dca72038c6f6101d27e48ab224995ed8872d0701 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 30 May 2022 17:51:44 +0100 Subject: [PATCH 050/157] rpi_sand: Add SAND30->NV12 conversion C code only. 
Reworks the hwcontext_drm conversion to use the rpi_sand_fns generic frame convert fn rather than calling the individual conversion functions directly. This keeps all teh stride and size logic in a single place. --- libavutil/hwcontext_drm.c | 46 ++++++++------------ libavutil/rpi_sand_fns.c | 89 +++++++++++++++++++++++++++++++++++++++ libavutil/rpi_sand_fns.h | 5 +++ 3 files changed, 111 insertions(+), 29 deletions(-) diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c index baf18920fa..137a952d2c 100644 --- a/libavutil/hwcontext_drm.c +++ b/libavutil/hwcontext_drm.c @@ -234,14 +234,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, enum AVHWFrameTransferDirection dir, enum AVPixelFormat **formats) { - enum AVPixelFormat *pix_fmts; + enum AVPixelFormat *p; - pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); - if (!pix_fmts) + p = *formats = av_malloc_array(3, sizeof(*p)); + if (!p) return AVERROR(ENOMEM); // **** Offer native sand too ???? - pix_fmts[0] = + *p++ = #if CONFIG_SAND ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? AV_PIX_FMT_YUV420P : @@ -249,9 +249,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, AV_PIX_FMT_YUV420P10LE : #endif ctx->sw_format; - pix_fmts[1] = AV_PIX_FMT_NONE; - *formats = pix_fmts; +#if CONFIG_SAND + if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) + *p++ = AV_PIX_FMT_NV12; +#endif + + *p = AV_PIX_FMT_NONE; return 0; } @@ -294,29 +299,12 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, const unsigned int w = FFMIN(dst->width, map->width); const unsigned int h = FFMIN(dst->height, map->height); - if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { - av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], - map->data[0], - 128, stride2, - 0, 0, w, h); - av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], - dst->data[2], dst->linesize[2], - map->data[1], - 128, stride2, - 0, 0, w / 2, h / 2); - } - else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { - av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], - map->data[0], - 128, stride2, - 0, 0, w, h); - av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], - dst->data[2], dst->linesize[2], - map->data[1], - 128, stride2, - 0, 0, w / 2, h / 2); - } - else + map->crop_top = 0; + map->crop_bottom = 0; + map->crop_left = 0; + map->crop_right = 0; + + if (av_rpi_sand_to_planar_frame(dst, map) != 0) { av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); err = AVERROR(EINVAL); diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c index 1f543e9357..256c3d532f 100644 --- a/libavutil/rpi_sand_fns.c +++ b/libavutil/rpi_sand_fns.c @@ -229,6 +229,75 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ } } +// Fetches a single patch - offscreen fixup not done here +// w <= stride1 +// single lose bottom 2 bits truncation +// _x & _w in pixels, strides in bytes +void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word + const unsigned int xskip0 = _x - (x0 >> 2) * 3; + const unsigned int x1 = ((_x + _w) / 3) * 4; + const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; + const unsigned int mask = 
stride1 - 1; + const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM && 0 + if (_x == 0) { + ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if (x0 == x1) { + // ******************* + // Partial single word xfer + return; + } + + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) + { + unsigned int x = x0; + const uint32_t * p = (const uint32_t *)p0; + uint8_t * d = dst; + + if (xskip0 != 0) { + const uint32_t p3 = *p++; + + if (xskip0 == 1) + *d++ = (p3 >> 12) & 0xff; + *d++ = (p3 >> 22) & 0xff; + + if (((x += 4) & mask) == 0) + p += slice_inc; + } + + while (x != x1) { + const uint32_t p3 = *p++; + *d++ = (p3 >> 2) & 0xff; + *d++ = (p3 >> 12) & 0xff; + *d++ = (p3 >> 22) & 0xff; + + if (((x += 4) & mask) == 0) + p += slice_inc; + } + + if (xrem1 != 0) { + const uint32_t p3 = *p; + + *d++ = (p3 >> 2) & 0xff; + if (xrem1 == 2) + *d++ = (p3 >> 12) & 0xff; + } + } +} + + // w/h in pixels void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -310,6 +379,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), x/2, y/2, w/2, h/2); break; + case AV_PIX_FMT_NV12: + av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y, w, h); + av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w, h/2); + break; default: return -1; } @@ -344,6 +423,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), x/2, y/2, w/2, h/2); break; + case AV_PIX_FMT_NV12: + av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y, w, h); + av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w, h/2); + break; default: return -1; } diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h index 634b55e800..462ccb8abd 100644 --- a/libavutil/rpi_sand_fns.h +++ b/libavutil/rpi_sand_fns.h @@ -85,6 +85,11 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); +void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); // w/h in pixels void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -- 2.43.0 From 6d7f9c7c140e054de1cba7dab60f805aa3712a0a Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 1 Jun 2022 17:49:26 +0000 Subject: [PATCH 051/157] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8 Also reworks the previous Armv8 SAND30->Y16 function in a slightly more efficient way that makes it look more like the Armv7 version. 
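For anyone checking the assembler against the C reference added in the previous commit, the per-word unpack being vectorised is outlined below. This is a sketch only; sand30_word_to_y8 is an illustrative name, and the shifts are taken straight from av_rpi_sand30_to_planar_y8.

#include <stdint.h>

// A SAND30 32-bit little-endian word packs three 10-bit samples in
// bits 0-9, 10-19 and 20-29 (bits 30-31 unused).  The Y8 path keeps the
// top 8 bits of each sample, i.e. sample >> 2; the Y16 path keeps all
// 10 bits (mask with 0x3ff instead of dropping the low 2 bits).
static inline void sand30_word_to_y8(uint8_t d[3], uint32_t w)
{
    d[0] = (w >> 2)  & 0xff;
    d[1] = (w >> 12) & 0xff;
    d[2] = (w >> 22) & 0xff;
}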
--- libavutil/aarch64/rpi_sand_neon.S | 549 ++++++++++++++++++------------ libavutil/aarch64/rpi_sand_neon.h | 4 + libavutil/arm/rpi_sand_neon.S | 239 ++++++++++--- libavutil/arm/rpi_sand_neon.h | 11 + libavutil/rpi_sand_fns.c | 2 +- 5 files changed, 541 insertions(+), 264 deletions(-) diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S index cdcf71ee67..2f07d9674c 100644 --- a/libavutil/aarch64/rpi_sand_neon.S +++ b/libavutil/aarch64/rpi_sand_neon.S @@ -248,228 +248,6 @@ incomplete_block_loop_end_c8: ret endfunc -//void ff_rpi_sand30_lines_to_planar_y16( -// uint8_t * dest, // [x0] -// unsigned int dst_stride, // [w1] -> assumed to be equal to _w -// const uint8_t * src, // [x2] -// unsigned int src_stride1, // [w3] -> 128 -// unsigned int src_stride2, // [w4] -// unsigned int _x, // [w5] -// unsigned int y, // [w6] -// unsigned int _w, // [w7] -// unsigned int h); // [sp, #0] - -function ff_rpi_sand30_lines_to_planar_y16, export=1 - stp x19, x20, [sp, #-48]! - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - - // w6 = argument h - ldr w6, [sp, #48] - - // slice_inc = ((stride2 - 1) * stride1) - mov w5, w4 - sub w5, w5, #1 - lsl w5, w5, #7 - - // total number of bytes per row = (width / 3) * 4 - mov w8, w7 - mov w9, #3 - udiv w8, w8, w9 - lsl w8, w8, #2 - - // number of full 128 byte blocks to be processed - mov w9, #96 - udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 - - // w10 = number of full integers to process (4 bytes) - // w11 = remaning zero to two 10bit values still to copy over - mov w12, #96 - mul w12, w9, w12 - sub w12, w7, w12 // width - blocks*96 = remaining points per row - mov w11, #3 - udiv w10, w12, w11 // full integers to process = w12 / 3 - mul w11, w10, w11 // #integers *3 - sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 - - // increase w9 by one if w10+w11 is not zero, and decrease the row count by one - // this is to efficiently copy incomplete blocks at the end of the rows - // the last row is handled explicitly to avoid writing out of bounds - add w22, w10, w11 - cmp w22, #0 - cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise - add w9, w9, w22 - sub w6, w6, #1 - - // store the number of bytes in w20 which we copy too much for every row - // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) - mov w20, #96*2 - mul w20, w20, w9 - sub w20, w1, w20 - - mov w23, #0 // flag to check whether the last line had already been processed - - // bitmask to clear the uppper 6bits of the result values - mov x19, #0x03ff03ff03ff03ff - dup v22.2d, x19 - - // row counter = 0 - eor w12, w12, w12 -row_loop_y16: - cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows - bge row_loop_y16_fin - - mov x13, x2 // row src - eor w14, w14, w14 // full block counter -block_loop_y16: - cmp w14, w9 - bge block_loop_y16_fin - - // load 64 bytes - ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 - - // process v0 and v1 - xtn v16.4h, v0.4s - ushr v0.4s, v0.4s, #10 - xtn v17.4h, v0.4s - ushr v0.4s, v0.4s, #10 - xtn v18.4h, v0.4s - - xtn2 v16.8h, v1.4s - and v16.16b, v16.16b, v22.16b - ushr v1.4s, v1.4s, #10 - xtn2 v17.8h, v1.4s - and v17.16b, v17.16b, v22.16b - ushr v1.4s, v1.4s, #10 - xtn2 v18.8h, v1.4s - and v18.16b, v18.16b, v22.16b - - st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 - - // process v2 and v3 - xtn v23.4h, v2.4s - ushr v2.4s, v2.4s, #10 - xtn v24.4h, v2.4s - ushr v2.4s, v2.4s, #10 - xtn v25.4h, v2.4s - - xtn2 v23.8h, v3.4s - and v23.16b, v23.16b, v22.16b - ushr v3.4s, 
v3.4s, #10 - xtn2 v24.8h, v3.4s - and v24.16b, v24.16b, v22.16b - ushr v3.4s, v3.4s, #10 - xtn2 v25.8h, v3.4s - and v25.16b, v25.16b, v22.16b - - st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 - - // load the second half of the block -> 64 bytes into registers v4-v7 - ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 - - // process v4 and v5 - xtn v16.4h, v4.4s - ushr v4.4s, v4.4s, #10 - xtn v17.4h, v4.4s - ushr v4.4s, v4.4s, #10 - xtn v18.4h, v4.4s - - xtn2 v16.8h, v5.4s - and v16.16b, v16.16b, v22.16b - ushr v5.4s, v5.4s, #10 - xtn2 v17.8h, v5.4s - and v17.16b, v17.16b, v22.16b - ushr v5.4s, v5.4s, #10 - xtn2 v18.8h, v5.4s - and v18.16b, v18.16b, v22.16b - - st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 - - // v6 and v7 - xtn v23.4h, v6.4s - ushr v6.4s, v6.4s, #10 - xtn v24.4h, v6.4s - ushr v6.4s, v6.4s, #10 - xtn v25.4h, v6.4s - - xtn2 v23.8h, v7.4s - and v23.16b, v23.16b, v22.16b - ushr v7.4s, v7.4s, #10 - xtn2 v24.8h, v7.4s - and v24.16b, v24.16b, v22.16b - ushr v7.4s, v7.4s, #10 - xtn2 v25.8h, v7.4s - and v25.16b, v25.16b, v22.16b - - st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 - - add x13, x13, x5 // row src += slice_inc - add w14, w14, #1 - b block_loop_y16 -block_loop_y16_fin: - - - - - add x2, x2, #128 // src += stride1 (start of the next row) - add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst - add w12, w12, #1 - b row_loop_y16 -row_loop_y16_fin: - - // check whether we have incomplete blocks at the end of every row - // in that case decrease row block count by one - // change height back to it's original value (meaning increase it by 1) - // and jump back to another iteration of row_loop_y16 - - cmp w23, #1 - beq row_loop_y16_fin2 // don't continue here if we already processed the last row - add w6, w6, #1 // increase height to the original value - sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count - mov w23, #1 - b row_loop_y16 -row_loop_y16_fin2: - - sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference - - // now we've got to handle the last block in the last row - eor w12, w12, w12 // w12 = 0 = counter -integer_loop_y16: - cmp w12, w10 - bge integer_loop_y16_fin - ldr w14, [x13], #4 - and w15, w14, #0x3ff - strh w15, [x0], #2 - lsr w14, w14, #10 - and w15, w14, #0x3ff - strh w15, [x0], #2 - lsr w14, w14, #10 - and w15, w14, #0x3ff - strh w15, [x0], #2 - add w12, w12, #1 - b integer_loop_y16 -integer_loop_y16_fin: - -final_values_y16: - // remaining point count = w11 - ldr w14, [x13], #4 - cmp w11, #0 - beq final_values_y16_fin - and w15, w14, #0x3ff - strh w15, [x0], #2 - cmp w11, #1 - beq final_values_y16_fin - lsr w14, w14, #10 - and w15, w14, #0x3ff - strh w15, [x0], #2 -final_values_y16_fin: - - ldp x23, x24, [sp, #32] - ldp x21, x22, [sp, #16] - ldp x19, x20, [sp], #48 - ret -endfunc - //void ff_rpi_sand30_lines_to_planar_c16( // uint8_t * dst_u, // [x0] // unsigned int dst_stride_u, // [w1] == _w*2 @@ -674,3 +452,330 @@ endfunc // unsigned int _w, // unsigned int h); +// void ff_rpi_sand30_lines_to_planar_y8( +// uint8_t * dest, : x0 +// unsigned int dst_stride, : w1 +// const uint8_t * src, : x2 +// unsigned int src_stride1, : w3, always 128 +// unsigned int src_stride2, : w4 +// unsigned int _x, : w5 +// unsigned int y, : w6 +// unsigned int _w, : w7 +// unsigned int h); : [sp, #0] +// +// Assumes that we are starting on a stripe boundary and that overreading +// within the stripe is OK. 
However it does respect the dest size for wri + +function ff_rpi_sand30_lines_to_planar_y16, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 + sub w1, w1, w7, lsl #1 + uxtw x6, w6 + add x8, x2, x6, lsl #7 + ldr w6, [sp, #0] + +10: + mov x2, x8 + mov w5, w7 +1: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 + + subs w5, w5, #96 + + // v0, v1 + + shrn v18.4h, v0.4s, #14 + xtn v16.4h, v0.4s + shrn v17.4h, v0.4s, #10 + + shrn2 v18.8h, v1.4s, #14 + xtn2 v16.8h, v1.4s + shrn2 v17.8h, v1.4s, #10 + + ushr v18.8h, v18.8h, #6 + bic v16.8h, #0xfc, lsl #8 + bic v17.8h, #0xfc, lsl #8 + + // v2, v3 + + shrn v21.4h, v2.4s, #14 + xtn v19.4h, v2.4s + shrn v20.4h, v2.4s, #10 + + shrn2 v21.8h, v3.4s, #14 + xtn2 v19.8h, v3.4s + shrn2 v20.8h, v3.4s, #10 + + ushr v21.8h, v21.8h, #6 + bic v19.8h, #0xfc, lsl #8 + bic v20.8h, #0xfc, lsl #8 + + // v4, v5 + + shrn v24.4h, v4.4s, #14 + xtn v22.4h, v4.4s + shrn v23.4h, v4.4s, #10 + + shrn2 v24.8h, v5.4s, #14 + xtn2 v22.8h, v5.4s + shrn2 v23.8h, v5.4s, #10 + + ushr v24.8h, v24.8h, #6 + bic v22.8h, #0xfc, lsl #8 + bic v23.8h, #0xfc, lsl #8 + + // v6, v7 + + shrn v27.4h, v6.4s, #14 + xtn v25.4h, v6.4s + shrn v26.4h, v6.4s, #10 + + shrn2 v27.8h, v7.4s, #14 + xtn2 v25.8h, v7.4s + shrn2 v26.8h, v7.4s, #10 + + ushr v27.8h, v27.8h, #6 + bic v25.8h, #0xfc, lsl #8 + bic v26.8h, #0xfc, lsl #8 + + blt 2f + + st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 + st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 + st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 + st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 + + bne 1b + +11: + subs w6, w6, #1 + add x0, x0, w1, uxtw + add x8, x8, #128 + bne 10b + + ret + +// Partial final write +2: + cmp w5, #48-96 + blt 1f + st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 + st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 + beq 11b + mov v16.16b, v22.16b + mov v17.16b, v23.16b + sub w5, w5, #48 + mov v18.16b, v24.16b + mov v19.16b, v25.16b + mov v20.16b, v26.16b + mov v21.16b, v27.16b +1: + cmp w5, #24-96 + blt 1f + st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 + beq 11b + mov v16.16b, v19.16b + mov v17.16b, v20.16b + sub w5, w5, #24 + mov v18.16b, v21.16b +1: + cmp w5, #12-96 + blt 1f + st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 + beq 11b + mov v16.2d[0], v16.2d[1] + sub w5, w5, #12 + mov v17.2d[0], v17.2d[1] + mov v18.2d[0], v18.2d[1] +1: + cmp w5, #6-96 + blt 1f + st3 {v16.h, v17.h, v18.h}[0], [x0], #6 + st3 {v16.h, v17.h, v18.h}[1], [x0], #6 + beq 11b + mov v16.2s[0], v16.2s[1] + sub w5, w5, #6 + mov v17.2s[0], v17.2s[1] + mov v18.2s[0], v18.2s[1] +1: + cmp w5, #3-96 + blt 1f + st3 {v16.h, v17.h, v18.h}[0], [x0], #6 + beq 11b + mov v16.4h[0], v16.4h[1] + sub w5, w5, #3 + mov v17.4h[0], v17.4h[1] +1: + cmp w5, #2-96 + blt 1f + st2 {v16.h, v17.h}[0], [x0], #4 + b 11b +1: + st1 {v16.h}[0], [x0], #2 + b 11b + +endfunc + +// void ff_rpi_sand30_lines_to_planar_y8( +// uint8_t * dest, : x0 +// unsigned int dst_stride, : w1 +// const uint8_t * src, : x2 +// unsigned int src_stride1, : w3, always 128 +// unsigned int src_stride2, : w4 +// unsigned int _x, : w5 +// unsigned int y, : w6 +// unsigned int _w, : w7 +// unsigned int h); : [sp, #0] +// +// Assumes that we are starting on a stripe boundary and that overreading +// within the stripe is OK. 
However it does respect the dest size for wri + +function ff_rpi_sand30_lines_to_planar_y8, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 + sub w1, w1, w7 + uxtw x6, w6 + add x8, x2, x6, lsl #7 + ldr w6, [sp, #0] + +10: + mov x2, x8 + mov w5, w7 +1: + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 + + subs w5, w5, #96 + + // v0, v1 + + shrn v18.4h, v0.4s, #16 + xtn v16.4h, v0.4s + shrn v17.4h, v0.4s, #12 + + shrn2 v18.8h, v1.4s, #16 + xtn2 v16.8h, v1.4s + shrn2 v17.8h, v1.4s, #12 + + shrn v18.8b, v18.8h, #6 + shrn v16.8b, v16.8h, #2 + xtn v17.8b, v17.8h + + // v2, v3 + + shrn v21.4h, v2.4s, #16 + xtn v19.4h, v2.4s + shrn v20.4h, v2.4s, #12 + + shrn2 v21.8h, v3.4s, #16 + xtn2 v19.8h, v3.4s + shrn2 v20.8h, v3.4s, #12 + + shrn2 v18.16b, v21.8h, #6 + shrn2 v16.16b, v19.8h, #2 + xtn2 v17.16b, v20.8h + + // v4, v5 + + shrn v24.4h, v4.4s, #16 + xtn v22.4h, v4.4s + shrn v23.4h, v4.4s, #12 + + shrn2 v24.8h, v5.4s, #16 + xtn2 v22.8h, v5.4s + shrn2 v23.8h, v5.4s, #12 + + shrn v21.8b, v24.8h, #6 + shrn v19.8b, v22.8h, #2 + xtn v20.8b, v23.8h + + // v6, v7 + + shrn v27.4h, v6.4s, #16 + xtn v25.4h, v6.4s + shrn v26.4h, v6.4s, #12 + + shrn2 v27.8h, v7.4s, #16 + xtn2 v25.8h, v7.4s + shrn2 v26.8h, v7.4s, #12 + + shrn2 v21.16b, v27.8h, #6 + shrn2 v19.16b, v25.8h, #2 + xtn2 v20.16b, v26.8h + + blt 2f + + st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 + st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 + + bne 1b + +11: + subs w6, w6, #1 + add x0, x0, w1, uxtw + add x8, x8, #128 + bne 10b + + ret + +// Partial final write +2: + cmp w5, #48-96 + blt 1f + st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 + beq 11b + mov v16.16b, v22.16b + mov v17.16b, v23.16b + sub w5, w5, #48 + mov v18.16b, v24.16b +1: + cmp w5, #24-96 + blt 1f + st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 + beq 11b + mov v16.2d[0], v16.2d[1] + sub w5, w5, #24 + mov v17.2d[0], v17.2d[1] + mov v18.2d[0], v18.2d[1] +1: + cmp w5, #12-96 + blt 1f + st3 {v16.b, v17.b, v18.b}[0], [x0], #3 + st3 {v16.b, v17.b, v18.b}[1], [x0], #3 + st3 {v16.b, v17.b, v18.b}[2], [x0], #3 + st3 {v16.b, v17.b, v18.b}[3], [x0], #3 + beq 11b + mov v16.2s[0], v16.2s[1] + sub w5, w5, #12 + mov v17.2s[0], v17.2s[1] + mov v18.2s[0], v18.2s[1] +1: + cmp w5, #6-96 + blt 1f + st3 {v16.b, v17.b, v18.b}[0], [x0], #3 + st3 {v16.b, v17.b, v18.b}[1], [x0], #3 + beq 11b + mov v16.4h[0], v16.4h[1] + sub w5, w5, #6 + mov v17.4h[0], v17.4h[1] + mov v18.4h[0], v18.4h[1] +1: + cmp w5, #3-96 + blt 1f + st3 {v16.b, v17.b, v18.b}[0], [x0], #3 + beq 11b + mov v16.8b[0], v16.8b[1] + sub w5, w5, #3 + mov v17.8b[0], v17.8b[1] +1: + cmp w5, #2-96 + blt 1f + st2 {v16.b, v17.b}[0], [x0], #2 + b 11b +1: + st1 {v16.b}[0], [x0], #1 + b 11b + +endfunc + diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h index b3aa481ea4..2a56135bc3 100644 --- a/libavutil/aarch64/rpi_sand_neon.h +++ b/libavutil/aarch64/rpi_sand_neon.h @@ -49,6 +49,10 @@ void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); +void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, + const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, + unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + #ifdef __cplusplus } #endif diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S index 80890fe985..60e697f681 100644 --- 
a/libavutil/arm/rpi_sand_neon.S +++ b/libavutil/arm/rpi_sand_neon.S @@ -360,7 +360,6 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 ldr r6, [sp, #36] ldr r7, [sp, #32] @ y mov r12, #48 - vmov.u16 q15, #0x3ff sub r3, #1 lsl r3, #7 sub r1, r1, r6, lsl #1 @@ -376,37 +375,33 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 vldm r2!, {q10-q13} add lr, #64 - vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! + vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! ands lr, #127 vshrn.u32 d2, q10, #10 vmovn.u32 d0, q10 - vmovn.u32 d4, q14 - vshr.u32 q14, q11, #20 + vshrn.u32 d5, q11, #14 it eq addeq r2, r3 vshrn.u32 d3, q11, #10 vmovn.u32 d1, q11 - vmovn.u32 d5, q14 subs r5, #48 - vand q0, q15 - vand q1, q15 - vand q2, q15 + vshr.u16 q2, #6 + vbic.u16 q0, #0xfc00 + vbic.u16 q1, #0xfc00 - vshr.u32 q14, q12, #20 + vshrn.u32 d20, q12, #14 vshrn.u32 d18, q12, #10 vmovn.u32 d16, q12 - vmovn.u32 d20, q14 - vshr.u32 q14, q13, #20 + vshrn.u32 d21, q13, #14 vshrn.u32 d19, q13, #10 vmovn.u32 d17, q13 - vmovn.u32 d21, q14 - vand q8, q15 - vand q9, q15 - vand q10, q15 + vshr.u16 q10, #6 + vbic.u16 q8, #0xfc00 + vbic.u16 q9 , #0xfc00 blt 2f vst3.16 {d0, d2, d4}, [r0], r12 @@ -499,7 +494,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 ldr r7, [sp, #48] ldr r9, [sp, #52] mov r12, #48 - vmov.u16 q15, #0x3ff sub r8, #1 lsl r8, #7 add r5, r5, r7, lsl #7 @@ -515,48 +509,44 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 add lr, #64 @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 - vshr.u32 q14, q0, #20 - vshrn.u32 d16, q0, #10 + vshrn.u32 d20, q0, #14 vmovn.u32 d18, q0 + vshrn.u32 d0, q0, #10 ands lr, #127 - vmovn.u32 d20, q14 - vshr.u32 q14, q1, #20 - vshrn.u32 d17, q1, #10 + vshrn.u32 d21, q1, #14 vmovn.u32 d19, q1 - vmovn.u32 d21, q14 + vshrn.u32 d1, q1, #10 - vshr.u32 q14, q2, #20 vshrn.u32 d22, q2, #10 - vmovn.u32 d24, q2 - vmovn.u32 d26, q14 + vmovn.u32 d2, q2 + vshrn.u32 d4, q2, #14 - vshr.u32 q14, q3, #20 - vshrn.u32 d23, q3, #10 - vmovn.u32 d25, q3 add r10, r0, #24 - vmovn.u32 d27, q14 + vshrn.u32 d23, q3, #10 + vmovn.u32 d3, q3 + vshrn.u32 d5, q3, #14 it eq addeq r4, r8 - vuzp.16 q8, q11 - vuzp.16 q9, q12 - vuzp.16 q10, q13 + vuzp.16 q0, q11 + vuzp.16 q9, q1 + vuzp.16 q10, q2 - @ q8 V0, V3,.. -> q0 + @ q0 V0, V3,.. @ q9 U0, U3... @ q10 U1, U4... @ q11 U2, U5,.. - @ q12 V1, V4,.. -> q1 - @ q13 V2, V5,.. -> q2 + @ q1 V1, V4, + @ q2 V2, V5,.. subs r6, #24 - vand q11, q15 - vand q9, q15 - vand q10, q15 - vand q0, q8, q15 - vand q1, q12, q15 - vand q2, q13, q15 + vbic.u16 q11, #0xfc00 + vbic.u16 q9, #0xfc00 + vshr.u16 q10, #6 + vshr.u16 q2, #6 + vbic.u16 q0, #0xfc00 + vbic.u16 q1, #0xfc00 blt 2f @@ -765,4 +755,171 @@ function ff_rpi_sand30_lines_to_planar_p010, export=1 endfunc +@ void ff_rpi_sand30_lines_to_planar_y8( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. 
However it does respect the dest size for wri + +function ff_rpi_sand30_lines_to_planar_y8, export=1 + push {r4-r8, lr} @ +24 + ldr r3, [sp, #24] + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 + lsl r3, #7 + sub r1, r1, r6 + add r8, r2, r7, lsl #7 + ldr r7, [sp, #40] + +10: + mov r2, r8 + add r4, r0, #24 + mov r5, r6 +1: + vldm r2, {q8-q15} + + subs r5, #96 + + vmovn.u32 d0, q8 + vshrn.u32 d2, q8, #12 + vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! + + add r2, r3 + + vmovn.u32 d1, q9 + vshrn.u32 d3, q9, #12 + vshrn.u32 d5, q9, #16 + + pld [r2, #0] + + vshrn.u16 d0, q0, #2 + vmovn.u16 d1, q1 + vshrn.u16 d2, q2, #6 + + vmovn.u32 d16, q10 + vshrn.u32 d18, q10, #12 + vshrn.u32 d20, q10, #16 + + vmovn.u32 d17, q11 + vshrn.u32 d19, q11, #12 + vshrn.u32 d21, q11, #16 + + pld [r2, #64] + + vshrn.u16 d4, q8, #2 + vmovn.u16 d5, q9 + vshrn.u16 d6, q10, #6 + + vmovn.u32 d16, q12 + vshrn.u32 d18, q12, #12 + vshrn.u32 d20, q12, #16 + + vmovn.u32 d17, q13 + vshrn.u32 d19, q13, #12 + vshrn.u32 d21, q13, #16 + + vshrn.u16 d16, q8, #2 + vmovn.u16 d17, q9 + vshrn.u16 d18, q10, #6 + + vmovn.u32 d20, q14 + vshrn.u32 d22, q14, #12 + vshrn.u32 d24, q14, #16 + + vmovn.u32 d21, q15 + vshrn.u32 d23, q15, #12 + vshrn.u32 d25, q15, #16 + + vshrn.u16 d20, q10, #2 + vmovn.u16 d21, q11 + vshrn.u16 d22, q12, #6 + + blt 2f + + vst3.8 {d0, d1, d2}, [r0], r12 + vst3.8 {d4, d5, d6}, [r4], r12 + vst3.8 {d16, d17, d18}, [r0], r12 + vst3.8 {d20, d21, d22}, [r4], r12 + + bne 1b + +11: + subs r7, #1 + add r0, r1 + add r8, #128 + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #48-96 + blt 1f + vst3.8 {d0, d1, d2}, [r0], r12 + vst3.8 {d4, d5, d6}, [r4], r12 + beq 11b + vmov q0, q8 + vmov q2, q10 + sub r5, #48 + vmov d2, d18 + vmov d6, d22 +1: + cmp r5, #24-96 + blt 1f + vst3.8 {d0, d1, d2}, [r0]! + beq 11b + vmov q0, q2 + sub r5, #24 + vmov d2, d6 +1: + cmp r5, #12-96 + blt 1f + vst3.8 {d0[0], d1[0], d2[0]}, [r0]! + vst3.8 {d0[1], d1[1], d2[1]}, [r0]! + vst3.8 {d0[2], d1[2], d2[2]}, [r0]! + vst3.8 {d0[3], d1[3], d2[3]}, [r0]! + beq 11b + vmov s0, s1 + sub r5, #12 + vmov s2, s3 + vmov s4, s5 +1: + cmp r5, #6-96 + blt 1f + vst3.8 {d0[0], d1[0], d2[0]}, [r0]! + vst3.8 {d0[1], d1[1], d2[1]}, [r0]! + add r0, #12 + beq 11b + vshr.u32 d0, #16 + sub r5, #6 + vshr.u32 d1, #16 + vshr.u32 d2, #16 +1: + cmp r5, #3-96 + blt 1f + vst3.8 {d0[0], d1[0], d2[0]}, [r0]! + beq 11b + sub r5, #3 + vshr.u32 d0, #8 + vshr.u32 d1, #8 +1: + cmp r5, #2-96 + blt 1f + vst2.8 {d0[0], d1[0]}, [r0]! + b 11b +1: + vst1.8 {d0[0]}, [r0]! 
+ b 11b + +endfunc + diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h index 447f367bea..d457c10870 100644 --- a/libavutil/arm/rpi_sand_neon.h +++ b/libavutil/arm/rpi_sand_neon.h @@ -95,5 +95,16 @@ void ff_rpi_sand30_lines_to_planar_p010( unsigned int _w, // [sp, #12] -> r6 (cur r5) unsigned int h); // [sp, #16] -> r7 +void ff_rpi_sand30_lines_to_planar_y8( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + #endif // AVUTIL_ARM_SAND_NEON_H diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c index 256c3d532f..b6071e2928 100644 --- a/libavutil/rpi_sand_fns.c +++ b/libavutil/rpi_sand_fns.c @@ -247,7 +247,7 @@ void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -#if HAVE_SAND_ASM && 0 +#if HAVE_SAND_ASM if (_x == 0) { ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); return; -- 2.43.0 From 97b84aff2923db1ed20f5386cf7a80c295c1078f Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 7 Jun 2022 14:46:12 +0000 Subject: [PATCH 052/157] v4l2_m2m_enc: Add the ability to encode DRM_PRIME frames --- libavcodec/v4l2_buffers.c | 100 +++++++++++--- libavcodec/v4l2_buffers.h | 20 ++- libavcodec/v4l2_context.c | 212 +++++++++++++++++++++++++--- libavcodec/v4l2_context.h | 15 +- libavcodec/v4l2_m2m.c | 37 +++-- libavcodec/v4l2_m2m.h | 3 + libavcodec/v4l2_m2m_dec.c | 171 ++++++----------------- libavcodec/v4l2_m2m_enc.c | 283 +++++++++++++++++++++++++++++++++++++- 8 files changed, 643 insertions(+), 198 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 8c4f18dbed..9ef2f40e39 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -29,6 +29,8 @@ #include #include #include "libavcodec/avcodec.h" +#include "libavcodec/internal.h" +#include "libavutil/avassert.h" #include "libavutil/pixdesc.h" #include "libavutil/hwcontext.h" #include "v4l2_context.h" @@ -60,27 +62,39 @@ static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) return tb.num && tb.den ? tb : v4l2_timebase; } +static inline struct timeval tv_from_int(const int64_t t) +{ + return (struct timeval){ + .tv_usec = t % USEC_PER_SEC, + .tv_sec = t / USEC_PER_SEC + }; +} + +static inline int64_t int_from_tv(const struct timeval t) +{ + return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; +} + static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) { /* convert pts to v4l2 timebase */ const int64_t v4l2_pts = - out->context->no_pts_rescale ? pts : pts == AV_NOPTS_VALUE ? 0 : av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); - out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; - out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; + out->buf.timestamp = tv_from_int(v4l2_pts); } static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) { + const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); + return v4l2_pts != 0 ? 
v4l2_pts : AV_NOPTS_VALUE; +#if 0 /* convert pts back to encoder timebase */ - const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + - avbuf->buf.timestamp.tv_usec; - return avbuf->context->no_pts_rescale ? v4l2_pts : v4l2_pts == 0 ? AV_NOPTS_VALUE : av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +#endif } static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) @@ -435,7 +449,7 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data) ff_mutex_lock(&ctx->lock); - avbuf->status = V4L2BUF_AVAILABLE; + ff_v4l2_buffer_set_avail(avbuf); if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); @@ -599,6 +613,38 @@ static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); } +static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) +{ + const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; + + if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) + return AVERROR(EINVAL); + + av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); + + if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { + // Only currently cope with single buffer types + if (out->buf.length != 1) + return AVERROR_PATCHWELCOME; + if (src->nb_objects != 1) + return AVERROR(EINVAL); + + out->planes[0].m.fd = src->objects[0].fd; + } + else { + if (src->nb_objects != 1) + return AVERROR(EINVAL); + + out->buf.m.fd = src->objects[0].fd; + } + + // No need to copy src AVDescriptor and if we did then we may confuse + // fd close on free + out->ref_buf = av_buffer_ref(frame->buf[0]); + + return 0; +} + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { int i; @@ -678,7 +724,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) * ******************************************************************************/ -int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) +int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) { out->buf.flags = frame->key_frame ? (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : @@ -688,10 +734,15 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); v4l2_set_color_range(out, frame->color_range); // PTS & interlace are buffer vars - v4l2_set_pts(out, frame->pts); + if (track_ts) + out->buf.timestamp = tv_from_int(track_ts); + else + v4l2_set_pts(out, frame->pts); v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - return v4l2_buffer_swframe_to_buf(frame, out); + return frame->format == AV_PIX_FMT_DRM_PRIME ? + v4l2_buffer_primeframe_to_buf(frame, out) : + v4l2_buffer_swframe_to_buf(frame, out); } int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) @@ -754,6 +805,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; + pkt->flags = 0; if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) pkt->flags |= AV_PKT_FLAG_KEY; @@ -768,8 +820,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) return 0; } -int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - const void *extdata, size_t extlen) +int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, + const void *extdata, size_t extlen, + const int64_t timestamp) { int ret; @@ -783,7 +836,10 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, if (ret && ret != AVERROR(ENOMEM)) return ret; - v4l2_set_pts(out, pkt->pts); + if (timestamp) + out->buf.timestamp = tv_from_int(timestamp); + else + v4l2_set_pts(out, pkt->pts); out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : @@ -794,7 +850,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) { - return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); + return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); } @@ -814,13 +870,15 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) close(avbuf->drm_frame.objects[i].fd); } + av_buffer_unref(&avbuf->ref_buf); + ff_weak_link_unref(&avbuf->context_wl); av_free(avbuf); } -int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) +int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) { int ret, i; V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); @@ -837,7 +895,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct } avbuf->context = ctx; - avbuf->buf.memory = V4L2_MEMORY_MMAP; + avbuf->buf.memory = mem; avbuf->buf.type = ctx->type; avbuf->buf.index = index; @@ -867,6 +925,8 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct avbuf->num_planes = 1; for (i = 0; i < avbuf->num_planes; i++) { + const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && + (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : @@ -875,21 +935,17 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || - !buf_to_m2mctx(avbuf)->output_drm) { + if (want_mmap) avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); - } } else { avbuf->plane_info[i].length = avbuf->buf.length; - if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || - !buf_to_m2mctx(avbuf)->output_drm) { + if (want_mmap) avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); - } } if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 3b7ca4d99e..1ac32c5989 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -59,6 +59,10 @@ typedef struct V4L2Buffer { /* DRM descriptor */ AVDRMFrameDescriptor drm_frame; + /* For DRM_PRIME encode - need to keep a ref to the source buffer till we + * are done + */ + AVBufferRef * ref_buf; /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { @@ -110,8 +114,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); */ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); -int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, - const void *extdata, size_t extlen); +int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, + const void *extdata, size_t extlen, + const int64_t timestamp); /** * Extracts the data from an AVFrame to a V4L2Buffer @@ -121,7 +126,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); +int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); /** * Initializes a V4L2Buffer @@ -131,7 +136,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); +int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); /** * Enqueues a V4L2Buffer @@ -142,5 +147,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context */ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); +static inline void +ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) +{ + avbuf->status = V4L2BUF_AVAILABLE; + av_buffer_unref(&avbuf->ref_buf); +} + #endif // AVCODEC_V4L2_BUFFERS_H diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index b3662aedaa..7a707d21fc 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -43,6 +43,160 @@ struct v4l2_format_update { int update_avfmt; }; + +static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) +{ + return (int64_t)n; +} + +static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) +{ + return (unsigned int)pts; +} + +// FFmpeg requires us to propagate a number of vars from the coded pkt into +// 
the decoded frame. The only thing that tracks like that in V4L2 stateful +// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no +// guarantees about PTS being unique or specified for every frame so replace +// the supplied PTS with a simple incrementing number and keep a circular +// buffer of all the things we want preserved (including the original PTS) +// indexed by the tracking no. +static int64_t +xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) +{ + int64_t track_pts; + + // Avoid 0 + if (++x->track_no == 0) + x->track_no = 1; + + track_pts = track_to_pts(avctx, x->track_no); + + av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); + x->last_pkt_dts = avpkt->dts; + x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, + .pending = 1, + .pkt_size = avpkt->size, + .pts = avpkt->pts, + .dts = avpkt->dts, + .reordered_opaque = avctx->reordered_opaque, + .pkt_pos = avpkt->pos, + .pkt_duration = avpkt->duration, + .track_pts = track_pts + }; + return track_pts; +} + +static int64_t +xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) +{ + int64_t track_pts; + + // Avoid 0 + if (++x->track_no == 0) + x->track_no = 1; + + track_pts = track_to_pts(avctx, x->track_no); + + av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); + x->last_pkt_dts = frame->pkt_dts; + x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, + .pending = 1, + .pkt_size = 0, + .pts = frame->pts, + .dts = AV_NOPTS_VALUE, + .reordered_opaque = frame->reordered_opaque, + .pkt_pos = frame->pkt_pos, + .pkt_duration = frame->pkt_duration, + .track_pts = track_pts + }; + return track_pts; +} + + +// Returns -1 if we should discard the frame +static int +xlat_pts_frame_out(AVCodecContext *const avctx, + xlat_track_t * const x, + AVFrame *const frame) +{ + unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; + V4L2m2mTrackEl *const t = x->track_els + n; + if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) + { + av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, + "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + frame->pts = AV_NOPTS_VALUE; + frame->pkt_dts = x->last_pkt_dts; + frame->reordered_opaque = x->last_opaque; + frame->pkt_pos = -1; + frame->pkt_duration = 0; + frame->pkt_size = -1; + } + else if (!t->discard) + { + frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; + frame->pkt_dts = x->last_pkt_dts; + frame->reordered_opaque = t->reordered_opaque; + frame->pkt_pos = t->pkt_pos; + frame->pkt_duration = t->pkt_duration; + frame->pkt_size = t->pkt_size; + + x->last_opaque = x->track_els[n].reordered_opaque; + if (frame->pts != AV_NOPTS_VALUE) + x->last_pts = frame->pts; + t->pending = 0; + } + else + { + av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + return -1; + } + + av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", + frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); + return 0; +} + +// Returns -1 if we should discard the frame +static int +xlat_pts_pkt_out(AVCodecContext *const avctx, + xlat_track_t * const x, + AVPacket *const pkt) +{ + unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; + V4L2m2mTrackEl *const t = x->track_els + n; + if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) + { + av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, + "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); + pkt->pts = AV_NOPTS_VALUE; + } + else if (!t->discard) + { + pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; + + x->last_opaque = x->track_els[n].reordered_opaque; + if (pkt->pts != AV_NOPTS_VALUE) + x->last_pts = pkt->pts; + t->pending = 0; + } + else + { + av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); + return -1; + } + + // * Would like something much better than this...xlat(offset + out_count)? + pkt->dts = pkt->pts; + av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", + pkt->pts, t->track_pts, n); + return 0; +} + + static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) { return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
@@ -353,12 +507,14 @@ dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) atomic_fetch_sub(&ctx->q_count, 1); avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; - avbuf->status = V4L2BUF_AVAILABLE; + ff_v4l2_buffer_set_avail(avbuf); avbuf->buf = buf; if (is_mp) { memcpy(avbuf->planes, planes, sizeof(planes)); avbuf->buf.m.planes = avbuf->planes; } + // Done with any attached buffer + av_buffer_unref(&avbuf->ref_buf); if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { // Zero length cap buffer return == EOS @@ -733,7 +889,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx) for (i = 0; i < ctx->num_buffers; ++i) { struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; if (buf->status == V4L2BUF_IN_DRIVER) - buf->status = V4L2BUF_AVAILABLE; + ff_v4l2_buffer_set_avail(buf); } atomic_store(&ctx->q_count, 0); } @@ -787,6 +943,8 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) { if (cmd == VIDIOC_STREAMOFF) flush_all_buffers_status(ctx); + else + ctx->first_buf = 1; ctx->streamon = (cmd == VIDIOC_STREAMON); av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, @@ -803,14 +961,16 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2m2mContext *const s = ctx_to_m2mctx(ctx); + AVCodecContext *const avctx = s->avctx; + int64_t track_ts; V4L2Buffer* avbuf; int ret; if (!frame) { ret = v4l2_stop_encode(ctx); if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); + av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name); s->draining= 1; return 0; } @@ -819,7 +979,9 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) if (!avbuf) return AVERROR(EAGAIN); - ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); + track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); + + ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); if (ret) return ret; @@ -830,14 +992,16 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * extdata, size_t extlen) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); + AVCodecContext *const avctx = s->avctx; V4L2Buffer* avbuf; int ret; + int64_t track_ts; if (!pkt->size) { ret = v4l2_stop_decode(ctx); // Log but otherwise ignore stop failure if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); + av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); s->draining = 1; return 0; } @@ -846,7 +1010,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, if (!avbuf) return AVERROR(EAGAIN); - ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); + track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); + + ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts); if (ret == AVERROR(ENOMEM)) av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", __func__, pkt->size, avbuf->planes[0].length); @@ -858,24 +1024,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); + AVCodecContext *const avctx = s->avctx; V4L2Buffer *avbuf; int rv; - if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) - return rv; + do { + if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) + return rv; + if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, 
avbuf)) != 0) + return rv; + } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); + return 0; } int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); + AVCodecContext *const avctx = s->avctx; V4L2Buffer *avbuf; int rv; - if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) - return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC + do { + if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) + return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC + if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) + return rv; + } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); - return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); + return 0; } int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) @@ -951,7 +1129,7 @@ void ff_v4l2_context_release(V4L2Context* ctx) } -static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) +static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) { V4L2m2mContext * const s = ctx_to_m2mctx(ctx); struct v4l2_requestbuffers req; @@ -962,7 +1140,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers memset(&req, 0, sizeof(req)); req.count = req_buffers; - req.memory = V4L2_MEMORY_MMAP; + req.memory = mem; req.type = ctx->type; while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { if (errno != EINTR) { @@ -986,7 +1164,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers } for (i = 0; i < ctx->num_buffers; i++) { - ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); + ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); if (ret) { av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); goto fail_release; @@ -1052,7 +1230,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) goto fail_unref_hwframes; } - ret = create_buffers(ctx, ctx->num_buffers); + ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); if (ret < 0) goto fail_unref_hwframes; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 0efff58f18..21265f1bd7 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -91,11 +91,19 @@ typedef struct V4L2Context { */ int num_buffers; + /** + * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF + */ + enum v4l2_memory buf_mem; + /** * Whether the stream has been started (VIDIOC_STREAMON has been sent). */ int streamon; + /* 1st buffer after stream on */ + int first_buf; + /** * Either no more buffers available or an unrecoverable error was notified * by the V4L2 kernel driver: once set the context has to be exited. 
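A minimal sketch, assuming a multiplanar queue and a single dma-buf object per
frame, of what selecting V4L2_MEMORY_DMABUF for buf_mem changes at the ioctl
level: the output queue hands the DRM PRIME fd to the driver instead of
mmap()ing a buffer and copying pixels into it. The helper below is illustrative
only (its name, parameters and the omitted error handling are not from this
patch):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/videodev2.h>

    static int queue_dmabuf(int vfd, unsigned int index,
                            int dmabuf_fd, unsigned int bytesused)
    {
        struct v4l2_plane plane;
        struct v4l2_buffer buf;

        memset(&plane, 0, sizeof(plane));
        memset(&buf, 0, sizeof(buf));

        plane.m.fd      = dmabuf_fd;        /* DRM PRIME object fd */
        plane.bytesused = bytesused;

        buf.type     = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
        buf.memory   = V4L2_MEMORY_DMABUF;  /* no mmap + memcpy of pixel data */
        buf.index    = index;
        buf.length   = 1;                   /* single-object formats only */
        buf.m.planes = &plane;

        return ioctl(vfd, VIDIOC_QBUF, &buf);
    }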
@@ -105,11 +113,10 @@ typedef struct V4L2Context { int flag_last; /** - * PTS rescale not wanted - * If the PTS is just a dummy frame count then rescale is - * actively harmful + * If NZ then when Qing frame/pkt use this rather than the + * "real" PTS */ - int no_pts_rescale; + uint64_t track_ts; AVBufferRef *frames_ref; atomic_int q_count; diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index 6dd01e2e00..1e30d15fd8 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -35,6 +35,14 @@ #include "v4l2_fmt.h" #include "v4l2_m2m.h" +static void +xlat_init(xlat_track_t * const x) +{ + memset(x, 0, sizeof(*x)); + x->last_pts = AV_NOPTS_VALUE; +} + + static inline int v4l2_splane_video(struct v4l2_capability *cap) { if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) && @@ -67,7 +75,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) s->capture.done = s->output.done = 0; s->capture.name = "capture"; + s->capture.buf_mem = V4L2_MEMORY_MMAP; s->output.name = "output"; + s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; atomic_init(&s->refcount, 0); sem_init(&s->refsync, 0, 0); @@ -334,35 +344,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) return v4l2_configure_contexts(s); } -int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) +int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) { - *s = av_mallocz(sizeof(V4L2m2mContext)); - if (!*s) + V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); + + *pps = NULL; + if (!s) return AVERROR(ENOMEM); - priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), + priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), &v4l2_m2m_destroy_context, NULL, 0); if (!priv->context_ref) { - av_freep(s); + av_free(s); return AVERROR(ENOMEM); } /* assign the context */ - priv->context = *s; - (*s)->priv = priv; + priv->context = s; + s->priv = priv; /* populate it */ - priv->context->capture.num_buffers = priv->num_capture_buffers; - priv->context->output.num_buffers = priv->num_output_buffers; - priv->context->self_ref = priv->context_ref; - priv->context->fd = -1; + s->capture.num_buffers = priv->num_capture_buffers; + s->output.num_buffers = priv->num_output_buffers; + s->self_ref = priv->context_ref; + s->fd = -1; + xlat_init(&s->xlat); priv->context->frame = av_frame_alloc(); if (!priv->context->frame) { av_buffer_unref(&priv->context_ref); - *s = NULL; /* freed when unreferencing context_ref */ return AVERROR(ENOMEM); } + *pps = s; return 0; } diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 19d618698d..d6cdaf65e1 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -103,6 +103,9 @@ typedef struct V4L2m2mContext { /* generate DRM frames */ int output_drm; + /* input frames are drmprime */ + int input_drm; + /* Frame tracking */ xlat_track_t xlat; int pending_hw; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 7e17044706..fbbfc81342 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -169,96 +169,17 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } -static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) -{ - return (int64_t)n; -} - -static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) -{ - return (unsigned int)pts; -} - -// FFmpeg requires us to propagate a number of vars from the coded pkt into -// the decoded frame. 
The only thing that tracks like that in V4L2 stateful -// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -// guarantees about PTS being unique or specified for every frame so replace -// the supplied PTS with a simple incrementing number and keep a circular -// buffer of all the things we want preserved (including the original PTS) -// indexed by the tracking no. static void -xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) -{ - int64_t track_pts; - - // Avoid 0 - if (++x->track_no == 0) - x->track_no = 1; - - track_pts = track_to_pts(avctx, x->track_no); - - av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); - x->last_pkt_dts = avpkt->dts; - x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ - .discard = 0, - .pending = 1, - .pkt_size = avpkt->size, - .pts = avpkt->pts, - .dts = avpkt->dts, - .reordered_opaque = avctx->reordered_opaque, - .pkt_pos = avpkt->pos, - .pkt_duration = avpkt->duration, - .track_pts = track_pts - }; - avpkt->pts = track_pts; -} - -// Returns -1 if we should discard the frame -static int -xlat_pts_out(AVCodecContext *const avctx, - xlat_track_t * const x, +set_best_effort_pts(AVCodecContext *const avctx, pts_stats_t * const ps, AVFrame *const frame) { - unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; - V4L2m2mTrackEl *const t = x->track_els + n; - if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) - { - av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); - frame->pts = AV_NOPTS_VALUE; - frame->pkt_dts = x->last_pkt_dts; - frame->reordered_opaque = x->last_opaque; - frame->pkt_pos = -1; - frame->pkt_duration = 0; - frame->pkt_size = -1; - } - else if (!t->discard) - { - frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; - frame->pkt_dts = x->last_pkt_dts; - frame->reordered_opaque = t->reordered_opaque; - frame->pkt_pos = t->pkt_pos; - frame->pkt_duration = t->pkt_duration; - frame->pkt_size = t->pkt_size; - - x->last_opaque = x->track_els[n].reordered_opaque; - if (frame->pts != AV_NOPTS_VALUE) - x->last_pts = frame->pts; - t->pending = 0; - } - else - { - av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); - return -1; - } - pts_stats_add(ps, frame->pts); frame->best_effort_timestamp = pts_stats_guess(ps); frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
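    // (A stateful V4L2 decoder returns frames in display order, so the
    //  coded-order DTS of the packet that produced this frame would not line
    //  up with it anyway.)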
- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", - frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); - return 0; + av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", + frame->pts, frame->best_effort_timestamp, frame->pkt_dts); } static void @@ -272,13 +193,6 @@ xlat_flush(xlat_track_t * const x) x->last_pts = AV_NOPTS_VALUE; } -static void -xlat_init(xlat_track_t * const x) -{ - memset(x, 0, sizeof(*x)); - x->last_pts = AV_NOPTS_VALUE; -} - static int xlat_pending(const xlat_track_t * const x) { @@ -419,8 +333,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; } - - xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); } if (s->draining) { @@ -542,49 +454,47 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) prefer_dq ? 5 : src_rv == NQ_Q_FULL ? -1 : 0; - do { - // Dequeue frame will unref any previous contents of frame - // if it returns success so we don't need an explicit unref - // when discarding - // This returns AVERROR(EAGAIN) on timeout or if - // there is room in the input Q and timeout == -1 - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); - - // Failure due to no buffer in Q? - if (dst_rv == AVERROR(ENOSPC)) { - // Wait & retry - if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { - dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); - } + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref + // when discarding + // This returns AVERROR(EAGAIN) on timeout or if + // there is room in the input Q and timeout == -1 + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + + // Failure due to no buffer in Q? 
+ if (dst_rv == AVERROR(ENOSPC)) { + // Wait & retry + if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); } + } + + // Adjust dynamic pending threshold + if (dst_rv == 0) { + if (--s->pending_hw < PENDING_HW_MIN) + s->pending_hw = PENDING_HW_MIN; + s->pending_n = 0; - // Adjust dynamic pending threshold - if (dst_rv == 0) { - if (--s->pending_hw < PENDING_HW_MIN) - s->pending_hw = PENDING_HW_MIN; + set_best_effort_pts(avctx, &s->pts_stat, frame); + } + else if (dst_rv == AVERROR(EAGAIN)) { + if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { + s->pending_hw = pending * 16 + PENDING_HW_OFFSET; s->pending_n = 0; } - else if (dst_rv == AVERROR(EAGAIN)) { - if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { - s->pending_hw = pending * 16 + PENDING_HW_OFFSET; - s->pending_n = 0; - } - } + } - if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { - av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); - dst_rv = AVERROR_EOF; - s->capture.done = 1; - } - else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) - av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", - s->draining, s->capture.done); - else if (dst_rv && dst_rv != AVERROR(EAGAIN)) - av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", - s->draining, s->capture.done, dst_rv); - - // Go again if we got a frame that we need to discard - } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); + dst_rv = AVERROR_EOF; + s->capture.done = 1; + } + else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", + s->draining, s->capture.done); + else if (dst_rv && dst_rv != AVERROR(EAGAIN)) + av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", + s->draining, s->capture.done, dst_rv); } ++i; @@ -791,7 +701,6 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) if (ret < 0) return ret; - xlat_init(&s->xlat); pts_stats_init(&s->pts_stat, avctx, "decoder"); s->pending_hw = PENDING_HW_MIN; @@ -810,12 +719,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; output->min_buf_size = max_coded_size(avctx); - output->no_pts_rescale = 1; capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; capture->min_buf_size = 0; - capture->no_pts_rescale = 1; /* the client requests the codec to generate DRM frames: * - data[0] will therefore point to the returned AVDRMFrameDescriptor diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index 9a0837ecf3..05ff6ba726 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -24,6 +24,8 @@ #include #include #include +#include + #include "encode.h" #include "libavcodec/avcodec.h" #include "libavutil/pixdesc.h" @@ -38,6 +40,34 @@ #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x +// P030 should be defined in drm_fourcc.h and hopefully will be sometime +// in the future but until then... 
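+// (For reference: P030 is 4:2:0 semi-planar with three 10-bit samples packed
+//  per 32-bit word, while NV15 and NV20 are the fully packed 10-bit 4:2:0 and
+//  4:2:2 semi-planar variants.)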
+#ifndef DRM_FORMAT_P030 +#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') +#endif + +#ifndef DRM_FORMAT_NV15 +#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') +#endif + +#ifndef DRM_FORMAT_NV20 +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') +#endif + +#ifndef V4L2_CID_CODEC_BASE +#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE +#endif + +// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined +// in videodev2.h hopefully will be sometime in the future but until then... +#ifndef V4L2_PIX_FMT_NV12_10_COL128 +#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') +#endif + +#ifndef V4L2_PIX_FMT_NV12_COL128 +#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ +#endif + static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) { struct v4l2_streamparm parm = { 0 }; @@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p) static int v4l2_check_b_frame_support(V4L2m2mContext *s) { if (s->avctx->max_b_frames) - av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); + av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); - v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); + v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); if (s->avctx->max_b_frames == 0) return 0; avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); - return AVERROR_PATCHWELCOME; } @@ -271,13 +300,184 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s) return 0; } +static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) +{ + const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; + + const uint32_t drm_fmt = src->layers[0].format; + // Treat INVALID as LINEAR + const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; + uint32_t pix_fmt = 0; + uint32_t w = 0; + uint32_t h = 0; + uint32_t bpl = src->layers[0].planes[0].pitch; + + // We really don't expect multiple layers + // All formats that we currently cope with are single object + + if (src->nb_layers != 1 || src->nb_objects != 1) + return AVERROR(EINVAL); + + switch (drm_fmt) { + case DRM_FORMAT_YUV420: + if (mod == DRM_FORMAT_MOD_LINEAR) { + if (src->layers[0].nb_planes != 3) + break; + pix_fmt = V4L2_PIX_FMT_YUV420; + h = src->layers[0].planes[1].offset / bpl; + w = bpl; + } + break; + + case DRM_FORMAT_NV12: + if (mod == DRM_FORMAT_MOD_LINEAR) { + if (src->layers[0].nb_planes != 2) + break; + pix_fmt = V4L2_PIX_FMT_NV12; + h = src->layers[0].planes[1].offset / bpl; + w = bpl; + } + else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { + if (src->layers[0].nb_planes != 2) + break; + pix_fmt = V4L2_PIX_FMT_NV12_COL128; + w = bpl; + h = src->layers[0].planes[1].offset / 128; + bpl = fourcc_mod_broadcom_param(mod); + } + break; + + case DRM_FORMAT_P030: + if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { + if (src->layers[0].nb_planes != 2) + break; + pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; + w = bpl / 2; // Matching lie to how we construct this + h = src->layers[0].planes[1].offset / 128; + bpl = fourcc_mod_broadcom_param(mod); + } + break; + + default: + break; + } + + if (!pix_fmt) + return AVERROR(EINVAL); + + if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { + struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; + + pix->width = w; + pix->height = h; + pix->pixelformat = pix_fmt; + pix->plane_fmt[0].bytesperline = bpl; + pix->num_planes = 1; + } + else { + struct v4l2_pix_format *const pix = &format->fmt.pix; + + pix->width = w; + pix->height = h; + pix->pixelformat = pix_fmt; + pix->bytesperline = bpl; + } + + return 0; +} + +// Do we have similar enough formats to be usable? 
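+// (i.e. same pixelformat, same number of planes and same bytesperline on
+//  every plane; width and height are not compared here.)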
+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) +{ + if (a->type != b->type) + return 0; + + if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { + const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; + const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; + unsigned int i; + if (pa->pixelformat != pb->pixelformat || + pa->num_planes != pb->num_planes) + return 0; + for (i = 0; i != pa->num_planes; ++i) { + if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) + return 0; + } + } + else { + const struct v4l2_pix_format *const pa = &a->fmt.pix; + const struct v4l2_pix_format *const pb = &b->fmt.pix; + if (pa->pixelformat != pb->pixelformat || + pa->bytesperline != pb->bytesperline) + return 0; + } + return 1; +} + + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) { V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; V4L2Context *const output = &s->output; + // Signal EOF if needed + if (!frame) { + return ff_v4l2_context_enqueue_frame(output, frame); + } + + if (s->input_drm && !output->streamon) { + int rv; + struct v4l2_format req_format = {.type = output->format.type}; + + // Set format when we first get a buffer + if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); + return rv; + } + + ff_v4l2_context_release(output); + + output->format = req_format; + + if ((rv = ff_v4l2_context_set_format(output)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); + return rv; + } + + if (!fmt_eq(&req_format, &output->format)) { + av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); + return AVERROR(EINVAL); + } + + output->selection.top = frame->crop_top; + output->selection.left = frame->crop_left; + output->selection.width = av_frame_cropped_width(frame); + output->selection.height = av_frame_cropped_height(frame); + + if ((rv = ff_v4l2_context_init(output)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); + return rv; + } + + { + struct v4l2_selection selection = { + .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, + .target = V4L2_SEL_TGT_CROP, + .r = output->selection + }; + if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { + av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", + selection.r.width, selection.r.height, selection.r.left, selection.r.top, + av_err2str(AVERROR(errno))); + } + av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", + selection.r.width, selection.r.height, selection.r.left, selection.r.top); + } + } + #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME - if (frame && frame->pict_type == AV_PICTURE_TYPE_I) + if (frame->pict_type == AV_PICTURE_TYPE_I) v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); #endif @@ -328,7 +528,70 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) } dequeue: - return ff_v4l2_context_dequeue_packet(capture, avpkt); + if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) + return ret; + + if (capture->first_buf == 1) { + uint8_t * data; + const int len = avpkt->size; + + // 1st buffer after streamon should be SPS/PPS + capture->first_buf = 2; + + // Clear both possible stores so there is no chance of confusion + av_freep(&s->extdata_data); + s->extdata_size = 0; + av_freep(&avctx->extradata); + avctx->extradata_size = 0; + + if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) + memcpy(data, avpkt->data, len); + + 
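+        // This SPS/PPS packet is never returned to the caller: its payload has
+        // just been copied out above (when the alloc succeeded), so drop it
+        // before dequeuing the first real frame.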
av_packet_unref(avpkt); + + if (data == NULL) + return AVERROR(ENOMEM); + + // We need to copy the header, but keep local if not global + if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { + avctx->extradata = data; + avctx->extradata_size = len; + } + else { + s->extdata_data = data; + s->extdata_size = len; + } + + if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) + return ret; + } + + // First frame must be key so mark as such even if encoder forgot + if (capture->first_buf == 2) + avpkt->flags |= AV_PKT_FLAG_KEY; + + // Add SPS/PPS to the start of every key frame if non-global headers + if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { + const size_t newlen = s->extdata_size + avpkt->size; + AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); + + if (buf == NULL) { + av_packet_unref(avpkt); + return AVERROR(ENOMEM); + } + + memcpy(buf->data, s->extdata_data, s->extdata_size); + memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); + + av_buffer_unref(&avpkt->buf); + avpkt->buf = buf; + avpkt->data = buf->data; + avpkt->size = newlen; + } + +// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); + capture->first_buf = 0; + return 0; } static av_cold int v4l2_encode_init(AVCodecContext *avctx) @@ -340,6 +603,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) uint32_t v4l2_fmt_output; int ret; + av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt); + ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; @@ -347,13 +612,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) capture = &s->capture; output = &s->output; + s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); + /* common settings output/capture */ output->height = capture->height = avctx->height; output->width = capture->width = avctx->width; /* output context */ output->av_codec_id = AV_CODEC_ID_RAWVIDEO; - output->av_pix_fmt = avctx->pix_fmt; + output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : + avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt : + AV_PIX_FMT_YUV420P; /* capture context */ capture->av_codec_id = avctx->codec_id; @@ -372,7 +641,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) v4l2_fmt_output = output->format.fmt.pix.pixelformat; pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); - if (pix_fmt_output != avctx->pix_fmt) { + if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); return AVERROR(EINVAL); -- 2.43.0 From 017190149841c7bb96aee2550454176becdf1218 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 8 Jun 2022 16:13:31 +0000 Subject: [PATCH 053/157] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is always NO_PTS If we do have DTS but don't have PTS then assume PTS=DTS. Also get rid of last_dts from tracking as its info wasn't actually useful in any way. 
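The resulting order of preference, as a rough sketch of the logic rather than a
literal copy of it:

    // 1. Guess from the PTS values tracked so far.
    // 2. If that yields nothing (a DTS-only stream), fall back to the DTS
    //    recorded for the packet that produced this frame.
    int64_t best = pts_stats_guess(ps);
    if (best == AV_NOPTS_VALUE)
        best = frame->pkt_dts;              // t->dts from the tracking table
    frame->best_effort_timestamp = best;

So a container that never sets PTS now yields the packet DTS as the best effort
timestamp instead of AV_NOPTS_VALUE throughout.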
--- libavcodec/v4l2_context.c | 6 ++---- libavcodec/v4l2_m2m.h | 1 - libavcodec/v4l2_m2m_dec.c | 8 +++++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 7a707d21fc..6b97eab41e 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -73,7 +73,6 @@ xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPack track_pts = track_to_pts(avctx, x->track_no); av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); - x->last_pkt_dts = avpkt->dts; x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ .discard = 0, .pending = 1, @@ -100,7 +99,6 @@ xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFr track_pts = track_to_pts(avctx, x->track_no); av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); - x->last_pkt_dts = frame->pkt_dts; x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ .discard = 0, .pending = 1, @@ -129,7 +127,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx, av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); frame->pts = AV_NOPTS_VALUE; - frame->pkt_dts = x->last_pkt_dts; + frame->pkt_dts = AV_NOPTS_VALUE; frame->reordered_opaque = x->last_opaque; frame->pkt_pos = -1; frame->pkt_duration = 0; @@ -138,7 +136,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx, else if (!t->discard) { frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; - frame->pkt_dts = x->last_pkt_dts; + frame->pkt_dts = t->dts; frame->reordered_opaque = t->reordered_opaque; frame->pkt_pos = t->pkt_pos; frame->pkt_duration = t->pkt_duration; diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index d6cdaf65e1..ee72beb052 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -67,7 +67,6 @@ typedef struct pts_stats_s typedef struct xlat_track_s { unsigned int track_no; int64_t last_pts; - int64_t last_pkt_dts; int64_t last_opaque; V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; } xlat_track_t; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index fbbfc81342..485a96f4b4 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -177,7 +177,13 @@ set_best_effort_pts(AVCodecContext *const avctx, pts_stats_add(ps, frame->pts); frame->best_effort_timestamp = pts_stats_guess(ps); - frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? + // If we can't guess from just PTS - try DTS + if (frame->best_effort_timestamp == AV_NOPTS_VALUE) + frame->best_effort_timestamp = frame->pkt_dts; + + // We can't emulate what s/w does in a useful manner and using the + // "correct" answer seems to just confuse things. + frame->pkt_dts = frame->pts; av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); } -- 2.43.0 From 941b9086238323dd0e239b10a8b8e108953b1289 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 30 Jun 2022 15:59:23 +0000 Subject: [PATCH 054/157] v4l2: Update H265 request for current API This works with v9 of the H265 patch set which hopefully will be the last one. 
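For reference, this is roughly how a stateless decoder hands one of these
controls to the driver per request. A sketch only: the helper name, includes
and the origin of the fds are assumptions; the control ID and struct come from
the header added below.

    #include <sys/ioctl.h>
    #include <linux/videodev2.h>
    #include "hevc-ctrls-v4.h"

    static int set_sps_on_request(int video_fd, int request_fd,
                                  const struct v4l2_ctrl_hevc_sps *sps)
    {
        struct v4l2_ext_control ctrl = {
            .id   = V4L2_CID_STATELESS_HEVC_SPS,
            .size = sizeof(*sps),
            .ptr  = (void *)sps,
        };
        struct v4l2_ext_controls ctrls = {
            .which      = V4L2_CTRL_WHICH_REQUEST_VAL,
            .request_fd = request_fd,   /* from MEDIA_IOC_REQUEST_ALLOC */
            .count      = 1,
            .controls   = &ctrl,
        };
        return ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &ctrls);
    }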
Hevc controls extracted from patched v4l2-controls into hevc-ctrls-v4 - if HEVC controls found in the system v4l2-controls then those will be used instead. --- libavcodec/Makefile | 2 +- libavcodec/hevc-ctrls-v4.h | 515 +++++++++++++++++++++++++++++++++ libavcodec/v4l2_req_hevc_v4.c | 3 + libavcodec/v4l2_req_hevc_vx.c | 81 ++++-- libavcodec/v4l2_request_hevc.c | 6 +- libavcodec/v4l2_request_hevc.h | 1 + 6 files changed, 583 insertions(+), 25 deletions(-) create mode 100644 libavcodec/hevc-ctrls-v4.h create mode 100644 libavcodec/v4l2_req_hevc_v4.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 2b3c16185d..d433a71236 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ - v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o + v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h new file mode 100644 index 0000000000..7e05f6e7c3 --- /dev/null +++ b/libavcodec/hevc-ctrls-v4.h @@ -0,0 +1,515 @@ +/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * Video for Linux Two controls header file + * + * Copyright (C) 1999-2012 the contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Alternatively you can redistribute this file under the terms of the + * BSD license as stated below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The contents of this header was split off from videodev2.h. All control + * definitions should be added to this header, which is included by + * videodev2.h. + */ + +#ifndef AVCODEC_HEVC_CTRLS_V4_H +#define AVCODEC_HEVC_CTRLS_V4_H + +#include +#include + +#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) +#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) +#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) +#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) +#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) +#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) +#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) +#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) + +enum v4l2_stateless_hevc_decode_mode { + V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_stateless_hevc_start_code { + V4L2_STATELESS_HEVC_START_CODE_NONE, + V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/** + * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set + * + * @video_parameter_set_id: specifies the value of the + * vps_video_parameter_set_id of the active VPS + * @seq_parameter_set_id: provides an identifier for the SPS for + * reference by other syntax elements + * @pic_width_in_luma_samples: specifies the width of each decoded picture + * in units of luma samples + * @pic_height_in_luma_samples: specifies the height of each decoded picture + * in units of luma samples + * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the + * samples of the luma array + * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the + * samples of the chroma arrays + * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of + * the variable MaxPicOrderCntLsb + * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum + * required size of the decoded picture + * buffer for the codec video sequence + * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures + * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the + * value of SpsMaxLatencyPictures array + * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum + * luma coding block size + * @log2_diff_max_min_luma_coding_block_size: specifies the difference between + * the maximum and minimum luma + * coding block size + * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma + * transform block size + * @log2_diff_max_min_luma_transform_block_size: specifies the difference between + * the maximum and minimum luma + * transform block size + * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy + * depth for transform units of + * coding units coded in inter + * prediction mode + * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy + * depth for transform units of + * coding units coded in intra + * prediction mode + * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of + * bits used to represent each of PCM sample + * values of the luma component + * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number + * of bits used to represent each of PCM + * sample values of the chroma components + * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the + * minimum size of coding blocks + * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between + * the maximum and minimum size of + * coding blocks + * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() + * syntax structures included in the SPS + * @num_long_term_ref_pics_sps: specifies the number of candidate long-term + * reference pictures that are specified in the SPS + * @chroma_format_idc: specifies the chroma sampling + * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number + * of temporal sub-layers + * @reserved: padding field. Should be zeroed by applications. 
+ * @flags: see V4L2_HEVC_SPS_FLAG_{} + */ +struct v4l2_ctrl_hevc_sps { + __u8 video_parameter_set_id; + __u8 seq_parameter_set_id; + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u8 reserved[6]; + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) +#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) + +/** + * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set + * + * @pic_parameter_set_id: identifies the PPS for reference by other + * syntax elements + * @num_extra_slice_header_bits: specifies the number of extra slice header + * bits that are present in the slice header RBSP + * for coded pictures referring to the PPS. 
+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the + * inferred value of num_ref_idx_l0_active_minus1 + * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the + * inferred value of num_ref_idx_l1_active_minus1 + * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for + * each slice referring to the PPS + * @diff_cu_qp_delta_depth: specifies the difference between the luma coding + * tree block size and the minimum luma coding block + * size of coding units that convey cu_qp_delta_abs + * and cu_qp_delta_sign_flag + * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb + * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr + * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns + * partitioning the picture + * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning + * the picture + * @column_width_minus1: this value plus 1 specifies the width of the each tile column in + * units of coding tree blocks + * @row_height_minus1: this value plus 1 specifies the height of the each tile row in + * units of coding tree blocks + * @pps_beta_offset_div2: specify the default deblocking parameter offsets for + * beta divided by 2 + * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC + * divided by 2 + * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of + * the variable Log2ParMrgLevel + * @reserved: padding field. Should be zeroed by applications. + * @flags: see V4L2_HEVC_PPS_FLAG_{} + */ +struct v4l2_ctrl_hevc_pps { + __u8 pic_parameter_set_id; + __u8 num_extra_slice_header_bits; + __u8 num_ref_idx_l0_default_active_minus1; + __u8 num_ref_idx_l1_default_active_minus1; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + __u8 reserved; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 + +#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 +#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 +#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +/** + * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry + * + * @timestamp: timestamp of the V4L2 capture buffer to use as reference. + * @flags: long term flag for the reference frame + * @field_pic: whether the reference is a field picture or a frame. + * @reserved: padding field. Should be zeroed by applications. + * @pic_order_cnt_val: the picture order count of the current picture. 
+ */ +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 flags; + __u8 field_pic; + __u16 reserved; + __s32 pic_order_cnt_val; +}; + +/** + * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters + * + * @delta_luma_weight_l0: the difference of the weighting factor applied + * to the luma prediction value for list 0 + * @luma_offset_l0: the additive offset applied to the luma prediction value + * for list 0 + * @delta_chroma_weight_l0: the difference of the weighting factor applied + * to the chroma prediction values for list 0 + * @chroma_offset_l0: the difference of the additive offset applied to + * the chroma prediction values for list 0 + * @delta_luma_weight_l1: the difference of the weighting factor applied + * to the luma prediction value for list 1 + * @luma_offset_l1: the additive offset applied to the luma prediction value + * for list 1 + * @delta_chroma_weight_l1: the difference of the weighting factor applied + * to the chroma prediction values for list 1 + * @chroma_offset_l1: the difference of the additive offset applied to + * the chroma prediction values for list 1 + * @luma_log2_weight_denom: the base 2 logarithm of the denominator for + * all luma weighting factors + * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm + * of the denominator for all chroma + * weighting factors + */ +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +/** + * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters + * + * This control is a dynamically sized 1-dimensional array, + * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. + * + * @bit_size: size (in bits) of the current slice data + * @data_byte_offset: offset (in bytes) to the video data in the current slice data + * @num_entry_point_offsets: specifies the number of entry point offset syntax + * elements in the slice header. 
+ * @nal_unit_type: specifies the coding type of the slice (B, P or I) + * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit + * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} + * @colour_plane_id: specifies the colour plane associated with the current slice + * @slice_pic_order_cnt: specifies the picture order count + * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum + * reference index for reference picture list 0 + * that may be used to decode the slice + * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum + * reference index for reference picture list 1 + * that may be used to decode the slice + * @collocated_ref_idx: specifies the reference index of the collocated picture used + * for temporal motion vector prediction + * @five_minus_max_num_merge_cand: specifies the maximum number of merging + * motion vector prediction candidates supported in + * the slice subtracted from 5 + * @slice_qp_delta: specifies the initial value of QpY to be used for the coding + * blocks in the slice + * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset + * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset + * @slice_act_y_qp_offset: screen content extension parameters + * @slice_act_cb_qp_offset: screen content extension parameters + * @slice_act_cr_qp_offset: screen content extension parameters + * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 + * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 + * @pic_struct: indicates whether a picture should be displayed as a frame or as one or + * more fields + * @reserved0: padding field. Should be zeroed by applications. + * @slice_segment_addr: specifies the address of the first coding tree block in + * the slice segment + * @ref_idx_l0: the list of L0 reference elements as indices in the DPB + * @ref_idx_l1: the list of L1 reference elements as indices in the DPB + * @short_term_ref_pic_set_size: specifies the size of short-term reference + * pictures set included in the SPS + * @long_term_ref_pic_set_size: specifies the size of long-term reference + * pictures set include in the SPS + * @pred_weight_table: the prediction weight coefficients for inter-picture + * prediction + * @reserved1: padding field. Should be zeroed by applications. + * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} + */ +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_byte_offset; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __s32 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + __u8 reserved0[3]; + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + __u32 slice_segment_addr; + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u16 short_term_ref_pic_set_size; + __u16 long_term_ref_pic_set_size; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u8 reserved1[2]; + __u64 flags; +}; + +#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 +#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 +#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 + +/** + * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters + * + * @pic_order_cnt_val: picture order count + * @short_term_ref_pic_set_size: specifies the size of short-term reference + * pictures set included in the SPS of the first slice + * @long_term_ref_pic_set_size: specifies the size of long-term reference + * pictures set include in the SPS of the first slice + * @num_active_dpb_entries: the number of entries in dpb + * @num_poc_st_curr_before: the number of reference pictures in the short-term + * set that come before the current frame + * @num_poc_st_curr_after: the number of reference pictures in the short-term + * set that come after the current frame + * @num_poc_lt_curr: the number of reference pictures in the long-term set + * @poc_st_curr_before: provides the index of the short term before references + * in DPB array + * @poc_st_curr_after: provides the index of the short term after references + * in DPB array + * @poc_lt_curr: provides the index of the long term references in DPB array + * @reserved: padding field. Should be zeroed by applications. + * @dpb: the decoded picture buffer, for meta-data about reference frames + * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} + */ +struct v4l2_ctrl_hevc_decode_params { + __s32 pic_order_cnt_val; + __u16 short_term_ref_pic_set_size; + __u16 long_term_ref_pic_set_size; + __u8 num_active_dpb_entries; + __u8 num_poc_st_curr_before; + __u8 num_poc_st_curr_after; + __u8 num_poc_lt_curr; + __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 reserved[4]; + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u64 flags; +}; + +/** + * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters + * + * @scaling_list_4x4: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_8x8: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_16x16: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_32x32: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process + * for transform coefficients. The values on each + * scaling list are expected in raster scan order. + * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process + * for transform coefficients. The values on each + * scaling list are expected in raster scan order. 
+ */ +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +#endif diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c new file mode 100644 index 0000000000..c35579d8e0 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v4.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 4 +#include "v4l2_req_hevc_vx.c" + diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 611fa21cc3..761c5b2dc7 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -6,8 +6,6 @@ #include "internal.h" #include "thread.h" -#include "v4l2_request_hevc.h" - #if HEVC_CTRLS_VERSION == 1 #include "hevc-ctrls-v1.h" @@ -18,10 +16,37 @@ #include "hevc-ctrls-v2.h" #elif HEVC_CTRLS_VERSION == 3 #include "hevc-ctrls-v3.h" +#elif HEVC_CTRLS_VERSION == 4 +#include +#if !defined(V4L2_CID_STATELESS_HEVC_SPS) +#include "hevc-ctrls-v4.h" +#endif #else #error Unknown HEVC_CTRLS_VERSION #endif +#ifndef V4L2_CID_STATELESS_HEVC_SPS +#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS +#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS +#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS +#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX +#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS +#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE +#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE + +#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED +#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED +#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE +#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B +#endif + +// Should be in videodev2 but we might not have a good enough one +#ifndef V4L2_PIX_FMT_HEVC_SLICE +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ +#endif + +#include "v4l2_request_hevc.h" + #include "libavutil/hwcontext_drm.h" #include @@ -259,9 +284,13 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const #endif entry->field_pic = frame->frame->interlaced_frame; +#if HEVC_CTRLS_VERSION <= 3 /* TODO: Interleaved: Get the POC for each field. */ entry->pic_order_cnt[0] = frame->poc; entry->pic_order_cnt[1] = frame->poc; +#else + entry->pic_order_cnt_val = frame->poc; +#endif } } return n; @@ -287,8 +316,11 @@ static void fill_slice_params(const HEVCContext * const h, *slice_params = (struct v4l2_ctrl_hevc_slice_params) { .bit_size = bit_size, +#if HEVC_CTRLS_VERSION <= 3 .data_bit_offset = bit_offset, - +#else + .data_byte_offset = bit_offset / 8 + 1, +#endif /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ .slice_segment_addr = sh->slice_segment_addr, @@ -376,8 +408,10 @@ static void fill_slice_params(const HEVCContext * const h, av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); } +#if HEVC_CTRLS_VERSION <= 3 for (i = 0; i < slice_params->num_entry_point_offsets; i++) slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; +#endif } #if HEVC_CTRLS_VERSION >= 2 @@ -761,30 +795,30 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, struct v4l2_ext_control control[] = { { - .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, + .id = V4L2_CID_STATELESS_HEVC_SPS, .ptr = &controls->sps, .size = sizeof(controls->sps), }, { - .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, + .id = V4L2_CID_STATELESS_HEVC_PPS, .ptr = &controls->pps, .size = sizeof(controls->pps), }, #if HEVC_CTRLS_VERSION >= 2 { - .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, + .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, .ptr = dec, .size = sizeof(*dec), }, #endif { - .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, + .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, .ptr = slices + slice_no, .size = sizeof(*slices) * slice_count, }, // Optional { - .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, + .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, .ptr = &controls->scaling_matrix, .size = sizeof(controls->scaling_matrix), }, @@ -1000,12 +1034,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) // Check for var slice array struct v4l2_query_ext_ctrl qc[] = { - { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, - { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, - { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, - { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, + { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, + { .id = V4L2_CID_STATELESS_HEVC_SPS }, + { .id = V4L2_CID_STATELESS_HEVC_PPS }, + { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, #if HEVC_CTRLS_VERSION >= 2 - { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, + { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, #endif }; // Order & size must match! @@ -1042,12 +1076,13 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) fill_sps(&ctrl_sps, sps); - if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); return AVERROR(EINVAL); } ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; + av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? 
"Has" : "No"); return 0; } @@ -1058,29 +1093,29 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) int ret; struct v4l2_query_ext_ctrl querys[] = { - { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, - { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, - { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, + { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, + { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, }; struct v4l2_ext_control ctrls[] = { - { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, - { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, + { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, }; mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); ctx->decode_mode = querys[0].default_value; - if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && - ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { + if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && + ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); return AVERROR(EINVAL); } ctx->start_code = querys[1].default_value; - if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && - ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { + if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE && + ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); return AVERROR(EINVAL); } diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index 20e4e0ab15..cd79aad563 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) goto fail4; } - if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { + if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 4); + } + else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); ctx->fns = &V2(ff_v4l2_req_hevc, 3); } diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h index ed48d62e2d..d4adb3f812 100644 --- a/libavcodec/v4l2_request_hevc.h +++ b/libavcodec/v4l2_request_hevc.h @@ -99,5 +99,6 @@ typedef struct v4l2_req_decode_fns { extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); #endif -- 2.43.0 From c1d9f927762193a739895f829878340333401f14 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sun, 3 Jul 2022 13:40:41 +0000 Subject: [PATCH 055/157] v4l2_req: Observe limit on size of slice_array This in fact provides some minor simplifications by combing the multi-slice and single-slice paths. 
(cherry picked from commit 7631e6d1a66fca9048605c214f3464c90d37932c) --- libavcodec/v4l2_req_hevc_vx.c | 39 ++++++++++++++-------------------- libavcodec/v4l2_request_hevc.h | 5 +---- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 761c5b2dc7..9d08d13d9e 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -840,18 +840,21 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * int bcount = get_bits_count(&h->HEVClc->gb); uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + const unsigned int n = rd->num_slices; + const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; + int rv; struct slice_info * si; if ((rv = slice_add(rd)) != 0) return rv; - si = rd->slices + rd->num_slices - 1; + si = rd->slices + n; si->ptr = buffer; si->len = size; - if (ctx->multi_slice && rd->num_slices > 1) { - struct slice_info *const si0 = rd->slices; + if (n != block_start) { + struct slice_info *const si0 = rd->slices + block_start; const size_t offset = (buffer - si0->ptr); boff += offset * 8; size += offset; @@ -859,11 +862,11 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * } #if HEVC_CTRLS_VERSION >= 2 - if (rd->num_slices == 1) + if (n == 0) fill_decode_params(h, &rd->dec); - fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); + fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); #else - fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); + fill_slice_params(h, rd->slice_params + n, size * 8, boff); #endif return 0; @@ -997,18 +1000,11 @@ static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) } // Send as slices - if (ctx->multi_slice) - { - if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) + for (i = 0; i < rd->num_slices; i += ctx->max_slices) { + const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); + if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) goto fail; } - else - { - for (i = 0; i != rd->num_slices; ++i) { - if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) - goto fail; - } - } // Set the drm_prime desriptor drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); @@ -1081,8 +1077,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) return AVERROR(EINVAL); } - ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; - av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No"); return 0; } @@ -1120,11 +1114,10 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) return AVERROR(EINVAL); } - ctx->max_slices = querys[2].elems; - if (ctx->max_slices > MAX_SLICES) { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); - return AVERROR(EINVAL); - } + ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || + querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? 
+ 1 : querys[2].dims[0]; + av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); ctrls[0].value = ctx->decode_mode; ctrls[1].value = ctx->start_code; diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h index d4adb3f812..0029e23309 100644 --- a/libavcodec/v4l2_request_hevc.h +++ b/libavcodec/v4l2_request_hevc.h @@ -46,8 +46,6 @@ #define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 #endif -#define MAX_SLICES 128 - #define VCAT(name, version) name##_v##version #define V2(n,v) VCAT(n, v) #define V(n) V2(n, HEVC_CTRLS_VERSION) @@ -64,10 +62,9 @@ typedef struct V4L2RequestContextHEVC { unsigned int timestamp; // ?? maybe uint64_t - int multi_slice; int decode_mode; int start_code; - int max_slices; + unsigned int max_slices; req_decode_q decode_q; -- 2.43.0 From b1029175dfc8494a658af1604d179e6967e265cc Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 4 Jul 2022 14:43:20 +0100 Subject: [PATCH 056/157] v4l2_req: Add entry point offsets array control --- libavcodec/v4l2_req_hevc_vx.c | 88 +++++++++++++++++++++++++++------- libavcodec/v4l2_request_hevc.h | 3 +- 2 files changed, 72 insertions(+), 19 deletions(-) diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 9d08d13d9e..43ef6631ed 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -82,11 +82,16 @@ typedef struct V4L2MediaReqDescriptor { struct v4l2_ctrl_hevc_slice_params * slice_params; struct slice_info * slices; + size_t num_offsets; + size_t alloced_offsets; + uint32_t *offsets; + } V4L2MediaReqDescriptor; struct slice_info { const uint8_t * ptr; size_t len; // bytes + size_t n_offsets; }; // Handy container for accumulating controls before setting @@ -245,7 +250,7 @@ static int slice_add(V4L2MediaReqDescriptor * const rd) if (rd->num_slices >= rd->alloced_slices) { struct v4l2_ctrl_hevc_slice_params * p2; struct slice_info * s2; - size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; + size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); if (p2 == NULL) @@ -263,6 +268,23 @@ static int slice_add(V4L2MediaReqDescriptor * const rd) return 0; } +static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) +{ + if (rd->num_offsets + n > rd->alloced_offsets) { + size_t n2 = rd->alloced_slices == 0 ? 
128 : rd->alloced_slices * 2; + void * p2; + while (rd->num_offsets + n > n2) + n2 *= 2; + if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) + return AVERROR(ENOMEM); + rd->offsets = p2; + rd->alloced_offsets = n2; + } + for (size_t i = 0; i != n; ++i) + rd->offsets[rd->num_offsets++] = offsets[i] - 1; + return 0; +} + static unsigned int fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) { @@ -403,12 +425,12 @@ static void fill_slice_params(const HEVCContext * const h, fill_pred_table(h, &slice_params->pred_weight_table); slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; +#if HEVC_CTRLS_VERSION <= 3 if (slice_params->num_entry_point_offsets > 256) { slice_params->num_entry_point_offsets = 256; av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); } -#if HEVC_CTRLS_VERSION <= 3 for (i = 0; i < slice_params->num_entry_point_offsets; i++) slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; #endif @@ -787,13 +809,17 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, #if HEVC_CTRLS_VERSION >= 2 struct v4l2_ctrl_hevc_decode_params * const dec, #endif - struct v4l2_ctrl_hevc_slice_params * const slices, - const unsigned int slice_no, - const unsigned int slice_count) + struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, + void * const offsets, const size_t offset_count) { int rv; +#if HEVC_CTRLS_VERSION >= 2 + unsigned int n = 4; +#else + unsigned int n = 3; +#endif - struct v4l2_ext_control control[] = { + struct v4l2_ext_control control[6] = { { .id = V4L2_CID_STATELESS_HEVC_SPS, .ptr = &controls->sps, @@ -813,21 +839,28 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, #endif { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, - .ptr = slices + slice_no, + .ptr = slices, .size = sizeof(*slices) * slice_count, }, - // Optional - { + }; + + if (controls->has_scaling) + control[n++] = (struct v4l2_ext_control) { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, .ptr = &controls->scaling_matrix, .size = sizeof(controls->scaling_matrix), - }, - }; + }; + +#if HEVC_CTRLS_VERSION >= 4 + if (offsets) + control[n++] = (struct v4l2_ext_control) { + .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, + .ptr = offsets, + .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, + }; +#endif - rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, - controls->has_scaling ? 
- FF_ARRAY_ELEMS(control) : - FF_ARRAY_ELEMS(control) - 1); + rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); return rv; } @@ -852,6 +885,7 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * si = rd->slices + n; si->ptr = buffer; si->len = size; + si->n_offsets = rd->num_offsets; if (n != block_start) { struct slice_info *const si0 = rd->slices + block_start; @@ -868,6 +902,9 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * #else fill_slice_params(h, rd->slice_params + n, size * 8, boff); #endif + if (ctx->max_offsets != 0 && + (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) + return rv; return 0; } @@ -893,10 +930,13 @@ static int send_slice(AVCodecContext * const avctx, { V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + const int is_last = (j == rd->num_slices); struct slice_info *const si = rd->slices + i; struct media_request * req = NULL; struct qent_src * src = NULL; MediaBufsStatus stat; + void * offsets = rd->offsets + rd->slices[i].n_offsets; + size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; if ((req = media_request_get(ctx->mpool)) == NULL) { av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); @@ -908,8 +948,8 @@ static int send_slice(AVCodecContext * const avctx, #if HEVC_CTRLS_VERSION >= 2 &rd->dec, #endif - rd->slice_params, - i, j - i)) { + rd->slice_params + i, j - i, + offsets, n_offsets)) { av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); goto fail1; } @@ -935,7 +975,7 @@ static int send_slice(AVCodecContext * const avctx, stat = mediabufs_start_request(ctx->mbufs, &req, &src, i == 0 ? rd->qe_dst : NULL, - j == rd->num_slices); + is_last); if (stat != MEDIABUFS_STATUS_SUCCESS) { av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); @@ -1090,6 +1130,9 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, +#if HEVC_CTRLS_VERSION >= 4 + { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, +#endif }; struct v4l2_ext_control ctrls[] = { @@ -1119,6 +1162,14 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) 1 : querys[2].dims[0]; av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); +#if HEVC_CTRLS_VERSION >= 4 + ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 
+ 0 : querys[3].dims[0]; + av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); +#else + ctx->max_offsets = 0; +#endif + ctrls[0].value = ctx->decode_mode; ctrls[1].value = ctx->start_code; @@ -1141,6 +1192,7 @@ static void v4l2_req_frame_free(void *opaque, uint8_t *data) av_freep(&rd->slices); av_freep(&rd->slice_params); + av_freep(&rd->offsets); av_free(rd); } diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h index 0029e23309..99c90064ea 100644 --- a/libavcodec/v4l2_request_hevc.h +++ b/libavcodec/v4l2_request_hevc.h @@ -64,7 +64,8 @@ typedef struct V4L2RequestContextHEVC { int decode_mode; int start_code; - unsigned int max_slices; + unsigned int max_slices; // 0 => not wanted (frame mode) + unsigned int max_offsets; // 0 => not wanted req_decode_q decode_q; -- 2.43.0 From 22e7598562e13c42b7aec5e8ef64534b1632d3c3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 4 Jul 2022 16:22:54 +0100 Subject: [PATCH 057/157] v4l2_req: Support Annex B --- libavcodec/v4l2_req_hevc_vx.c | 61 +++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 43ef6631ed..5e0db9850a 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -879,6 +879,18 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * int rv; struct slice_info * si; + // This looks dodgy but we know that FFmpeg has parsed this from a buffer + // that contains the entire frame including the start code + if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { + buffer -= 3; + size += 3; + boff += 24; + if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { + av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", + buffer[0], buffer[1], buffer[2]); + } + } + if ((rv = slice_add(rd)) != 0) return rv; @@ -969,10 +981,6 @@ static int send_slice(AVCodecContext * const avctx, goto fail2; } -#warning ANNEX_B start code -// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -// } - stat = mediabufs_start_request(ctx->mbufs, &req, &src, i == 0 ? rd->qe_dst : NULL, is_last); @@ -1120,6 +1128,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) return 0; } +static inline int +ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) +{ + return v >= c->minimum && v <= c->maximum; +} + // Final init static int set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) @@ -1142,21 +1156,6 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); - ctx->decode_mode = querys[0].default_value; - - if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && - ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); - return AVERROR(EINVAL); - } - - ctx->start_code = querys[1].default_value; - if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE && - ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); - return AVERROR(EINVAL); - } - ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? 
1 : querys[2].dims[0]; @@ -1165,11 +1164,33 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) #if HEVC_CTRLS_VERSION >= 4 ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 0 : querys[3].dims[0]; - av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); + av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); #else ctx->max_offsets = 0; #endif + ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; + + if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) + { + ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; + + // Prefer NONE as it doesn't require the slightly dodgy look + // backwards in our raw buffer + if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) + ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; + else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) + ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; + else { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); + return AVERROR(EINVAL); + } + } + else + { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); + } + ctrls[0].value = ctx->decode_mode; ctrls[1].value = ctx->start_code; -- 2.43.0 From 1d78e08699ac702a2cd407c4e941e3dacd3ddae4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 4 Jul 2022 18:24:03 +0100 Subject: [PATCH 058/157] v4l2_req: Add frame mode decode --- libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 5e0db9850a..ada53d0d44 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -814,9 +814,9 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, { int rv; #if HEVC_CTRLS_VERSION >= 2 - unsigned int n = 4; -#else unsigned int n = 3; +#else + unsigned int n = 2; #endif struct v4l2_ext_control control[6] = { @@ -837,12 +837,14 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, .size = sizeof(*dec), }, #endif - { + }; + + if (slices) + control[n++] = (struct v4l2_ext_control) { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, .ptr = slices, .size = sizeof(*slices) * slice_count, - }, - }; + }; if (controls->has_scaling) control[n++] = (struct v4l2_ext_control) { @@ -865,6 +867,8 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, return rv; } +// This only works because we started out from a single coded frame buffer +// that will remain intact until after end_frame static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) { const HEVCContext * const h = avctx->priv_data; @@ -891,6 +895,17 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * } } + if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { + if (rd->slices == NULL) { + if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) + return AVERROR(ENOMEM); + rd->slices->ptr = buffer; + rd->num_slices = 1; + } + rd->slices->len = buffer - rd->slices->ptr + size; + return 0; + } + if ((rv = slice_add(rd)) != 0) return rv; @@ -1169,28 +1184,36 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) ctx->max_offsets = 0; #endif - ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; - - if (ctrl_valid(querys + 0, 
V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) - { + if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || + querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) + ctx->decode_mode = querys[0].default_value; + else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) + ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; + else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; - - // Prefer NONE as it doesn't require the slightly dodgy look - // backwards in our raw buffer - if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) - ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; - else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) - ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; - else { - av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); - return AVERROR(EINVAL); - } - } - else - { + else { av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); + return AVERROR(EINVAL); } + if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || + querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) + ctx->start_code = querys[1].default_value; + else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) + ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; + else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) + ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; + else { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); + return AVERROR(EINVAL); + } + + // If we are in slice mode & START_CODE_NONE supported then pick that + // as it doesn't require the slightly dodgy look backwards in our raw buffer + if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && + ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) + ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; + ctrls[0].value = ctx->decode_mode; ctrls[1].value = ctx->start_code; -- 2.43.0 From d98985dfd4de98dbaa66e57bd6eb3e9ea5ec0b0d Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 5 Jul 2022 12:54:22 +0000 Subject: [PATCH 059/157] v4l2_req: Fix probe for frame based decode --- libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index ada53d0d44..5d083016f8 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -1082,6 +1082,12 @@ fail: return rv; } +static inline int +ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) +{ + return v >= c->minimum && v <= c->maximum; +} + // Initial check & init static int probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) @@ -1094,6 +1100,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) // Check for var slice array struct v4l2_query_ext_ctrl qc[] = { { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, + { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, { .id = V4L2_CID_STATELESS_HEVC_SPS }, { .id = V4L2_CID_STATELESS_HEVC_PPS }, { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, @@ -1104,6 +1111,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) // Order & size must match! 
static const size_t ctrl_sizes[] = { sizeof(struct v4l2_ctrl_hevc_slice_params), + sizeof(int32_t), sizeof(struct v4l2_ctrl_hevc_sps), sizeof(struct v4l2_ctrl_hevc_pps), sizeof(struct v4l2_ctrl_hevc_scaling_matrix), @@ -1121,11 +1129,22 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) return AVERROR(EINVAL); #endif - if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { - av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); + i = 0; +#if HEVC_CTRLS_VERSION >= 4 + // Skip slice check if no slice mode + if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) + i = 1; +#else + // Fail frame mode silently for anything prior to V4 + if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) return AVERROR(EINVAL); - } - for (i = 0; i != noof_ctrls; ++i) { +#endif + for (; i != noof_ctrls; ++i) { + if (qc[i].type == 0) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); + return AVERROR(EINVAL); + } if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); @@ -1143,12 +1162,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) return 0; } -static inline int -ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) -{ - return v >= c->minimum && v <= c->maximum; -} - // Final init static int set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) -- 2.43.0 From beda40c2298811e52b9b4bb6eae2b0475661758d Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 26 Jul 2022 15:46:14 +0000 Subject: [PATCH 060/157] vf_deinterlace_v4l2m2m: Support NV12 through deinterlace Supports NV12 (though not yet NV12M) through deinterlace. Also improves error handling such that attempting to deinterlace an unsupported drm format causes an error. No longer leaks frame structures. 
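As an illustrative aside (not part of the patch itself): the format gating described above is implemented by the desc_pixelformat() helper added in the diff below. A minimal standalone sketch of the same rule follows; the function name drm_to_v4l2_single_obj is chosen here for illustration only, and it assumes libdrm and kernel V4L2 headers on the include path.

    #include <stdint.h>
    #include <drm_fourcc.h>        /* libdrm: DRM_FORMAT_*, DRM_FORMAT_MOD_* */
    #include <linux/videodev2.h>   /* V4L2_PIX_FMT_* */

    /* Map a single-object, linear DRM layer format to the V4L2 pixel format the
     * deinterlacer can consume; return 0 so the caller can fail cleanly on
     * anything else (multi-object NV12M, non-linear modifiers, other fourccs). */
    static uint32_t drm_to_v4l2_single_obj(uint32_t drm_fmt, uint64_t modifier,
                                           unsigned int nb_objects)
    {
        const int is_linear = (modifier == DRM_FORMAT_MOD_LINEAR ||
                               modifier == DRM_FORMAT_MOD_INVALID);

        if (!is_linear || nb_objects != 1)
            return 0;

        switch (drm_fmt) {
        case DRM_FORMAT_YUV420: return V4L2_PIX_FMT_YUV420;
        case DRM_FORMAT_NV12:   return V4L2_PIX_FMT_NV12;  /* NV12M not yet handled */
        default:                return 0;
        }
    }

A caller would treat a zero return as "unsupported" and report an error instead of attempting the deinterlace, which is the error-handling behaviour this patch introduces.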
--- libavfilter/vf_deinterlace_v4l2m2m.c | 160 ++++++++++++++++++--------- 1 file changed, 107 insertions(+), 53 deletions(-) diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index 1a933b7e0a..1a3bef5bcb 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -373,14 +373,16 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { - if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || + if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && + fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || fmt->fmt.pix_mp.field != field) { av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); return AVERROR(EINVAL); } } else { - if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || + if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && + fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) || fmt->fmt.pix.field != field) { av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); @@ -391,7 +393,7 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) return 0; } -static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) +static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) { struct v4l2_format *fmt = &queue->format; DeintV4L2M2MContextShared *ctx = queue->ctx; @@ -402,13 +404,16 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, }; + // This works for most single object 4:2:0 types if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.pixelformat = pixelformat; fmt->fmt.pix_mp.field = field; fmt->fmt.pix_mp.width = width; fmt->fmt.pix_mp.height = ysize / pitch; fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); } else { + fmt->fmt.pix.pixelformat = pixelformat; fmt->fmt.pix.field = field; fmt->fmt.pix.width = width; fmt->fmt.pix.height = height; @@ -417,12 +422,22 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, } ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); - if (ret) + if (ret) { + ret = AVERROR(errno); av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); + return ret; + } + + if (pixelformat != fmt->fmt.pix.pixelformat) { + av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); + return AVERROR(EINVAL); + } ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); - if (ret) - av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); + if (ret) { + ret = AVERROR(errno); + av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); + } sel.r.width = width; sel.r.height = height; @@ -432,10 +447,12 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, sel.flags = V4L2_SEL_FLAG_LE; ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); - if (ret) - av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); + if (ret) { + ret = AVERROR(errno); + av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); + } - return ret; + return 0; } static int 
deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) @@ -517,10 +534,25 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) return 0; } -static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) { struct v4l2_exportbuffer expbuf; int i, ret; + uint64_t mod = DRM_FORMAT_MOD_LINEAR; + uint32_t fmt = 0; + + switch (pixelformat) { + case V4L2_PIX_FMT_NV12: + fmt = DRM_FORMAT_NV12; + break; + case V4L2_PIX_FMT_YUV420: + fmt = DRM_FORMAT_YUV420; + break; + default: + return AVERROR(EINVAL); + } + + avbuf->drm_frame.layers[0].format = fmt; for (i = 0; i < avbuf->num_planes; i++) { memset(&expbuf, 0, sizeof(expbuf)); @@ -539,12 +571,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) /* drm frame */ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; avbuf->drm_frame.objects[i].fd = expbuf.fd; - avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + avbuf->drm_frame.objects[i].format_modifier = mod; } else { /* drm frame */ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; avbuf->drm_frame.objects[0].fd = expbuf.fd; - avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + avbuf->drm_frame.objects[0].format_modifier = mod; } } @@ -629,7 +661,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) if (ret) goto fail; - ret = v4l2_buffer_export_drm(buf); + ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); if (ret) goto fail; } @@ -878,7 +910,6 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) { - int av_pix_fmt = AV_PIX_FMT_YUV420P; AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; AVDRMLayerDescriptor *layer; @@ -895,20 +926,13 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; } - switch (av_pix_fmt) { - case AV_PIX_FMT_YUYV422: - - layer->format = DRM_FORMAT_YUYV; + switch (layer->format) { + case DRM_FORMAT_YUYV: layer->nb_planes = 1; - break; - case AV_PIX_FMT_NV12: - case AV_PIX_FMT_NV21: - - layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? - DRM_FORMAT_NV12 : DRM_FORMAT_NV21; - + case DRM_FORMAT_NV12: + case DRM_FORMAT_NV21: if (avbuf->num_planes > 1) break; @@ -920,10 +944,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; break; - case AV_PIX_FMT_YUV420P: - - layer->format = DRM_FORMAT_YUV420; - + case DRM_FORMAT_YUV420: if (avbuf->num_planes > 1) break; @@ -1032,6 +1053,26 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) return 0; } +static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) +{ + const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || + drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID); + + switch (drm_desc->layers[0].format) { + case DRM_FORMAT_YUV420: + if (is_linear) + return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; + break; + case DRM_FORMAT_NV12: + if (is_linear) + return drm_desc->nb_objects == 1 ? 
V4L2_PIX_FMT_NV12 : 0; + break; + default: + break; + } + return 0; +} + static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) { AVFilterContext *avctx = link->dst; @@ -1047,23 +1088,27 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); if (ctx->field_order == V4L2_FIELD_ANY) { - AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; + const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; + const uint32_t pixelformat = desc_pixelformat(drm_desc); + + if (pixelformat == 0) { + av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", + av_fourcc2str(drm_desc->layers[0].format), + drm_desc->nb_objects, drm_desc->objects[0].format_modifier); + return AVERROR(EINVAL); + } + ctx->orig_width = drm_desc->layers[0].planes[0].pitch; ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); - if (in->top_field_first) - ctx->field_order = V4L2_FIELD_INTERLACED_TB; - else - ctx->field_order = V4L2_FIELD_INTERLACED_BT; - - ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); if (ret) return ret; - ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); if (ret) return ret; @@ -1082,6 +1127,12 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) ret = deint_v4l2m2m_streamon(output); if (ret) return ret; + + if (in->top_field_first) + ctx->field_order = V4L2_FIELD_INTERLACED_TB; + else + ctx->field_order = V4L2_FIELD_INTERLACED_BT; + } ret = deint_v4l2m2m_enqueue_frame(output, in); @@ -1157,28 +1208,31 @@ again: return 0; } - { + recycle_q(&s->output); + n = count_enqueued(&s->output); + + while (n < 6) { AVFrame * frame; int rv; - recycle_q(&s->output); - n = count_enqueued(&s->output); + if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { + av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); + return rv; + } - while (n < 6) { - if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { - av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); - return rv; - } + if (frame == NULL) { + av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); + break; + } - if (frame == NULL) { - av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); - break; - } + rv = deint_v4l2m2m_filter_frame(inlink, frame); + av_frame_free(&frame); - deint_v4l2m2m_filter_frame(inlink, frame); - av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); - ++n; - } + if (rv != 0) + return rv; + + av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); + ++n; } if (n < 6) { -- 2.43.0 From dbf7443f0c445e48c36181d6aedc10ced516346a Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 19 Aug 2022 15:29:11 +0000 Subject: [PATCH 061/157] v4l2_req: Enable use of MMAP for buffer alloc 
Use MMAP rather than DMABUF if either the dmabuf device can't be opened or create_buf doesn't set the capability. --- libavcodec/v4l2_req_dmabufs.c | 22 +++ libavcodec/v4l2_req_dmabufs.h | 3 + libavcodec/v4l2_req_media.c | 263 ++++++++++++++++++++++++++++----- libavcodec/v4l2_req_media.h | 21 ++- libavcodec/v4l2_request_hevc.c | 42 +++++- 5 files changed, 307 insertions(+), 44 deletions(-) diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c index ae6c648369..c4bbed18c6 100644 --- a/libavcodec/v4l2_req_dmabufs.c +++ b/libavcodec/v4l2_req_dmabufs.c @@ -36,6 +36,26 @@ static unsigned int total_bufs = 0; static size_t total_size = 0; #endif +struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size) +{ + struct dmabuf_h *dh; + + if (mapptr == MAP_FAILED) + return NULL; + + dh = malloc(sizeof(*dh)); + if (!dh) + return NULL; + + *dh = (struct dmabuf_h) { + .fd = -1, + .size = size, + .mapptr = mapptr + }; + + return dh; +} + struct dmabuf_h * dmabuf_import(int fd, size_t size) { struct dmabuf_h *dh; @@ -122,6 +142,8 @@ int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) struct dma_buf_sync sync = { .flags = flags }; + if (dh->fd == -1) + return 0; while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { const int err = errno; if (errno == EINTR) diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h index cfb17e801d..c1d3d8c8d7 100644 --- a/libavcodec/v4l2_req_dmabufs.h +++ b/libavcodec/v4l2_req_dmabufs.h @@ -18,6 +18,9 @@ static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t s } /* Create from existing fd - dups(fd) */ struct dmabuf_h * dmabuf_import(int fd, size_t size); +/* Import an MMAP - return NULL if mapptr = MAP_FAIL */ +struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size); + void * dmabuf_map(struct dmabuf_h * const dh); /* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c index 980b306b8a..910ac77bb6 100644 --- a/libavcodec/v4l2_req_media.c +++ b/libavcodec/v4l2_req_media.c @@ -33,9 +33,11 @@ #include #include #include +#include #include #include #include +#include #include @@ -95,6 +97,32 @@ struct media_request { struct polltask * pt; }; +static inline enum v4l2_memory +mediabufs_memory_to_v4l2(const enum mediabufs_memory m) +{ + return (enum v4l2_memory)m; +} + +const char * +mediabufs_memory_name(const enum mediabufs_memory m) +{ + switch (m) { + case MEDIABUFS_MEMORY_UNSET: + return "Unset"; + case MEDIABUFS_MEMORY_MMAP: + return "MMap"; + case MEDIABUFS_MEMORY_USERPTR: + return "UserPtr"; + case MEDIABUFS_MEMORY_OVERLAY: + return "Overlay"; + case MEDIABUFS_MEMORY_DMABUF: + return "DMABuf"; + default: + break; + } + return "Unknown"; +} + static inline int do_trywait(sem_t *const sem) { @@ -115,14 +143,14 @@ static inline int do_wait(sem_t *const sem) } static int request_buffers(int video_fd, unsigned int type, - enum v4l2_memory memory, unsigned int buffers_count) + enum mediabufs_memory memory, unsigned int buffers_count) { struct v4l2_requestbuffers buffers; int rc; memset(&buffers, 0, sizeof(buffers)); buffers.type = type; - buffers.memory = memory; + buffers.memory = mediabufs_memory_to_v4l2(memory); buffers.count = buffers_count; rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); @@ -324,6 +352,7 @@ struct qent_base { struct qent_base *next; struct qent_base *prev; enum qent_status status; + enum mediabufs_memory memtype; uint32_t index; struct dmabuf_h *dh[VIDEO_MAX_PLANES]; struct timeval timestamp; @@ -348,9 
+377,9 @@ struct qe_list_head { }; struct buf_pool { + enum mediabufs_memory memtype; pthread_mutex_t lock; sem_t free_sem; - enum v4l2_buf_type buf_type; struct qe_list_head free; struct qe_list_head inuse; }; @@ -367,9 +396,10 @@ static inline struct qent_src *base_to_src(struct qent_base *be) } -#define QENT_BASE_INITIALIZER {\ +#define QENT_BASE_INITIALIZER(mtype) {\ .ref_count = ATOMIC_VAR_INIT(0),\ .status = QENT_NEW,\ + .memtype = (mtype),\ .index = INDEX_UNSET\ } @@ -390,13 +420,13 @@ static void qe_src_free(struct qent_src *const be_src) free(be_src); } -static struct qent_src * qe_src_new(void) +static struct qent_src * qe_src_new(enum mediabufs_memory mtype) { struct qent_src *const be_src = malloc(sizeof(*be_src)); if (!be_src) return NULL; *be_src = (struct qent_src){ - .base = QENT_BASE_INITIALIZER + .base = QENT_BASE_INITIALIZER(mtype) }; return be_src; } @@ -413,13 +443,13 @@ static void qe_dst_free(struct qent_dst *const be_dst) free(be_dst); } -static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) +static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype) { struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); if (!be_dst) return NULL; *be_dst = (struct qent_dst){ - .base = QENT_BASE_INITIALIZER, + .base = QENT_BASE_INITIALIZER(memtype), .lock = PTHREAD_MUTEX_INITIALIZER, .cond = PTHREAD_COND_INITIALIZER, .mbc_wl = ff_weak_link_ref(wl) @@ -553,14 +583,14 @@ static struct qent_base *queue_tryget_free(struct buf_pool *const bp) return buf; } -static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) +static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index) { struct qent_base *be; pthread_mutex_lock(&bp->lock); /* Expect 1st in Q, but allow anywhere */ for (be = bp->inuse.head; be; be = be->next) { - if (dmabuf_fd(be->dh[0]) == fd) { + if (be->index == index) { bq_extract_inuse(bp, be); break; } @@ -602,6 +632,8 @@ struct mediabufs_ctl { struct pollqueue * pq; struct ff_weak_link_master * this_wlm; + enum mediabufs_memory src_memtype; + enum mediabufs_memory dst_memtype; struct v4l2_format src_fmt; struct v4l2_format dst_fmt; struct v4l2_capability capability; @@ -614,7 +646,7 @@ static int qe_v4l2_queue(struct qent_base *const be, { struct v4l2_buffer buffer = { .type = fmt->type, - .memory = V4L2_MEMORY_DMABUF, + .memory = mediabufs_memory_to_v4l2(be->memtype), .index = be->index }; struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; @@ -628,7 +660,10 @@ static int qe_v4l2_queue(struct qent_base *const be, /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ planes[i].length = dmabuf_size(be->dh[i]); planes[i].bytesused = dmabuf_len(be->dh[i]); - planes[i].m.fd = dmabuf_fd(be->dh[i]); + if (be->memtype == MEDIABUFS_MEMORY_DMABUF) + planes[i].m.fd = dmabuf_fd(be->dh[i]); + else + planes[i].m.mem_offset = 0; } buffer.m.planes = planes; buffer.length = i; @@ -639,7 +674,10 @@ static int qe_v4l2_queue(struct qent_base *const be, buffer.bytesused = dmabuf_len(be->dh[0]); buffer.length = dmabuf_size(be->dh[0]); - buffer.m.fd = dmabuf_fd(be->dh[0]); + if (be->memtype == MEDIABUFS_MEMORY_DMABUF) + buffer.m.fd = dmabuf_fd(be->dh[0]); + else + buffer.m.offset = 0; } if (!is_dst && mreq) { @@ -668,14 +706,13 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, const int vfd, const struct v4l2_format * const f) { - int fd; struct qent_base *be; int rc; const bool mp = 
V4L2_TYPE_IS_MULTIPLANAR(f->type); struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; struct v4l2_buffer buffer = { .type = f->type, - .memory = V4L2_MEMORY_DMABUF + .memory = mediabufs_memory_to_v4l2(bp->memtype) }; if (mp) { buffer.length = f->fmt.pix_mp.num_planes; @@ -690,10 +727,9 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, return NULL; } - fd = mp ? planes[0].m.fd : buffer.m.fd; - be = queue_find_extract_fd(bp, fd); + be = queue_find_extract_index(bp, buffer.index); if (!be) { - request_log("Failed to find fd %d in Q\n", fd); + request_log("Failed to find index %d in Q\n", buffer.index); return NULL; } @@ -1104,7 +1140,7 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru struct v4l2_create_buffers cbuf = { .count = n, - .memory = V4L2_MEMORY_DMABUF, + .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype), .format = mbc->dst_fmt, }; @@ -1125,12 +1161,97 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru return cbuf.count; } +static MediaBufsStatus +qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt, + const unsigned int n, const bool x_dmabuf) +{ + struct v4l2_buffer buf = { + .index = n, + .type = fmt->type, + }; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + int ret; + + if (be->dh[0]) + return 0; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + memset(planes, 0, sizeof(planes)); + buf.m.planes = planes; + buf.length = VIDEO_MAX_PLANES; + } + + if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) { + request_err(mbc->dc, "VIDIOC_QUERYBUF failed"); + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) + { + unsigned int i; + for (i = 0; i != buf.length; ++i) { + if (x_dmabuf) { + struct v4l2_exportbuffer xbuf = { + .type = buf.type, + .index = buf.index, + .plane = i, + .flags = O_RDWR, // *** Arguably O_RDONLY would be fine + }; + if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) + be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length); + } + else { + be->dh[i] = dmabuf_import_mmap( + mmap(NULL, planes[i].length, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + mbc->vfd, planes[i].m.mem_offset), + planes[i].length); + } + /* On failure tidy up and die */ + if (!be->dh[i]) { + while (i--) { + dmabuf_free(be->dh[i]); + be->dh[i] = NULL; + } + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + } + } + else + { + if (x_dmabuf) { + struct v4l2_exportbuffer xbuf = { + .type = buf.type, + .index = buf.index, + .flags = O_RDWR, // *** Arguably O_RDONLY would be fine + }; + if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) + be->dh[0] = dmabuf_import(xbuf.fd, buf.length); + } + else { + be->dh[0] = dmabuf_import_mmap( + mmap(NULL, buf.length, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + mbc->vfd, buf.m.offset), + buf.length); + } + /* On failure tidy up and die */ + if (!be->dh[0]) { + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + } + + return 0; +} + struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) { struct qent_dst * be_dst; if (mbc == NULL) { - be_dst = qe_dst_new(NULL); + be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF); if (be_dst) be_dst->base.status = QENT_IMPORT; return be_dst; @@ -1144,7 +1265,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc else { be_dst = base_to_dst(queue_tryget_free(mbc->dst)); if (!be_dst) { - be_dst = qe_dst_new(mbc->this_wlm); + be_dst = 
qe_dst_new(mbc->this_wlm, mbc->dst->memtype); if (!be_dst) return NULL; @@ -1155,12 +1276,21 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc } } - if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { - /* Given how create buf works we can't uncreate it on alloc failure - * all we can do is put it on the free Q - */ - queue_put_free(mbc->dst, &be_dst->base); - return NULL; + if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) { + if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) { + request_err(mbc->dc, "Failed to export as dmabuf\n"); + queue_put_free(mbc->dst, &be_dst->base); + return NULL; + } + } + else { + if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { + /* Given how create buf works we can't uncreate it on alloc failure + * all we can do is put it on the free Q + */ + queue_put_free(mbc->dst, &be_dst->base); + return NULL; + } } be_dst->base.status = QENT_PENDING; @@ -1208,7 +1338,7 @@ MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, // ** This is a mess if we get partial alloc but without any way to remove // individual V4L2 Q members we are somewhat stuffed -MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) +MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype) { unsigned int i; int a = 0; @@ -1218,10 +1348,12 @@ MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, cons if (n > 32) return MEDIABUFS_ERROR_ALLOCATION_FAILED; + mbc->dst->memtype = memtype; + // Create qents first as it is hard to get rid of the V4L2 buffers on error for (qc = 0; qc != n; ++qc) { - if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) + if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL) goto fail; } @@ -1260,19 +1392,61 @@ void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src * queue_put_free(mbc->src, &qe_src->base); } +static MediaBufsStatus +chk_memory_type(struct mediabufs_ctl *const mbc, + const struct v4l2_format * const f, + const enum mediabufs_memory m) +{ + struct v4l2_create_buffers cbuf = { + .count = 0, + .memory = V4L2_MEMORY_MMAP, + .format = *f + }; + + if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + switch (m) { + case MEDIABUFS_MEMORY_DMABUF: + // 0 = Unknown but assume not in that case + if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0) + return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; + break; + case MEDIABUFS_MEMORY_MMAP: + break; + default: + return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; + } + + return MEDIABUFS_STATUS_SUCCESS; +} + +MediaBufsStatus +mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) +{ + return chk_memory_type(mbc, &mbc->src_fmt, memtype); +} + +MediaBufsStatus +mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) +{ + return chk_memory_type(mbc, &mbc->dst_fmt, memtype); +} + /* src format must have been set up before this */ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, struct dmabufs_ctl * const dbsc, - unsigned int n) + unsigned int n, const enum mediabufs_memory memtype) { unsigned int i; struct v4l2_requestbuffers req = { .count = n, .type = mbc->src_fmt.type, - .memory = V4L2_MEMORY_DMABUF + .memory = mediabufs_memory_to_v4l2(memtype) }; 
bq_free_all_free_src(mbc->src); + while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { if (errno != EINTR) { request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); @@ -1286,21 +1460,36 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, } for (i = 0; i != n; ++i) { - struct qent_src *const be_src = qe_src_new(); + struct qent_src *const be_src = qe_src_new(memtype); if (!be_src) { request_err(mbc->dc, "Failed to create src be %d\n", i); goto fail; } - if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { - qe_src_free(be_src); + switch (memtype) { + case MEDIABUFS_MEMORY_MMAP: + if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) { + qe_src_free(be_src); + goto fail; + } + be_src->fixed_size = 1; + break; + case MEDIABUFS_MEMORY_DMABUF: + if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { + qe_src_free(be_src); + goto fail; + } + be_src->fixed_size = !mediabufs_src_resizable(mbc); + break; + default: + request_err(mbc->dc, "Unexpected memory type\n"); goto fail; } be_src->base.index = i; - be_src->fixed_size = !mediabufs_src_resizable(mbc); queue_put_free(mbc->src, &be_src->base); } + mbc->src->memtype = memtype; return MEDIABUFS_STATUS_SUCCESS; fail: @@ -1437,9 +1626,13 @@ int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) { +#if 1 + return 0; +#else // Single planar OUTPUT can only take exact size buffers // Multiplanar will take larger than negotiated return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); +#endif } static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h index 0307a831de..890947b2e2 100644 --- a/libavcodec/v4l2_req_media.h +++ b/libavcodec/v4l2_req_media.h @@ -43,6 +43,7 @@ typedef enum media_buf_status { MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, MEDIABUFS_ERROR_ALLOCATION_FAILED, + MEDIABUFS_ERROR_UNSUPPORTED_MEMORY, } MediaBufsStatus; struct media_pool * media_pool_new(const char * const media_path, @@ -70,6 +71,15 @@ struct qent_dst; struct dmabuf_h; struct dmabufs_ctl; +// 1-1 mapping to V4L2 type - just defined separately to avoid some include versioning difficulties +enum mediabufs_memory { + MEDIABUFS_MEMORY_UNSET = 0, + MEDIABUFS_MEMORY_MMAP = 1, + MEDIABUFS_MEMORY_USERPTR = 2, + MEDIABUFS_MEMORY_OVERLAY = 3, + MEDIABUFS_MEMORY_DMABUF = 4, +}; + int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp); struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); @@ -93,6 +103,8 @@ MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, unsigned int plane, int fd, size_t size); +const char * mediabufs_memory_name(const enum mediabufs_memory m); + MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, struct media_request **const pmreq, struct qent_src **const psrc_be, @@ -106,7 +118,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, // Create dst slots without alloc // If fixed true then qent_alloc will only get slots from this pool and will // block until a qent has been unrefed -MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); +MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype); MediaBufsStatus
mediabufs_stream_on(struct mediabufs_ctl *const mbc); MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); @@ -140,7 +152,12 @@ MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, struct dmabufs_ctl * const dbsc, - unsigned int n); + unsigned int n, + const enum mediabufs_memory memtype); + +// Want to have appropriate formats set first +MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); +MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); #define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index cd79aad563..5cf17dd5e3 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -144,6 +144,8 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) const struct decdev * decdev; const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes size_t src_size; + enum mediabufs_memory src_memtype; + enum mediabufs_memory dst_memtype; av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); @@ -174,8 +176,14 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) decdev_media_path(decdev), decdev_video_path(decdev)); if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { - av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); - goto fail0; + av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n"); + src_memtype = MEDIABUFS_MEMORY_MMAP; + dst_memtype = MEDIABUFS_MEMORY_MMAP; + } + else { + av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n"); + src_memtype = MEDIABUFS_MEMORY_DMABUF; + dst_memtype = MEDIABUFS_MEMORY_DMABUF; } if ((ctx->pq = pollqueue_new()) == NULL) { @@ -196,8 +204,9 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) // Ask for an initial bitbuf size of max size / 4 // We will realloc if we need more // Must use sps->h/w as avctx contains cropped size +retry_src_memtype: src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); - if (mediabufs_src_resizable(ctx->mbufs)) + if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs)) src_size /= 4; // Kludge for conformance tests which break Annex A limits else if (src_size < 0x40000) @@ -210,6 +219,15 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) goto fail4; } + if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) { + if (src_memtype == MEDIABUFS_MEMORY_DMABUF) { + src_memtype = MEDIABUFS_MEMORY_MMAP; + goto retry_src_memtype; + } + av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n"); + goto fail4; + } + if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ctx->fns = &V2(ff_v4l2_req_hevc, 4); @@ -238,7 +256,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) goto fail4; } - if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { + if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) { av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); goto fail4; } @@ -250,8 +268,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, avctx->thread_count, 
avctx->extra_hw_frames); + if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) { + if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) { + av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n"); + goto fail4; + } + av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n"); + dst_memtype = MEDIABUFS_MEMORY_MMAP; + } + // extra_hw_frames is -1 if unset - if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { + if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) { av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); goto fail4; } @@ -277,9 +304,10 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) // Set our s/w format avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; - av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", + av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n", ctx->fns->name, - decdev_media_path(decdev), decdev_video_path(decdev)); + decdev_media_path(decdev), decdev_video_path(decdev), + mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype)); return 0; -- 2.43.0 From 495356bdeef3e8a46b7e530c95f1a8f71067136b Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 22 Aug 2022 12:35:40 +0000 Subject: [PATCH 062/157] Set buffer lengths on DQ --- libavcodec/v4l2_req_media.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c index 910ac77bb6..1a9944774a 100644 --- a/libavcodec/v4l2_req_media.c +++ b/libavcodec/v4l2_req_media.c @@ -733,6 +733,14 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, return NULL; } + if (mp) { + unsigned int i; + for (i = 0; i != buffer.length; ++i) + dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0); + } + else + dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0); + be->timestamp = buffer.timestamp; be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE; return be; -- 2.43.0 From b8010877bf0b23b793370d6b8364d2fe8b1f4bbc Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 22 Aug 2022 17:11:24 +0000 Subject: [PATCH 063/157] Fix compile if videodev2.h defines V4L2 HEVC request API If videodev2.h defines the HEVC request API it is really hard to set the old variations of the controls, so in that case we only compile against the system includes and remove the backwards compatibility.
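In effect the version probe in v4l2_request_hevc_init() becomes conditional on the new configure feature. A minimal sketch of the intended shape (illustrative only, not part of the diff below; CONFIG_V4L2_REQ_HEVC_VX is the feature added by the configure change that follows, and the v2/v1 probes are elided here for brevity):

    ctx->fns = NULL;
    if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0)
        ctx->fns = &V2(ff_v4l2_req_hevc, 4);   /* always built */
#if CONFIG_V4L2_REQ_HEVC_VX
    /* Older control layouts are only compiled when videodev2.h
     * does not already define the stateless HEVC request API */
    else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0)
        ctx->fns = &V2(ff_v4l2_req_hevc, 3);
#endif
    if (ctx->fns == NULL)
        return AVERROR(EINVAL);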
--- configure | 9 +++++++++ libavcodec/Makefile | 4 ++-- libavcodec/hevc-ctrls-v4.h | 2 ++ libavcodec/v4l2_req_hevc_vx.c | 5 ----- libavcodec/v4l2_request_hevc.c | 6 ++++-- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/configure b/configure index a4ffd87976..f16f85dbc3 100755 --- a/configure +++ b/configure @@ -1945,6 +1945,7 @@ FEATURE_LIST=" swscale_alpha vout_drm vout_egl + v4l2_req_hevc_vx " # this list should be kept in linking order @@ -6904,6 +6905,14 @@ fi check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" +disable v4l2_req_hevc_vx +if enabled hevc_v4l2request_hwaccel; then + enable v4l2_req_hevc_vx +fi +if enabled hevc_v4l2_request; then + disable v4l2_req_hevc_vx +fi + check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile index d433a71236..11f183c9b9 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -999,8 +999,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o -OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ - v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o +OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h index 7e05f6e7c3..7829d82084 100644 --- a/libavcodec/hevc-ctrls-v4.h +++ b/libavcodec/hevc-ctrls-v4.h @@ -53,6 +53,8 @@ #include #include +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) #define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) #define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c index 5d083016f8..e1bd5c6a1f 100644 --- a/libavcodec/v4l2_req_hevc_vx.c +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -40,11 +40,6 @@ #define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B #endif -// Should be in videodev2 but we might not have a good enough one -#ifndef V4L2_PIX_FMT_HEVC_SLICE -#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -#endif - #include "v4l2_request_hevc.h" #include "libavutil/hwcontext_drm.h" diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index 5cf17dd5e3..614a1b4d99 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -17,7 +17,7 @@ */ - +#include "config.h" #include "decode.h" #include "hevcdec.h" #include "hwconfig.h" @@ -142,7 +142,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) const HEVCSPS * const sps = h->ps.sps; int ret; const struct decdev * decdev; - const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 
includes + const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes size_t src_size; enum mediabufs_memory src_memtype; enum mediabufs_memory dst_memtype; @@ -232,6 +232,7 @@ retry_src_memtype: av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ctx->fns = &V2(ff_v4l2_req_hevc, 4); } +#if CONFIG_V4L2_REQ_HEVC_VX else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); ctx->fns = &V2(ff_v4l2_req_hevc, 3); @@ -244,6 +245,7 @@ retry_src_memtype: av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); ctx->fns = &V2(ff_v4l2_req_hevc, 1); } +#endif else { av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); ret = AVERROR(EINVAL); -- 2.43.0 From 47060230fcd6d3df96a7d9ada8dc4c269a8014c4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Sep 2022 17:59:22 +0100 Subject: [PATCH 064/157] v4l2_m2m_enc: Send headers in in pkt side_data If GLOBAL_HEADERS are requested then we can't provide them at init time so send as NEW_EXTRADATA side data in a similar way to some AV1 encoders. --- libavcodec/v4l2_m2m_enc.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index 05ff6ba726..099ad23928 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -544,14 +544,12 @@ dequeue: av_freep(&avctx->extradata); avctx->extradata_size = 0; - if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) - memcpy(data, avpkt->data, len); + if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) + goto fail_no_mem; + memcpy(data, avpkt->data, len); av_packet_unref(avpkt); - if (data == NULL) - return AVERROR(ENOMEM); - // We need to copy the header, but keep local if not global if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { avctx->extradata = data; @@ -567,18 +565,28 @@ dequeue: } // First frame must be key so mark as such even if encoder forgot - if (capture->first_buf == 2) + if (capture->first_buf == 2) { avpkt->flags |= AV_PKT_FLAG_KEY; + // Add any extradata to the 1st packet we emit as we cannot create it at init + if (avctx->extradata_size > 0 && avctx->extradata) { + void * const side = av_packet_new_side_data(avpkt, + AV_PKT_DATA_NEW_EXTRADATA, + avctx->extradata_size); + if (!side) + goto fail_no_mem; + + memcpy(side, avctx->extradata, avctx->extradata_size); + } + } + // Add SPS/PPS to the start of every key frame if non-global headers if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { const size_t newlen = s->extdata_size + avpkt->size; AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); - if (buf == NULL) { - av_packet_unref(avpkt); - return AVERROR(ENOMEM); - } + if (buf == NULL) + goto fail_no_mem; memcpy(buf->data, s->extdata_data, s->extdata_size); memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); @@ -592,6 +600,11 @@ dequeue: // av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); capture->first_buf = 0; return 0; + +fail_no_mem: + ret = AVERROR(ENOMEM); + av_packet_unref(avpkt); + return ret; } static av_cold int v4l2_encode_init(AVCodecContext *avctx) -- 2.43.0 From 23b9bcd1cd9bd08fdc298cf1f9fada05690a78d9 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 14 Sep 2022 15:44:10 +0000 Subject: [PATCH 065/157] matroskaenc: Allow H264 
SPS/PPS headers in packet sidedata --- libavformat/matroskaenc.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c index 113541bd9a..61e4c976ef 100644 --- a/libavformat/matroskaenc.c +++ b/libavformat/matroskaenc.c @@ -77,6 +77,10 @@ #define IS_WEBM(mkv) (CONFIG_WEBM_MUXER && CONFIG_MATROSKA_MUXER ? \ ((mkv)->mode == MODE_WEBM) : CONFIG_WEBM_MUXER) + +/* Reserved size for H264 headers if not extant at init time */ +#define MAX_H264_HEADER_SIZE 1024 + #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \ !(mkv)->is_live) @@ -1121,8 +1125,12 @@ static int mkv_assemble_native_codecprivate(AVFormatContext *s, AVIOContext *dyn case AV_CODEC_ID_WAVPACK: return put_wv_codecpriv(dyn_cp, extradata, extradata_size); case AV_CODEC_ID_H264: - return ff_isom_write_avcc(dyn_cp, extradata, - extradata_size); + if (par->extradata_size) + return ff_isom_write_avcc(dyn_cp, extradata, + extradata_size); + else + *size_to_reserve = MAX_H264_HEADER_SIZE; + break; case AV_CODEC_ID_HEVC: return ff_isom_write_hvcc(dyn_cp, extradata, extradata_size, 0); @@ -2731,8 +2739,8 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) } break; #endif - // FIXME: Remove the following once libaom starts propagating proper extradata during init() - // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2208 + // FIXME: Remove the following once libaom starts propagating extradata during init() + // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012 case AV_CODEC_ID_AV1: if (side_data_size && mkv->track.bc && !par->extradata_size) { // If the reserved space doesn't suffice, only write @@ -2744,6 +2752,16 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) } else if (!par->extradata_size) return AVERROR_INVALIDDATA; break; + // H264 V4L2 has a similar issue + case AV_CODEC_ID_H264: + if (side_data_size && mkv->track.bc && !par->extradata_size) { + ret = mkv_update_codecprivate(s, mkv, side_data, side_data_size, + par, mkv->track.bc, track, 0); + if (ret < 0) + return ret; + } else if (!par->extradata_size) + return AVERROR_INVALIDDATA; + break; default: if (side_data_size) av_log(s, AV_LOG_DEBUG, "Ignoring new extradata in a packet for stream %d.\n", pkt->stream_index); -- 2.43.0 From 7f542b8d4a9bd4cd3cfd8184ec55c3b44371afac Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 14 Sep 2022 15:55:15 +0000 Subject: [PATCH 066/157] movenc: Allow H264 SPS/PPS headers in packet sidedata --- libavformat/movenc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavformat/movenc.c b/libavformat/movenc.c index c4fcb5f8b1..891adbf7b2 100644 --- a/libavformat/movenc.c +++ b/libavformat/movenc.c @@ -6343,6 +6343,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt) if (trk->par->codec_id == AV_CODEC_ID_MP4ALS || trk->par->codec_id == AV_CODEC_ID_AAC || trk->par->codec_id == AV_CODEC_ID_AV1 || + trk->par->codec_id == AV_CODEC_ID_H264 || trk->par->codec_id == AV_CODEC_ID_FLAC) { size_t side_size; uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); -- 2.43.0 From 6d9aad2aaed49120a99e6ee0b89b758398bbfed2 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 12:45:05 +0100 Subject: [PATCH 067/157] Allow ffmpeg to select codec internal hwfmts if no_cvt_hw This allows the selection of DRM_PRIME from v4l2m2m without forcing it in the decoder. 
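Roughly, the change to get_format() amounts to the following (a sketch only, using the names already present in fftools/ffmpeg.c; the authoritative change is in the diff below):

    for (i = 0;; i++) {
        config = avcodec_get_hw_config(s->codec, i);
        if (!config)
            break;
        /* Accept a hw_device_ctx config as before or, when no_cvt_hw is set,
         * a codec-internal hwaccel (e.g. DRM_PRIME out of v4l2m2m). */
        if (!(config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) &&
            !(no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL)))
            continue;
        if (config->pix_fmt == *p)
            break;
    }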
Not utterly sure this is the right method for 5.1 but it does work --- fftools/ffmpeg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 04bea4ef4f..0de5346183 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -2766,12 +2766,15 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat break; if (ist->hwaccel_id == HWACCEL_GENERIC || - ist->hwaccel_id == HWACCEL_AUTO) { + ist->hwaccel_id == HWACCEL_AUTO || + no_cvt_hw) { for (i = 0;; i++) { config = avcodec_get_hw_config(s->codec, i); if (!config) break; - if (!(config->methods & + if (no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL)) + av_log(s, AV_LOG_DEBUG, "no_cvt_hw so trying pix_fmt %d with codec internal hwaccel\n", *p); + else if (!(config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX)) continue; if (config->pix_fmt == *p) -- 2.43.0 From e9e8cd3d160dfbe402f50a5d086e612b23f33ea8 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 1 Sep 2022 11:42:41 +0000 Subject: [PATCH 068/157] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler The logic for running an isp based scaler is pretty much identical to that for the deinterlacer so add to the deinterlacer. This requires some rework of the setup code to avoid assumptions that are true for deinterlace but not scale but the reworked code requires few switches based on operation. --- libavfilter/allfilters.c | 1 + libavfilter/vf_deinterlace_v4l2m2m.c | 1123 ++++++++++++++++++++------ 2 files changed, 877 insertions(+), 247 deletions(-) diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 357ff61ca8..d504fa1bc8 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -421,6 +421,7 @@ extern const AVFilter ff_vf_scale; extern const AVFilter ff_vf_scale_cuda; extern const AVFilter ff_vf_scale_npp; extern const AVFilter ff_vf_scale_qsv; +extern const AVFilter ff_vf_scale_v4l2m2m; extern const AVFilter ff_vf_scale_vaapi; extern const AVFilter ff_vf_scale_vulkan; extern const AVFilter ff_vf_scale2ref; diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index 1a3bef5bcb..2df39ec0f1 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -52,31 +52,36 @@ #include "avfilter.h" #include "formats.h" #include "internal.h" +#include "scale_eval.h" #include "video.h" +#ifndef DRM_FORMAT_P030 +#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ +#endif + typedef struct V4L2Queue V4L2Queue; typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; -typedef struct V4L2PlaneInfo { - int bytesperline; - size_t length; -} V4L2PlaneInfo; +typedef enum filter_type_v4l2_e +{ + FILTER_V4L2_DEINTERLACE = 1, + FILTER_V4L2_SCALE, +} filter_type_v4l2_t; typedef struct V4L2Buffer { int enqueued; int reenqueue; - int fd; struct v4l2_buffer buffer; AVFrame frame; struct v4l2_plane planes[VIDEO_MAX_PLANES]; int num_planes; - V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; AVDRMFrameDescriptor drm_frame; V4L2Queue *q; } V4L2Buffer; typedef struct V4L2Queue { struct v4l2_format format; + struct v4l2_selection sel; int num_buffers; V4L2Buffer *buffers; DeintV4L2M2MContextShared *ctx; @@ -111,11 +116,18 @@ typedef struct pts_track_s typedef struct DeintV4L2M2MContextShared { void * logctx; // For logging - will be NULL when done + filter_type_v4l2_t filter_type; int fd; int done; int width; int height; + + // from options + int 
output_width; + int output_height; + enum AVPixelFormat output_format; + int orig_width; int orig_height; atomic_uint refcount; @@ -134,8 +146,60 @@ typedef struct DeintV4L2M2MContext { const AVClass *class; DeintV4L2M2MContextShared *shared; + + char * w_expr; + char * h_expr; + char * output_format_string;; + + int force_original_aspect_ratio; + int force_divisible_by; + + char *colour_primaries_string; + char *colour_transfer_string; + char *colour_matrix_string; + int colour_range; + char *chroma_location_string; + + enum AVColorPrimaries colour_primaries; + enum AVColorTransferCharacteristic colour_transfer; + enum AVColorSpace colour_matrix; + enum AVChromaLocation chroma_location; } DeintV4L2M2MContext; +// These just list the ones we know we can cope with +static uint32_t +fmt_av_to_v4l2(const enum AVPixelFormat avfmt) +{ + switch (avfmt) { + case AV_PIX_FMT_YUV420P: + return V4L2_PIX_FMT_YUV420; + case AV_PIX_FMT_NV12: + return V4L2_PIX_FMT_NV12; + case AV_PIX_FMT_RPI4_8: + case AV_PIX_FMT_SAND128: + return V4L2_PIX_FMT_NV12_COL128; + default: + break; + } + return 0; +} + +static enum AVPixelFormat +fmt_v4l2_to_av(const uint32_t pixfmt) +{ + switch (pixfmt) { + case V4L2_PIX_FMT_YUV420: + return AV_PIX_FMT_YUV420P; + case V4L2_PIX_FMT_NV12: + return AV_PIX_FMT_NV12; + case V4L2_PIX_FMT_NV12_COL128: + return AV_PIX_FMT_RPI4_8; + default: + break; + } + return AV_PIX_FMT_NONE; +} + static unsigned int pts_stats_interval(const pts_stats_t * const stats) { return stats->last_interval; @@ -301,6 +365,39 @@ static int pts_track_init(pts_track_t * const trk, void *logctx) return 0; } +static inline uint32_t +fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline; +} + +static inline uint32_t +fmt_height(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; +} + +static inline uint32_t +fmt_width(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; +} + +static inline uint32_t +fmt_pixelformat(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; +} + +static void +init_format(V4L2Queue * const q, const uint32_t format_type) +{ + memset(&q->format, 0, sizeof(q->format)); + memset(&q->sel, 0, sizeof(q->sel)); + q->format.type = format_type; + q->sel.type = format_type; +} + static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) { struct v4l2_capability cap; @@ -311,80 +408,99 @@ static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) if (ret < 0) return ret; - if (!(cap.capabilities & V4L2_CAP_STREAMING)) + if (ctx->filter_type == FILTER_V4L2_SCALE && + strcmp("bcm2835-codec-isp", cap.card) != 0) + { + av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n"); return AVERROR(EINVAL); + } - if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { - ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; - - return 0; + if (!(cap.capabilities & V4L2_CAP_STREAMING)) { + av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n"); + return AVERROR(EINVAL); } if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { - ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; - ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; - - return 0; + init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE); + init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE); + } + else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { + init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE); + init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT); + } + else { + av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n"); + return AVERROR(EINVAL); } - return AVERROR(EINVAL); + return 0; } -static int deint_v4l2m2m_try_format(V4L2Queue *queue) +// Just use for probe - doesn't modify q format +static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt) { - struct v4l2_format *fmt = &queue->format; + struct v4l2_format fmt = {.type = queue->format.type}; DeintV4L2M2MContextShared *ctx = queue->ctx; int ret, field; + // Pick YUV to test with if not otherwise specified + uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? 
V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt); + enum AVPixelFormat r_avfmt; + - ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); + ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt); if (ret) av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); - if (V4L2_TYPE_IS_OUTPUT(fmt->type)) + if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type)) field = V4L2_FIELD_INTERLACED_TB; else field = V4L2_FIELD_NONE; - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { - fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; - fmt->fmt.pix_mp.field = field; - fmt->fmt.pix_mp.width = ctx->width; - fmt->fmt.pix_mp.height = ctx->height; + if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { + fmt.fmt.pix_mp.pixelformat = pixelformat; + fmt.fmt.pix_mp.field = field; + fmt.fmt.pix_mp.width = width; + fmt.fmt.pix_mp.height = height; } else { - fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; - fmt->fmt.pix.field = field; - fmt->fmt.pix.width = ctx->width; - fmt->fmt.pix.height = ctx->height; + fmt.fmt.pix.pixelformat = pixelformat; + fmt.fmt.pix.field = field; + fmt.fmt.pix.width = width; + fmt.fmt.pix.height = height; } - av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, - fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, - fmt->fmt.pix_mp.pixelformat, - fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, + fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, + fmt.fmt.pix_mp.pixelformat, + fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); - ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); + ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt); if (ret) return AVERROR(EINVAL); - av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, - fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, - fmt->fmt.pix_mp.pixelformat, - fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, + fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, + fmt.fmt.pix_mp.pixelformat, + fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { - if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && - fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || - fmt->fmt.pix_mp.field != field) { - av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt)); + if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) { + av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); + return AVERROR(EINVAL); + } + if (r_avfmt == AV_PIX_FMT_NONE) { + av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? 
"dest" : "src"); + return AVERROR(EINVAL); + } + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { + if (fmt.fmt.pix_mp.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); return AVERROR(EINVAL); } } else { - if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && - fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) || - fmt->fmt.pix.field != field) { - av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + if (fmt.fmt.pix.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); return AVERROR(EINVAL); } @@ -393,68 +509,410 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) return 0; } -static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) +static int +do_s_fmt(V4L2Queue * const q) { - struct v4l2_format *fmt = &queue->format; - DeintV4L2M2MContextShared *ctx = queue->ctx; + DeintV4L2M2MContextShared * const ctx = q->ctx; + const uint32_t pixelformat = fmt_pixelformat(&q->format); int ret; - struct v4l2_selection sel = { - .type = fmt->type, - .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, - }; - - // This works for most single object 4:2:0 types - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { - fmt->fmt.pix_mp.pixelformat = pixelformat; - fmt->fmt.pix_mp.field = field; - fmt->fmt.pix_mp.width = width; - fmt->fmt.pix_mp.height = ysize / pitch; - fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; - fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); - } else { - fmt->fmt.pix.pixelformat = pixelformat; - fmt->fmt.pix.field = field; - fmt->fmt.pix.width = width; - fmt->fmt.pix.height = height; - fmt->fmt.pix.sizeimage = 0; - fmt->fmt.pix.bytesperline = 0; - } - - ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); + ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format); if (ret) { ret = AVERROR(errno); - av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret)); return ret; } - if (pixelformat != fmt->fmt.pix.pixelformat) { - av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); + if (pixelformat != fmt_pixelformat(&q->format)) { + av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format))); return AVERROR(EINVAL); } - ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); + q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, + q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE; + + ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel); if (ret) { ret = AVERROR(errno); - av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); + av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret)); } - sel.r.width = width; - sel.r.height = height; - sel.r.left = 0; - sel.r.top = 0; - sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, - sel.flags = V4L2_SEL_FLAG_LE; + return 0; +} - ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); - if (ret) { - ret = AVERROR(errno); - av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); +static void +set_fmt_color(struct v4l2_format *const fmt, + const enum AVColorPrimaries avcp, + const enum AVColorSpace avcs, + const enum AVColorTransferCharacteristic avxc) +{ + enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; + enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; + enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; + + switch (avcp) { + case AVCOL_PRI_BT709: + cs = V4L2_COLORSPACE_REC709; + ycbcr = V4L2_YCBCR_ENC_709; + break; + case AVCOL_PRI_BT470M: + cs = V4L2_COLORSPACE_470_SYSTEM_M; + ycbcr = V4L2_YCBCR_ENC_601; + break; + case AVCOL_PRI_BT470BG: + cs = V4L2_COLORSPACE_470_SYSTEM_BG; + break; + case AVCOL_PRI_SMPTE170M: + cs = V4L2_COLORSPACE_SMPTE170M; + break; + case AVCOL_PRI_SMPTE240M: + cs = V4L2_COLORSPACE_SMPTE240M; + break; + case AVCOL_PRI_BT2020: + cs = V4L2_COLORSPACE_BT2020; + break; + case AVCOL_PRI_SMPTE428: + case AVCOL_PRI_SMPTE431: + case AVCOL_PRI_SMPTE432: + case AVCOL_PRI_EBU3213: + case AVCOL_PRI_RESERVED: + case AVCOL_PRI_FILM: + case AVCOL_PRI_UNSPECIFIED: + default: + break; + } + + switch (avcs) { + case AVCOL_SPC_RGB: + cs = V4L2_COLORSPACE_SRGB; + break; + case AVCOL_SPC_BT709: + cs = V4L2_COLORSPACE_REC709; + break; + case AVCOL_SPC_FCC: + cs = V4L2_COLORSPACE_470_SYSTEM_M; + break; + case AVCOL_SPC_BT470BG: + cs = V4L2_COLORSPACE_470_SYSTEM_BG; + break; + case AVCOL_SPC_SMPTE170M: + cs = V4L2_COLORSPACE_SMPTE170M; + break; + case AVCOL_SPC_SMPTE240M: + cs = V4L2_COLORSPACE_SMPTE240M; + break; + case AVCOL_SPC_BT2020_CL: + cs = V4L2_COLORSPACE_BT2020; + ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; + break; + case AVCOL_SPC_BT2020_NCL: + cs = V4L2_COLORSPACE_BT2020; + break; + default: + break; + } + + switch (xfer) { + case AVCOL_TRC_BT709: + xfer = V4L2_XFER_FUNC_709; + break; + case AVCOL_TRC_IEC61966_2_1: + xfer = V4L2_XFER_FUNC_SRGB; + break; + case AVCOL_TRC_SMPTE240M: + xfer = V4L2_XFER_FUNC_SMPTE240M; + break; + case AVCOL_TRC_SMPTE2084: + xfer = V4L2_XFER_FUNC_SMPTE2084; + break; + default: + break; + } + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.colorspace = cs; + fmt->fmt.pix_mp.ycbcr_enc = ycbcr; + fmt->fmt.pix_mp.xfer_func = xfer; + } else { + fmt->fmt.pix.colorspace = cs; + fmt->fmt.pix.ycbcr_enc = ycbcr; + fmt->fmt.pix.xfer_func = xfer; + } +} + +static void +set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr) +{ + const enum v4l2_quantization q = + avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : + avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : + V4L2_QUANTIZATION_DEFAULT; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.quantization = q; + } else { + fmt->fmt.pix.quantization = q; + } +} + +static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt) +{ + enum v4l2_ycbcr_encoding ycbcr; + enum v4l2_colorspace cs; + + cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.colorspace : + fmt->fmt.pix.colorspace; + + ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
+ fmt->fmt.pix_mp.ycbcr_enc: + fmt->fmt.pix.ycbcr_enc; + + switch(ycbcr) { + case V4L2_YCBCR_ENC_XV709: + case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709; + case V4L2_YCBCR_ENC_XV601: + case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M; + default: + break; + } + + switch(cs) { + case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG; + case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M; + case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M; + case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020; + default: + break; + } + + return AVCOL_PRI_UNSPECIFIED; +} + +static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt) +{ + enum v4l2_ycbcr_encoding ycbcr; + enum v4l2_colorspace cs; + + cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.colorspace : + fmt->fmt.pix.colorspace; + + ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.ycbcr_enc: + fmt->fmt.pix.ycbcr_enc; + + switch(cs) { + case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; + case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; + case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; + case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; + case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; + case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; + case V4L2_COLORSPACE_BT2020: + if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) + return AVCOL_SPC_BT2020_CL; + else + return AVCOL_SPC_BT2020_NCL; + default: + break; + } + + return AVCOL_SPC_UNSPECIFIED; +} + +static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt) +{ + enum v4l2_ycbcr_encoding ycbcr; + enum v4l2_xfer_func xfer; + enum v4l2_colorspace cs; + + cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.colorspace : + fmt->fmt.pix.colorspace; + + ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.ycbcr_enc: + fmt->fmt.pix.ycbcr_enc; + + xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.xfer_func: + fmt->fmt.pix.xfer_func; + + switch (xfer) { + case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709; + case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1; + default: + break; + } + + switch (cs) { + case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22; + case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28; + case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M; + case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M; + default: + break; + } + + switch (ycbcr) { + case V4L2_YCBCR_ENC_XV709: + case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG; + default: + break; + } + + return AVCOL_TRC_UNSPECIFIED; +} + +static enum AVColorRange get_color_range(const struct v4l2_format *const fmt) +{ + enum v4l2_quantization qt; + + qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? + fmt->fmt.pix_mp.quantization : + fmt->fmt.pix.quantization; + + switch (qt) { + case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; + case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; + default: + break; + } + + return AVCOL_RANGE_UNSPECIFIED; +} + +static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) +{ + struct v4l2_format *const format = &q->format; + const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; + + const uint32_t drm_fmt = src->layers[0].format; + // Treat INVALID as LINEAR + const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; + uint32_t pix_fmt = 0; + uint32_t w = 0; + uint32_t h = 0; + uint32_t bpl = src->layers[0].planes[0].pitch; + + // We really don't expect multiple layers + // All formats that we currently cope with are single object + + if (src->nb_layers != 1 || src->nb_objects != 1) + return AVERROR(EINVAL); + + switch (drm_fmt) { + case DRM_FORMAT_YUV420: + if (mod == DRM_FORMAT_MOD_LINEAR) { + if (src->layers[0].nb_planes != 3) + break; + pix_fmt = V4L2_PIX_FMT_YUV420; + h = src->layers[0].planes[1].offset / bpl; + w = bpl; + } + break; + + case DRM_FORMAT_NV12: + if (mod == DRM_FORMAT_MOD_LINEAR) { + if (src->layers[0].nb_planes != 2) + break; + pix_fmt = V4L2_PIX_FMT_NV12; + h = src->layers[0].planes[1].offset / bpl; + w = bpl; + } + else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { + if (src->layers[0].nb_planes != 2) + break; + pix_fmt = V4L2_PIX_FMT_NV12_COL128; + w = bpl; + h = src->layers[0].planes[1].offset / 128; + bpl = fourcc_mod_broadcom_param(mod); + } + break; + + case DRM_FORMAT_P030: + if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { + if (src->layers[0].nb_planes != 2) + break; + pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; + w = bpl / 2; // Matching lie to how we construct this + h = src->layers[0].planes[1].offset / 128; + bpl = fourcc_mod_broadcom_param(mod); + } + break; + + default: + break; + } + + if (!pix_fmt) + return AVERROR(EINVAL); + + if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { + struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; + + pix->width = w; + pix->height = h; + pix->pixelformat = pix_fmt; + pix->plane_fmt[0].bytesperline = bpl; + pix->num_planes = 1; + } + else { + struct v4l2_pix_format *const pix = &format->fmt.pix; + + pix->width = w; + pix->height = h; + pix->pixelformat = pix_fmt; + pix->bytesperline = bpl; } + set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc); + set_fmt_color_range(format, frame->color_range); + + q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right); + q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom); + q->sel.r.left = frame->crop_left; + q->sel.r.top = frame->crop_top; + return 0; } + +static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height) +{ + struct v4l2_format * const fmt = &queue->format; + struct v4l2_selection *const sel = &queue->sel; + + memset(&fmt->fmt, 0, sizeof(fmt->fmt)); + + // Align w/h to 16 here in case there are alignment requirements at the next + // stage of the filter chain (also RPi deinterlace setup is bust and this + // fixes it) + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.pixelformat = pixelformat; + fmt->fmt.pix_mp.field = field; + fmt->fmt.pix_mp.width = FFALIGN(width, 16); + fmt->fmt.pix_mp.height = FFALIGN(height, 16); + } else { + fmt->fmt.pix.pixelformat = pixelformat; + fmt->fmt.pix.field = field; + fmt->fmt.pix.width = FFALIGN(width, 16); + fmt->fmt.pix.height = FFALIGN(height, 16); + } + + set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer); + set_fmt_color_range(fmt, priv->colour_range); + + sel->r.width = width; + sel->r.height = height; + sel->r.left = 0; + sel->r.top = 0; + + return do_s_fmt(queue); +} + static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) { int ret; @@ -464,16 +922,22 @@ static int 
deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node return AVERROR(errno); ret = deint_v4l2m2m_prepare_context(ctx); - if (ret) + if (ret) { + av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n"); goto fail; + } - ret = deint_v4l2m2m_try_format(&ctx->capture); - if (ret) + ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format); + if (ret) { + av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n"); goto fail; + } - ret = deint_v4l2m2m_try_format(&ctx->output); - if (ret) + ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE); + if (ret) { + av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n"); goto fail; + } return 0; @@ -534,26 +998,118 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) return 0; } -static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) +static void +drm_frame_init(AVDRMFrameDescriptor * const d) +{ + unsigned int i; + for (i = 0; i != AV_DRM_MAX_PLANES; ++i) { + d->objects[i].fd = -1; + } +} + +static void +drm_frame_uninit(AVDRMFrameDescriptor * const d) +{ + unsigned int i; + for (i = 0; i != d->nb_objects; ++i) { + if (d->objects[i].fd != -1) { + close(d->objects[i].fd); + d->objects[i].fd = -1; + } + } +} + +static void +avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n) +{ + unsigned int i; + V4L2Buffer* const avbufs = *ppavbufs; + + if (avbufs == NULL) + return; + *ppavbufs = NULL; + + for (i = 0; i != n; ++i) { + V4L2Buffer* const avbuf = avbufs + i; + drm_frame_uninit(&avbuf->drm_frame); + } + + av_free(avbufs); +} + +static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) { struct v4l2_exportbuffer expbuf; int i, ret; uint64_t mod = DRM_FORMAT_MOD_LINEAR; - uint32_t fmt = 0; - switch (pixelformat) { - case V4L2_PIX_FMT_NV12: - fmt = DRM_FORMAT_NV12; - break; - case V4L2_PIX_FMT_YUV420: - fmt = DRM_FORMAT_YUV420; - break; - default: - return AVERROR(EINVAL); + AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; + const struct v4l2_format *const fmt = &q->format; + const uint32_t height = fmt_height(fmt); + const uint32_t width = fmt_width(fmt); + ptrdiff_t bpl0; + + /* fill the DRM frame descriptor */ + drm_desc->nb_layers = 1; + layer->nb_planes = avbuf->num_planes; + + for (int i = 0; i < avbuf->num_planes; i++) { + layer->planes[i].object_index = i; + layer->planes[i].offset = 0; + layer->planes[i].pitch = fmt_bpl(fmt, i); } + bpl0 = layer->planes[0].pitch; + + switch (fmt_pixelformat(fmt)) { + + case V4L2_PIX_FMT_NV12_COL128: + mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); + layer->format = V4L2_PIX_FMT_NV12; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 2; + layer->planes[1].object_index = 0; + layer->planes[1].offset = height * 128; + layer->planes[0].pitch = width; + layer->planes[1].pitch = width; + break; - avbuf->drm_frame.layers[0].format = fmt; + case DRM_FORMAT_NV12: + layer->format = V4L2_PIX_FMT_NV12; + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 2; + layer->planes[1].object_index = 0; + layer->planes[1].offset = bpl0 * height; + layer->planes[1].pitch = bpl0; + break; + + case V4L2_PIX_FMT_YUV420: + layer->format = DRM_FORMAT_YUV420; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 3; + layer->planes[1].object_index = 0; + layer->planes[1].offset = bpl0 * height; + layer->planes[1].pitch = bpl0 / 2; + 
layer->planes[2].object_index = 0; + layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4); + layer->planes[2].pitch = bpl0 / 2; + break; + + default: + drm_desc->nb_layers = 0; + return AVERROR(EINVAL); + } + + drm_desc->nb_objects = 0; for (i = 0; i < avbuf->num_planes; i++) { memset(&expbuf, 0, sizeof(expbuf)); @@ -565,19 +1121,11 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) if (ret < 0) return AVERROR(errno); - avbuf->fd = expbuf.fd; - - if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { - /* drm frame */ - avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; - avbuf->drm_frame.objects[i].fd = expbuf.fd; - avbuf->drm_frame.objects[i].format_modifier = mod; - } else { - /* drm frame */ - avbuf->drm_frame.objects[0].size = avbuf->buffer.length; - avbuf->drm_frame.objects[0].fd = expbuf.fd; - avbuf->drm_frame.objects[0].format_modifier = mod; - } + drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ? + avbuf->buffer.m.planes[i].length : avbuf->buffer.length; + drm_desc->objects[i].fd = expbuf.fd; + drm_desc->objects[i].format_modifier = mod; + drm_desc->nb_objects = i + 1; } return 0; @@ -588,7 +1136,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) struct v4l2_format *fmt = &queue->format; DeintV4L2M2MContextShared *ctx = queue->ctx; struct v4l2_requestbuffers req; - int ret, i, j, multiplanar; + int ret, i, multiplanar; uint32_t memory; memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? @@ -617,10 +1165,9 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) } for (i = 0; i < queue->num_buffers; i++) { - V4L2Buffer *buf = &queue->buffers[i]; + V4L2Buffer * const buf = &queue->buffers[i]; buf->enqueued = 0; - buf->fd = -1; buf->q = queue; buf->buffer.type = fmt->type; @@ -632,6 +1179,12 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) buf->buffer.m.planes = buf->planes; } + drm_frame_init(&buf->drm_frame); + } + + for (i = 0; i < queue->num_buffers; i++) { + V4L2Buffer * const buf = &queue->buffers[i]; + ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); if (ret < 0) { ret = AVERROR(errno); @@ -639,29 +1192,14 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) goto fail; } - if (multiplanar) - buf->num_planes = buf->buffer.length; - else - buf->num_planes = 1; - - for (j = 0; j < buf->num_planes; j++) { - V4L2PlaneInfo *info = &buf->plane_info[j]; - - if (multiplanar) { - info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; - info->length = buf->buffer.m.planes[j].length; - } else { - info->bytesperline = fmt->fmt.pix.bytesperline; - info->length = buf->buffer.length; - } - } + buf->num_planes = multiplanar ? buf->buffer.length : 1; if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { ret = deint_v4l2m2m_enqueue_buffer(buf); if (ret) goto fail; - ret = v4l2_buffer_export_drm(buf, multiplanar ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); + ret = v4l2_buffer_export_drm(queue, buf); if (ret) goto fail; } @@ -670,12 +1208,8 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) return 0; fail: - for (i = 0; i < queue->num_buffers; i++) - if (queue->buffers[i].fd >= 0) - close(queue->buffers[i].fd); - av_free(queue->buffers); - queue->buffers = NULL; - + avbufs_delete(&queue->buffers, queue->num_buffers); + queue->num_buffers = 0; return ret; } @@ -862,7 +1396,6 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { V4L2Queue *capture = &ctx->capture; V4L2Queue *output = &ctx->output; - int i; av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); @@ -871,12 +1404,7 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) deint_v4l2m2m_streamoff(output); } - if (capture->buffers) - for (i = 0; i < capture->num_buffers; i++) { - capture->buffers[i].q = NULL; - if (capture->buffers[i].fd >= 0) - close(capture->buffers[i].fd); - } + avbufs_delete(&capture->buffers, capture->num_buffers); deint_v4l2m2m_unref_queued(output); @@ -908,73 +1436,15 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) deint_v4l2m2m_destroy_context(ctx); } -static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) -{ - AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; - AVDRMLayerDescriptor *layer; - - /* fill the DRM frame descriptor */ - drm_desc->nb_objects = avbuf->num_planes; - drm_desc->nb_layers = 1; - - layer = &drm_desc->layers[0]; - layer->nb_planes = avbuf->num_planes; - - for (int i = 0; i < avbuf->num_planes; i++) { - layer->planes[i].object_index = i; - layer->planes[i].offset = 0; - layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; - } - - switch (layer->format) { - case DRM_FORMAT_YUYV: - layer->nb_planes = 1; - break; - - case DRM_FORMAT_NV12: - case DRM_FORMAT_NV21: - if (avbuf->num_planes > 1) - break; - - layer->nb_planes = 2; - - layer->planes[1].object_index = 0; - layer->planes[1].offset = avbuf->plane_info[0].bytesperline * - height; - layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; - break; - - case DRM_FORMAT_YUV420: - if (avbuf->num_planes > 1) - break; - - layer->nb_planes = 3; - - layer->planes[1].object_index = 0; - layer->planes[1].offset = avbuf->plane_info[0].bytesperline * - height; - layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; - - layer->planes[2].object_index = 0; - layer->planes[2].offset = layer->planes[1].offset + - ((avbuf->plane_info[0].bytesperline * - height) >> 2); - layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; - break; - - default: - drm_desc->nb_layers = 0; - break; - } - - return (uint8_t *) drm_desc; -} - // timeout in ms static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) { DeintV4L2M2MContextShared *ctx = queue->ctx; V4L2Buffer* avbuf; + enum AVColorPrimaries color_primaries; + enum AVColorSpace colorspace; + enum AVColorTransferCharacteristic color_trc; + enum AVColorRange color_range; av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); @@ -985,8 +1455,6 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim } // Fill in PTS and anciliary info from src frame - // we will want to overwrite some fields as only the pts/dts - // fields are updated with new timing in this fn pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); frame->buf[0] = av_buffer_create((uint8_t *) 
&avbuf->drm_frame, @@ -999,18 +1467,36 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim atomic_fetch_add(&ctx->refcount, 1); - frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); + frame->data[0] = (uint8_t *)&avbuf->drm_frame; frame->format = AV_PIX_FMT_DRM_PRIME; if (ctx->hw_frames_ctx) frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); - frame->height = ctx->height; - frame->width = ctx->width; - - // Not interlaced now - frame->interlaced_frame = 0; - frame->top_field_first = 0; - // Pkt duration halved - frame->pkt_duration /= 2; + frame->height = ctx->output_height; + frame->width = ctx->output_width; + + color_primaries = get_color_primaries(&ctx->capture.format); + colorspace = get_color_space(&ctx->capture.format); + color_trc = get_color_trc(&ctx->capture.format); + color_range = get_color_range(&ctx->capture.format); + + // If the color parameters are unspecified by V4L2 then leave alone as they + // will have been copied from src + if (color_primaries != AVCOL_PRI_UNSPECIFIED) + frame->color_primaries = color_primaries; + if (colorspace != AVCOL_SPC_UNSPECIFIED) + frame->colorspace = colorspace; + if (color_trc != AVCOL_TRC_UNSPECIFIED) + frame->color_trc = color_trc; + if (color_range != AVCOL_RANGE_UNSPECIFIED) + frame->color_range = color_range; + + if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) { + // Not interlaced now + frame->interlaced_frame = 0; // *** Fill in from dst buffer? + frame->top_field_first = 0; + // Pkt duration halved + frame->pkt_duration /= 2; + } if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); @@ -1032,15 +1518,34 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ctx->height = avctx->inputs[0]->h; ctx->width = avctx->inputs[0]->w; - av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); + if (ctx->filter_type == FILTER_V4L2_SCALE) { + if ((ret = ff_scale_eval_dimensions(priv, + priv->w_expr, priv->h_expr, + inlink, outlink, + &ctx->output_width, &ctx->output_height)) < 0) + return ret; + + ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height, + priv->force_original_aspect_ratio, priv->force_divisible_by); + } + else { + ctx->output_width = ctx->width; + ctx->output_height = ctx->height; + } + + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height); outlink->time_base = inlink->time_base; - outlink->w = inlink->w; - outlink->h = inlink->h; - outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + outlink->w = ctx->output_width; + outlink->h = ctx->output_height; outlink->format = inlink->format; outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate + if (inlink->sample_aspect_ratio.num) + outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); + else + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + ret = deint_v4l2m2m_find_device(ctx); if (ret) return ret; @@ -1055,18 +1560,19 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) { - const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || - drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID); + const uint64_t mod = drm_desc->objects[0].format_modifier; + const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || 
mod == DRM_FORMAT_MOD_INVALID); + + // Only currently support single object things + if (drm_desc->nb_objects != 1) + return 0; switch (drm_desc->layers[0].format) { case DRM_FORMAT_YUV420: - if (is_linear) - return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; - break; + return is_linear ? V4L2_PIX_FMT_YUV420 : 0; case DRM_FORMAT_NV12: - if (is_linear) - return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0; - break; + return is_linear ? V4L2_PIX_FMT_NV12 : + fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0; default: break; } @@ -1089,7 +1595,7 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) if (ctx->field_order == V4L2_FIELD_ANY) { const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; - const uint32_t pixelformat = desc_pixelformat(drm_desc); + uint32_t pixelformat = desc_pixelformat(drm_desc); if (pixelformat == 0) { av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", @@ -1104,29 +1610,49 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); - ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); - if (ret) + if ((ret = set_src_fmt(output, in)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n", + av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier); + return ret; + } + + ret = do_s_fmt(output); + if (ret) { + av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n"); return ret; + } - ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); - if (ret) + if (ctx->output_format != AV_PIX_FMT_NONE) + pixelformat = fmt_av_to_v4l2(ctx->output_format); + ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height); + if (ret) { + av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n"); return ret; + } ret = deint_v4l2m2m_allocate_buffers(capture); - if (ret) + if (ret) { + av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n"); return ret; + } ret = deint_v4l2m2m_streamon(capture); - if (ret) + if (ret) { + av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret)); return ret; + } ret = deint_v4l2m2m_allocate_buffers(output); - if (ret) + if (ret) { + av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n"); return ret; + } ret = deint_v4l2m2m_streamon(output); - if (ret) + if (ret) { + av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret)); return ret; + } if (in->top_field_first) ctx->field_order = V4L2_FIELD_INTERLACED_TB; @@ -1251,7 +1777,7 @@ again: return did_something ? 
0 : FFERROR_NOT_READY; } -static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) +static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type) { DeintV4L2M2MContext * const priv = avctx->priv; DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); @@ -1262,6 +1788,7 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) } priv->shared = ctx; ctx->logctx = priv; + ctx->filter_type = filter_type; ctx->fd = -1; ctx->output.ctx = ctx; ctx->output.num_buffers = 8; @@ -1274,9 +1801,52 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) atomic_init(&ctx->refcount, 1); + if (priv->output_format_string) { + ctx->output_format = av_get_pix_fmt(priv->output_format_string); + if (ctx->output_format == AV_PIX_FMT_NONE) { + av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string); + return AVERROR(EINVAL); + } + if (fmt_av_to_v4l2(ctx->output_format) == 0) { + av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format)); + return AVERROR(EINVAL); + } + } else { + // Use the input format once that is configured. + ctx->output_format = AV_PIX_FMT_NONE; + } + +#define STRING_OPTION(var_name, func_name, default_value) do { \ + if (priv->var_name ## _string) { \ + int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \ + if (var < 0) { \ + av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \ + return AVERROR(EINVAL); \ + } \ + priv->var_name = var; \ + } else { \ + priv->var_name = default_value; \ + } \ + } while (0) + + STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED); + STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED); + STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED); + STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED); + return 0; } +static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) +{ + return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE); +} + +static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) +{ + return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE); +} + static void deint_v4l2m2m_uninit(AVFilterContext *avctx) { DeintV4L2M2MContext *priv = avctx->priv; @@ -1294,6 +1864,51 @@ static const AVOption deinterlace_v4l2m2m_options[] = { AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); +#define OFFSET(x) offsetof(DeintV4L2M2MContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) + +static const AVOption scale_v4l2m2m_options[] = { + { "w", "Output video width", + OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS }, + { "h", "Output video height", + OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS }, + { "format", "Output video format (software format of hardware frames)", + OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS }, + // These colour properties match the ones of the same name in vf_scale. 
+ { "out_color_matrix", "Output colour matrix coefficient set", + OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS }, + { "out_range", "Output colour range", + OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, + AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" }, + { "full", "Full range", + 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, + { "limited", "Limited range", + 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, + { "jpeg", "Full range", + 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, + { "mpeg", "Limited range", + 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, + { "tv", "Limited range", + 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, + { "pc", "Full range", + 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, + // These colour properties match the ones in the VAAPI scaler + { "out_color_primaries", "Output colour primaries", + OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING, + { .str = NULL }, .flags = FLAGS }, + { "out_color_transfer", "Output colour transfer characteristics", + OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING, + { .str = NULL }, .flags = FLAGS }, + { "out_chroma_location", "Output chroma sample location", + OFFSET(chroma_location_string), AV_OPT_TYPE_STRING, + { .str = NULL }, .flags = FLAGS }, + { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, + { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS }, + { NULL }, +}; + +AVFILTER_DEFINE_CLASS(scale_v4l2m2m); + static const AVFilterPad deint_v4l2m2m_inputs[] = { { .name = "default", @@ -1321,3 +1936,17 @@ AVFilter ff_vf_deinterlace_v4l2m2m = { .priv_class = &deinterlace_v4l2m2m_class, .activate = deint_v4l2m2m_activate, }; + +AVFilter ff_vf_scale_v4l2m2m = { + .name = "scale_v4l2m2m", + .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"), + .priv_size = sizeof(DeintV4L2M2MContext), + .init = &scale_v4l2m2m_init, + .uninit = &deint_v4l2m2m_uninit, + FILTER_INPUTS(deint_v4l2m2m_inputs), + FILTER_OUTPUTS(deint_v4l2m2m_outputs), + FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME), + .priv_class = &scale_v4l2m2m_class, + .activate = deint_v4l2m2m_activate, +}; + -- 2.43.0 From 8fb46717eb7c37ca1ca1bc6a0ab73c28758a43ca Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 22 Sep 2022 14:54:46 +0000 Subject: [PATCH 069/157] v4l2_m2m: Adjust buffer allocation based on min/max controls Clip requested buffer count to min/max declared by driver. If 0 buffers requested then set to min+2. This allows encode to keep its src buffer count down to a plausible minimum which helps with flow control. 
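In outline, the clamping described above behaves like the sketch below. This is an illustration only (the helper name choose_buffer_count is not part of the patch); the real change queries V4L2_CID_MIN_BUFFERS_FOR_OUTPUT via VIDIOC_QUERYCTRL and falls back to a fixed default of 4 when the driver does not support the control.

    #include "libavutil/common.h"   // av_clip()

    // Illustrative helper, not patch code: pick a usable buffer count given
    // the driver's declared min/max. A request of 0 (or 1) means "choose for
    // me", which becomes min + 2 to leave a little slack for flow control.
    static int choose_buffer_count(int requested, int drv_min, int drv_max)
    {
        if (requested < 2)
            requested = drv_min + 2;
        return av_clip(requested, drv_min, drv_max);
    }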
---
 libavcodec/v4l2_context.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
index 6b97eab41e..ba36689ff3 100644
--- a/libavcodec/v4l2_context.c
+++ b/libavcodec/v4l2_context.c
@@ -1187,6 +1187,7 @@ fail_release:
 
 int ff_v4l2_context_init(V4L2Context* ctx)
 {
+    struct v4l2_queryctrl qctrl;
     V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
     int ret;
@@ -1228,6 +1229,24 @@ int ff_v4l2_context_init(V4L2Context* ctx)
         goto fail_unref_hwframes;
     }
+    memset(&qctrl, 0, sizeof(qctrl));
+    qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT;
+    if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) {
+        ret = AVERROR(errno);
+        if (ret != AVERROR(EINVAL)) {
+            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERYCTRL failed: %s\n", ctx->name, av_err2str(ret));
+            goto fail_unref_hwframes;
+        }
+        // Control unsupported - set default if wanted
+        if (ctx->num_buffers < 2)
+            ctx->num_buffers = 4;
+    }
+    else {
+        if (ctx->num_buffers < 2)
+            ctx->num_buffers = qctrl.minimum + 2;
+        ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum);
+    }
+
     ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
     if (ret < 0)
         goto fail_unref_hwframes;
-- 
2.43.0

From 4206d7c0edab7c426ec08cfe6b3d9fa0fbe9b96f Mon Sep 17 00:00:00 2001
From: John Cox
Date: Thu, 22 Sep 2022 15:00:12 +0000
Subject: [PATCH 070/157] v4l2_m2m_dec: If src Q is full then wait indefinitely for buffer

If it is not possible to add another buffer to the src Q then always wait
indefinitely for either an output frame or for the Q to have space. This has
issues if the Q is stalled by dst buffer exhaustion and buffers cannot be
returned asynchronously by another thread, but the current scheme confuses
ffmpeg's pipeline scheduling.
---
 libavcodec/v4l2_m2m_dec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
index 485a96f4b4..bb183097f6 100644
--- a/libavcodec/v4l2_m2m_dec.c
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -456,9 +456,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
     if (dst_rv != 0 && TRY_DQ(src_rv)) {
         // Pick a timeout depending on state
         const int t =
+            src_rv == NQ_Q_FULL ? -1 :
             src_rv == NQ_DRAINING ? 300 :
-            prefer_dq ? 5 :
-            src_rv == NQ_Q_FULL ? -1 : 0;
+            prefer_dq ? 5 : 0;
 
         // Dequeue frame will unref any previous contents of frame
         // if it returns success so we don't need an explicit unref
-- 
2.43.0

From 8ec5fd064c38299e96cdc7750a254768420765ad Mon Sep 17 00:00:00 2001
From: John Cox
Date: Thu, 22 Sep 2022 15:12:27 +0000
Subject: [PATCH 071/157] vf_deinterlace_v4l2m2m: Add Q name to structure for debug

---
 libavfilter/vf_deinterlace_v4l2m2m.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
index 2df39ec0f1..4edecc02bf 100644
--- a/libavfilter/vf_deinterlace_v4l2m2m.c
+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
@@ -84,6 +84,7 @@ typedef struct V4L2Queue {
     struct v4l2_selection sel;
     int num_buffers;
     V4L2Buffer *buffers;
+    const char * name;
     DeintV4L2M2MContextShared *ctx;
 } V4L2Queue;
@@ -1792,8 +1793,10 @@ static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filt
     ctx->fd = -1;
     ctx->output.ctx = ctx;
     ctx->output.num_buffers = 8;
+    ctx->output.name = "OUTPUT";
     ctx->capture.ctx = ctx;
     ctx->capture.num_buffers = 12;
+    ctx->capture.name = "CAPTURE";
     ctx->done = 0;
     ctx->field_order = V4L2_FIELD_ANY;
-- 
2.43.0

From c4512846a16d970f545fbb009abfcda30ccbbb05 Mon Sep 17 00:00:00 2001
From: John Cox
Date: Thu, 22 Sep 2022 16:08:42 +0000
Subject: [PATCH 072/157] v4l2_m2m_enc: Set src buffer count to min+2 by default

Set output.num_buffers to 0 by default, which will then be set to min+2 by the
allocation code. This fixes an issue where the deinterlacer had fewer dest
buffers than the encoder has src buffers and so ran dry, creating deadlock in
the ffmpeg filter chain.
---
 libavcodec/v4l2_m2m_enc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
index 099ad23928..b8ba815c37 100644
--- a/libavcodec/v4l2_m2m_enc.c
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -672,9 +672,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx)
 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 
 #define V4L_M2M_CAPTURE_OPTS \
-    V4L_M2M_DEFAULT_OPTS,\
+    { "num_output_buffers", "Number of buffers in the output context",\
+        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\
     { "num_capture_buffers", "Number of buffers in the capture context", \
-        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS }
+        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS }
 
 static const AVOption mpeg4_options[] = {
     V4L_M2M_CAPTURE_OPTS,
-- 
2.43.0

From ebc30a7e0624455b15ced4a0c6028717f61c3909 Mon Sep 17 00:00:00 2001
From: John Cox
Date: Thu, 22 Sep 2022 16:13:57 +0000
Subject: [PATCH 073/157] vf_deinterlace_m2m: For deinterlace set outlink FR to twice inlink

We used to set the outlink framerate to unknown, but it turns out that
ffmpeg's filter pipeline copes with that badly. Otherwise leave it at 0,0,
which will copy the FR from inlink to outlink.
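For example, a 25 fps interlaced input (frame_rate 25/1) now advertises 50/1 on the outlink. A rough sketch of the intent, mirroring the diff below (the patch only sets the rate when the filter is deinterlacing and the input rate is known; otherwise the field is left untouched so the inlink rate is copied through):

    // Sketch: deinterlace emits one frame per field, so the nominal output
    // rate is double the input rate; an unknown input rate is left alone.
    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0)
        outlink->frame_rate = (AVRational){ inlink->frame_rate.num * 2,
                                            inlink->frame_rate.den };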
--- libavfilter/vf_deinterlace_v4l2m2m.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index 4edecc02bf..c52dae1c44 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -1534,13 +1534,16 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ctx->output_height = ctx->height; } - av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height); + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__, + ctx->width, ctx->height, ctx->output_width, ctx->output_height, + inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den); outlink->time_base = inlink->time_base; outlink->w = ctx->output_width; outlink->h = ctx->output_height; outlink->format = inlink->format; - outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate + if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0) + outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den}; if (inlink->sample_aspect_ratio.num) outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); -- 2.43.0 From 78750b91795c470a6d2c49db27c951db7aae500c Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 23 Sep 2022 11:30:56 +0000 Subject: [PATCH 074/157] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from a Q Useful for where (encode) we might have drmprime buffers that we want to return to the source ASAP. --- libavcodec/v4l2_context.c | 17 +++++++++++------ libavcodec/v4l2_context.h | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index ba36689ff3..4a359bf45e 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -707,17 +707,22 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf) return avbuf; } +void +ff_v4l2_dq_all(V4L2Context *const ctx) +{ + V4L2Buffer * avbuf; + do { + get_qbuf(ctx, &avbuf, 0); + } while (avbuf); +} + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) { int i; /* get back as many output buffers as possible */ - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { - V4L2Buffer * avbuf; - do { - get_qbuf(ctx, &avbuf, 0); - } while (avbuf); - } + if (V4L2_TYPE_IS_OUTPUT(ctx->type)) + ff_v4l2_dq_all(ctx); for (i = 0; i < ctx->num_buffers; i++) { V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 21265f1bd7..523c53e97d 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -218,4 +218,6 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const */ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); +void ff_v4l2_dq_all(V4L2Context *const ctx); + #endif // AVCODEC_V4L2_CONTEXT_H -- 2.43.0 From 16b44ca308b9805a459645b0e00d448dd0dfd09f Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 23 Sep 2022 11:38:36 +0000 Subject: [PATCH 075/157] v4l2_m2m_enc: DQ output more frequently Ensure that we DQ any released src buffers on every op to avoid deadlock with source. There is a plausible argument that this patch is inelegant and the drain should be integrated into dq_buf, but that is a further reaching delta. 
--- libavcodec/v4l2_m2m_enc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index b8ba815c37..a992a3cccc 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -421,6 +421,8 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; V4L2Context *const output = &s->output; + ff_v4l2_dq_all(output); + // Signal EOF if needed if (!frame) { return ff_v4l2_context_enqueue_frame(output, frame); @@ -492,6 +494,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) AVFrame *frame = s->frame; int ret; + ff_v4l2_dq_all(output); + if (s->draining) goto dequeue; @@ -528,7 +532,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) } dequeue: - if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) + ret = ff_v4l2_context_dequeue_packet(capture, avpkt); + ff_v4l2_dq_all(output); + if (ret) return ret; if (capture->first_buf == 1) { @@ -560,7 +566,9 @@ dequeue: s->extdata_size = len; } - if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) + ret = ff_v4l2_context_dequeue_packet(capture, avpkt); + ff_v4l2_dq_all(output); + if (ret) return ret; } -- 2.43.0 From 6b90569e36985810367f0210c30c10ba3553c47a Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 29 Sep 2022 19:48:08 +0000 Subject: [PATCH 076/157] v4l2_m2m_dec: Deal correctly with avcC H264 data in extradata Decoders expect AnnexB style headers, mkv and similar formats have somewhat oddly wrapped extradata. Convert to annex-b style before use. --- libavcodec/v4l2_m2m.h | 2 +- libavcodec/v4l2_m2m_dec.c | 177 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 169 insertions(+), 10 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index ee72beb052..babf101d65 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -118,7 +118,7 @@ typedef struct V4L2m2mContext { /* Ext data sent */ int extdata_sent; /* Ext data sent in packet - overrides ctx */ - uint8_t * extdata_data; + void * extdata_data; size_t extdata_size; #define FF_V4L2_QUIRK_REINIT_ALWAYS 1 diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index bb183097f6..6bd9926b3f 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -46,6 +46,71 @@ #define STATS_LAST_COUNT_MAX 64 #define STATS_INTERVAL_MAX (1 << 30) +#ifndef FF_API_BUFFER_SIZE_T +#define FF_API_BUFFER_SIZE_T 1 +#endif + +#define DUMP_FAILED_EXTRADATA 0 + +#if DUMP_FAILED_EXTRADATA +static inline char hex1(unsigned int x) +{ + x &= 0xf; + return x <= 9 ? '0' + x : 'a' + x - 10; +} + +static inline char * hex2(char * s, unsigned int x) +{ + *s++ = hex1(x >> 4); + *s++ = hex1(x); + return s; +} + +static inline char * hex4(char * s, unsigned int x) +{ + s = hex2(s, x >> 8); + s = hex2(s, x); + return s; +} + +static inline char * dash2(char * s) +{ + *s++ = '-'; + *s++ = '-'; + return s; +} + +static void +data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len) +{ + size_t i; + s = hex4(s, offset); + m += offset; + for (i = 0; i != 8; ++i) { + *s++ = ' '; + s = len > i + offset ? hex2(s, *m++) : dash2(s); + } + *s++ = ' '; + *s++ = ':'; + for (; i != 16; ++i) { + *s++ = ' '; + s = len > i + offset ? 
hex2(s, *m++) : dash2(s); + } + *s++ = 0; +} + +static void +log_dump(void * logctx, int lvl, const void * const data, const size_t len) +{ + size_t i; + for (i = 0; i < len; i += 16) { + char buf[80]; + data16(buf, i, data, len); + av_log(logctx, lvl, "%s\n", buf); + } +} +#endif + static int64_t pts_stats_guess(const pts_stats_t * const stats) { if (stats->last_pts == AV_NOPTS_VALUE || @@ -98,6 +163,98 @@ static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char }; } +// If abdata == NULL then this just counts space required +// Unpacks avcC if detected +static int +h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata) +{ + const uint8_t * const xdend = extradata + extrasize; + const uint8_t * p = extradata; + uint8_t * d = abdata; + unsigned int n; + unsigned int len; + const unsigned int hdrlen = 4; + unsigned int need_pps = 1; + + if (extrasize < 8) + return AVERROR(EINVAL); + + if (p[0] == 0 && p[1] == 0) { + // Assume a couple of leading zeros are good enough to indicate NAL + if (abdata) + memcpy(d, p, extrasize); + return extrasize; + } + + // avcC starts with a 1 + if (p[0] != 1) + return AVERROR(EINVAL); + + p += 5; + n = *p++ & 0x1f; + +doxps: + while (n--) { + if (xdend - p < 2) + return AVERROR(EINVAL); + len = (p[0] << 8) | p[1]; + p += 2; + if (xdend - p < (ptrdiff_t)len) + return AVERROR(EINVAL); + if (abdata) { + d[0] = 0; + d[1] = 0; + d[2] = 0; + d[3] = 1; + memcpy(d + 4, p, len); + } + d += len + hdrlen; + p += len; + } + if (need_pps) { + need_pps = 0; + if (p >= xdend) + return AVERROR(EINVAL); + n = *p++; + goto doxps; + } + + return d - abdata; +} + +static int +copy_extradata(AVCodecContext * const avctx, + const void * const src_data, const int src_len, + void ** const pdst_data, size_t * const pdst_len) +{ + int len; + + *pdst_len = 0; + av_freep(pdst_data); + + if (avctx->codec_id == AV_CODEC_ID_H264) + len = h264_xd_copy(src_data, src_len, NULL); + else + len = src_len < 0 ? AVERROR(EINVAL) : src_len; + + // Zero length is OK but we swant to stop - -ve is error val + if (len <= 0) + return len; + + if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) + return AVERROR(ENOMEM); + + if (avctx->codec_id == AV_CODEC_ID_H264) + h264_xd_copy(src_data, src_len, *pdst_data); + else + memcpy(*pdst_data, src_data, len); + *pdst_len = len; + + return 0; +} + + + static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) { int ret; @@ -277,13 +434,8 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); if (side_data) { av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); - av_freep(&s->extdata_data); - if ((s->extdata_data = av_malloc(side_size ? 
side_size : 1)) == NULL) { - av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); - return AVERROR(ENOMEM); - } - memcpy(s->extdata_data, side_data, side_size); - s->extdata_size = side_size; + if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0) + av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret)); s->extdata_sent = 0; } @@ -359,8 +511,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); else if (s->extdata_data) ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); - else - ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); if (ret == AVERROR(EAGAIN)) { // Out of input buffers - keep packet @@ -770,6 +920,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } + if (avctx->extradata && + (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret)); +#if DUMP_FAILED_EXTRADATA + log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size); +#endif + return ret; + } + if ((ret = v4l2_prepare_decoder(s)) < 0) return ret; -- 2.43.0 From c3334f8e3ebeaeb79a78657b6d19d31d6b8a9105 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 30 Sep 2022 14:20:23 +0000 Subject: [PATCH 077/157] v4l2_request_hevc: Fix up V4L2_CID_CODEC_STATELESS_BASE if missing --- libavcodec/hevc-ctrls-v4.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h index 7829d82084..c02fdbe5a8 100644 --- a/libavcodec/hevc-ctrls-v4.h +++ b/libavcodec/hevc-ctrls-v4.h @@ -53,6 +53,13 @@ #include #include +#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS +#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */ +#endif +#ifndef V4L2_CID_CODEC_STATELESS_BASE +#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900) +#endif + #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) -- 2.43.0 From 957f55f5048da02400cf717acd9ff5a590897be6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sat, 1 Oct 2022 13:40:57 +0000 Subject: [PATCH 078/157] vf_deinterlace_v4l2m2m: Fix compile on m/c without V4L2 SAND --- libavfilter/vf_deinterlace_v4l2m2m.c | 33 +++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index c52dae1c44..716789f988 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -35,6 +35,8 @@ #include #include +#include "config.h" + #include "libavutil/avassert.h" #include "libavutil/avstring.h" #include "libavutil/common.h" @@ -59,6 +61,16 @@ #define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ #endif +// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined +// in drm_fourcc.h hopefully will be sometime in the future but until then... 
+#ifndef V4L2_PIX_FMT_NV12_10_COL128 +#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') +#endif + +#ifndef V4L2_PIX_FMT_NV12_COL128 +#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ +#endif + typedef struct V4L2Queue V4L2Queue; typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; @@ -176,9 +188,11 @@ fmt_av_to_v4l2(const enum AVPixelFormat avfmt) return V4L2_PIX_FMT_YUV420; case AV_PIX_FMT_NV12: return V4L2_PIX_FMT_NV12; +#if CONFIG_SAND case AV_PIX_FMT_RPI4_8: case AV_PIX_FMT_SAND128: return V4L2_PIX_FMT_NV12_COL128; +#endif default: break; } @@ -193,8 +207,10 @@ fmt_v4l2_to_av(const uint32_t pixfmt) return AV_PIX_FMT_YUV420P; case V4L2_PIX_FMT_NV12: return AV_PIX_FMT_NV12; +#if CONFIG_SAND case V4L2_PIX_FMT_NV12_COL128: return AV_PIX_FMT_RPI4_8; +#endif default: break; } @@ -823,6 +839,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) h = src->layers[0].planes[1].offset / bpl; w = bpl; } +#if CONFIG_SAND else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { if (src->layers[0].nb_planes != 2) break; @@ -831,9 +848,11 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) h = src->layers[0].planes[1].offset / 128; bpl = fourcc_mod_broadcom_param(mod); } +#endif break; case DRM_FORMAT_P030: +#if CONFIG_SAND if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { if (src->layers[0].nb_planes != 2) break; @@ -842,6 +861,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) h = src->layers[0].planes[1].offset / 128; bpl = fourcc_mod_broadcom_param(mod); } +#endif break; default: @@ -1048,7 +1068,6 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; const struct v4l2_format *const fmt = &q->format; const uint32_t height = fmt_height(fmt); - const uint32_t width = fmt_width(fmt); ptrdiff_t bpl0; /* fill the DRM frame descriptor */ @@ -1063,7 +1082,7 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) bpl0 = layer->planes[0].pitch; switch (fmt_pixelformat(fmt)) { - +#if CONFIG_SAND case V4L2_PIX_FMT_NV12_COL128: mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); layer->format = V4L2_PIX_FMT_NV12; @@ -1074,9 +1093,10 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) layer->nb_planes = 2; layer->planes[1].object_index = 0; layer->planes[1].offset = height * 128; - layer->planes[0].pitch = width; - layer->planes[1].pitch = width; + layer->planes[0].pitch = fmt_width(fmt); + layer->planes[1].pitch = layer->planes[0].pitch; break; +#endif case DRM_FORMAT_NV12: layer->format = V4L2_PIX_FMT_NV12; @@ -1576,7 +1596,10 @@ static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) return is_linear ? V4L2_PIX_FMT_YUV420 : 0; case DRM_FORMAT_NV12: return is_linear ? V4L2_PIX_FMT_NV12 : - fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0; +#if CONFIG_SAND + fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? 
V4L2_PIX_FMT_NV12_COL128 :
+#endif
+            0;
     default:
         break;
     }
-- 
2.43.0

From 1b190d6618734d7716fdf63b30dde9dcd3a575f0 Mon Sep 17 00:00:00 2001
From: John Cox
Date: Sun, 2 Oct 2022 12:36:43 +0000
Subject: [PATCH 079/157] configure: Fix v4l2_req_hevc_vx setup; set after deps fixups

---
 configure | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/configure b/configure
index f16f85dbc3..7d0c60124c 100755
--- a/configure
+++ b/configure
@@ -6906,12 +6906,6 @@ fi
 check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
 check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
 disable v4l2_req_hevc_vx
-if enabled hevc_v4l2request_hwaccel; then
-    enable v4l2_req_hevc_vx
-fi
-if enabled hevc_v4l2_request; then
-    disable v4l2_req_hevc_vx
-fi
 check_headers sys/videoio.h
 test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
@@ -7407,6 +7401,9 @@ check_deps $CONFIG_LIST \
 enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86"
 
+# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done
+enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx
 
 case $target_os in
 haiku)
     disable memalign
-- 
2.43.0

From 9599eebfaaea4a191a02aac705f2e56ee1d072f2 Mon Sep 17 00:00:00 2001
From: John Cox
Date: Sat, 1 Oct 2022 12:39:45 +0000
Subject: [PATCH 080/157] vf_deinterlace_v4l2m2m: Ensure we get consistent final frames

On getting EOS at the input of the filter, do not simply drop everything in
transit on the floor but attempt to retrieve everything possible from the
capture Q before signalling EOS onwards.
If we know that we expect 1 frame in to always produce 1 frame out then match CAPTURE frame to the last OUTPUT frame Qed (scale) If frames out have an unknown relation to source frames (deinterlace) try an encode stop and wait for the last frame marker to emerge from CAPTURE --- libavfilter/vf_deinterlace_v4l2m2m.c | 172 +++++++++++++++++++++++---- 1 file changed, 148 insertions(+), 24 deletions(-) diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index 716789f988..ce875c2c61 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -94,6 +94,7 @@ typedef struct V4L2Buffer { typedef struct V4L2Queue { struct v4l2_format format; struct v4l2_selection sel; + int eos; int num_buffers; V4L2Buffer *buffers; const char * name; @@ -127,20 +128,41 @@ typedef struct pts_track_s pts_track_el_t a[PTS_TRACK_SIZE]; } pts_track_t; +typedef enum drain_state_e +{ + DRAIN_NONE = 0, // Not draining + DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame + DRAIN_LAST, // Drain with long timeout last_frame in received on output expected + DRAIN_EOS, // Drain with long timeout EOS expected + DRAIN_DONE // Drained +} drain_state_t; + typedef struct DeintV4L2M2MContextShared { void * logctx; // For logging - will be NULL when done filter_type_v4l2_t filter_type; int fd; - int done; + int done; // fd closed - awating all refs dropped int width; int height; + int drain; // EOS received (inlink status) + drain_state_t drain_state; + int64_t drain_pts; // PTS associated with inline status + + unsigned int frames_rx; + unsigned int frames_tx; + // from options int output_width; int output_height; enum AVPixelFormat output_format; + int has_enc_stop; + // We expect to get exactly the same number of frames out as we put in + // We can drain by matching input to output + int one_to_one; + int orig_width; int orig_height; atomic_uint refcount; @@ -179,6 +201,12 @@ typedef struct DeintV4L2M2MContext { enum AVChromaLocation chroma_location; } DeintV4L2M2MContext; + +static inline int drain_frame_expected(const drain_state_t d) +{ + return d == DRAIN_EOS || d == DRAIN_LAST; +} + // These just list the ones we know we can cope with static uint32_t fmt_av_to_v4l2(const enum AVPixelFormat avfmt) @@ -334,6 +362,13 @@ fail: return 0; } +// We are only ever expecting in-order frames so nothing more clever is required +static unsigned int +pts_track_count(const pts_track_t * const trk) +{ + return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1); +} + static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) { const uint32_t n = pts_track_next_n(trk); @@ -406,6 +441,12 @@ fmt_pixelformat(const struct v4l2_format * const fmt) return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; } +static inline uint32_t +buf_bytesused0(const struct v4l2_buffer * const buf) +{ + return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? 
buf->m.planes[0].bytesused : buf->bytesused; +} + static void init_format(V4L2Queue * const q, const uint32_t format_type) { @@ -1469,12 +1510,24 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); + if (queue->eos) { + av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__); + return AVERROR_EOF; + } + avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); if (!avbuf) { av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); return AVERROR(EAGAIN); } + if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) { + if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0) + queue->eos = 1; + if (buf_bytesused0(&avbuf->buffer) == 0) + return queue->eos ? AVERROR_EOF : AVERROR(EINVAL); + } + // Fill in PTS and anciliary info from src frame pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); @@ -1686,6 +1739,20 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) else ctx->field_order = V4L2_FIELD_INTERLACED_BT; + { + struct v4l2_encoder_cmd ecmd = { + .cmd = V4L2_ENC_CMD_STOP + }; + ctx->has_enc_stop = 0; + if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) { + av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n"); + ctx->has_enc_stop = 1; + } + else { + av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno))); + } + + } } ret = deint_v4l2m2m_enqueue_frame(output, in); @@ -1694,6 +1761,41 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) return ret; } +static int +ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s, + AVFilterLink * const inlink) +{ + int instatus; + int64_t inpts; + + if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0) + return 0; + + s->drain = instatus; + s->drain_pts = inpts; + s->drain_state = DRAIN_TIMEOUT; + + if (s->field_order == V4L2_FIELD_ANY) { // Not yet started + s->drain_state = DRAIN_DONE; + } + else if (s->one_to_one) { + s->drain_state = DRAIN_LAST; + } + else if (s->has_enc_stop) { + struct v4l2_encoder_cmd ecmd = { + .cmd = V4L2_ENC_CMD_STOP + }; + if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) { + av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n"); + s->drain_state = DRAIN_EOS; + } + else { + av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno))); + } + } + return 1; +} + static int deint_v4l2m2m_activate(AVFilterContext *avctx) { DeintV4L2M2MContext * const priv = avctx->priv; @@ -1702,15 +1804,13 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) AVFilterLink * const inlink = avctx->inputs[0]; int n = 0; int cn = 99; - int instatus = 0; - int64_t inpts = 0; int did_something = 0; av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); - ff_inlink_acknowledge_status(inlink, &instatus, &inpts); + ack_inlink(avctx, s, inlink); if (!ff_outlink_frame_wanted(outlink)) { av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); @@ -1720,7 +1820,6 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) AVFrame * frame = av_frame_alloc(); int rv; -again: recycle_q(&s->output); n = count_enqueued(&s->output); @@ -1729,10 +1828,21 @@ again: return AVERROR(ENOMEM); } - rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0); + rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, + drain_frame_expected(s->drain_state) || n > 4 ? 
300 : 0); if (rv != 0) { av_frame_free(&frame); - if (rv != AVERROR(EAGAIN)) { + if (rv == AVERROR_EOF) { + av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__); + s->drain_state = DRAIN_DONE; + } + else if (rv == AVERROR(EAGAIN)) { + if (s->drain_state != DRAIN_NONE) { + av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__); + s->drain_state = DRAIN_DONE; + } + } + else { av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); return rv; } @@ -1742,29 +1852,30 @@ again: // frame is always consumed by filter_frame - even on error despite // a somewhat confusing comment in the header rv = ff_filter_frame(outlink, frame); - - if (instatus != 0) { - av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); - goto again; - } + ++s->frames_tx; av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); did_something = 1; + + if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) { + av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__); + s->drain_state = DRAIN_DONE; + } } cn = count_enqueued(&s->capture); } - if (instatus != 0) { - ff_outlink_set_status(outlink, instatus, inpts); - av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); + if (s->drain_state == DRAIN_DONE) { + ff_outlink_set_status(outlink, s->drain, s->drain_pts); + av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain)); return 0; } recycle_q(&s->output); n = count_enqueued(&s->output); - while (n < 6) { + while (n < 6 && !s->drain) { AVFrame * frame; int rv; @@ -1775,8 +1886,13 @@ again: if (frame == NULL) { av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); + if (!ack_inlink(avctx, s, inlink)) { + ff_inlink_request_frame(inlink); + av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); + } break; } + ++s->frames_rx; rv = deint_v4l2m2m_filter_frame(inlink, frame); av_frame_free(&frame); @@ -1785,16 +1901,11 @@ again: return rv; av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); - ++n; - } - - if (n < 6) { - ff_inlink_request_frame(inlink); did_something = 1; - av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); + ++n; } - if (n > 4 && ff_outlink_frame_wanted(outlink)) { + if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) { ff_filter_set_ready(avctx, 1); did_something = 1; av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); @@ -1873,7 +1984,18 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) { - return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE); + int rv; + DeintV4L2M2MContext * priv; + DeintV4L2M2MContextShared * ctx; + + if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0) + return rv; + + priv = avctx->priv; + ctx = priv->shared; + + ctx->one_to_one = 1; + return 0; } static void deint_v4l2m2m_uninit(AVFilterContext *avctx) @@ -1881,6 +2003,8 @@ static void deint_v4l2m2m_uninit(AVFilterContext *avctx) DeintV4L2M2MContext *priv = avctx->priv; DeintV4L2M2MContextShared *ctx = priv->shared; + av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n", + ctx->frames_rx, ctx->frames_tx); ctx->done = 1; ctx->logctx = NULL; // Log to NULL works, log to missing crashes pts_track_uninit(&ctx->track); -- 2.43.0 From 131d9b9cc221ed5972647160afbf82598a0e1c67 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 5 Oct 2022 16:12:02 +0000 Subject: [PATCH 081/157] v4l2_m2m_dec: Rework decode pending heuristic The old code measured the length of the entire Q in the 
decoder and attempted to dynamically guess an appropriate length. This was prone to failure when the guesswork became confused. The new code attempts to measure the Q length before insertion into decode which, after all, is what we actually care about. It does this by asserting that the decoder must have consumed all packets that came before the one associated with the most recent CAPTURE frame. This avoids all need for reorder buffer size guesswork. --- libavcodec/v4l2_m2m.h | 2 - libavcodec/v4l2_m2m_dec.c | 77 +++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index babf101d65..26a7161042 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -107,8 +107,6 @@ typedef struct V4L2m2mContext { /* Frame tracking */ xlat_track_t xlat; - int pending_hw; - int pending_n; pts_stats_t pts_stat; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 6bd9926b3f..bec9b22fcf 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -349,41 +349,54 @@ static void xlat_flush(xlat_track_t * const x) { unsigned int i; + // Do not reset track_no - this ensures that any frames left in the decoder + // that turn up later get discarded. + + x->last_pts = AV_NOPTS_VALUE; + x->last_opaque = 0; for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { x->track_els[i].pending = 0; x->track_els[i].discard = 1; } - x->last_pts = AV_NOPTS_VALUE; +} + +static void +xlat_init(xlat_track_t * const x) +{ + memset(x, 0, sizeof(*x)); + xlat_flush(x); } static int xlat_pending(const xlat_track_t * const x) { unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; - unsigned int i; - int r = 0; - int64_t now = AV_NOPTS_VALUE; + int i; + const int64_t now = x->last_pts; - for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { + for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { const V4L2m2mTrackEl * const t = x->track_els + n; + // Discard only set on never-set or flushed entries + // So if we get here we've never successfully decoded a frame so allow + // more frames into the buffer before stalling + if (t->discard) + return i - 16; + + // If we've got this frame out then everything before this point + // must have entered the decoder if (!t->pending) - continue; + break; + // If we've never seen a pts all we can do is count frames if (now == AV_NOPTS_VALUE) - now = t->dts; + continue; - if (t->pts == AV_NOPTS_VALUE || - ((now == AV_NOPTS_VALUE || t->pts <= now) && - (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) - ++r; + if (t->dts != AV_NOPTS_VALUE && now >= t->dts) + break; } - // If we never get any ideas about PTS vs DTS allow a lot more buffer - if (now == AV_NOPTS_VALUE) - r -= 16; - - return r; + return i; } static inline int stream_started(const V4L2m2mContext * const s) { @@ -557,18 +570,6 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) return rv; } -// Number of frames over what xlat_pending returns that we keep *16 -// This is a min value - if it appears to be too small the threshold should -// adjust dynamically. 
-#define PENDING_HW_MIN (3 * 16) -// Offset to use when setting dynamically -// Set to %16 == 15 to avoid the threshold changing immediately as we relax -#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) -// Number of consecutive times we've failed to get a frame when we prefer it -// before we increase the prefer threshold (5ms * N = max expected decode -// time) -#define PENDING_N_THRESHOLD 6 - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) { V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; @@ -578,9 +579,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) do { const int pending = xlat_pending(&s->xlat); - const int prefer_dq = (pending > s->pending_hw / 16); + const int prefer_dq = (pending > 3); const int last_src_rv = src_rv; + av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); + // Enqueue another pkt for decode if // (a) We don't have a lot of stuff in the buffer already OR // (b) ... we (think we) do but we've failed to get a frame already OR @@ -625,20 +628,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } } - // Adjust dynamic pending threshold - if (dst_rv == 0) { - if (--s->pending_hw < PENDING_HW_MIN) - s->pending_hw = PENDING_HW_MIN; - s->pending_n = 0; - + if (dst_rv == 0) set_best_effort_pts(avctx, &s->pts_stat, frame); - } - else if (dst_rv == AVERROR(EAGAIN)) { - if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { - s->pending_hw = pending * 16 + PENDING_HW_OFFSET; - s->pending_n = 0; - } - } if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); @@ -857,8 +848,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) if (ret < 0) return ret; + xlat_init(&s->xlat); pts_stats_init(&s->pts_stat, avctx, "decoder"); - s->pending_hw = PENDING_HW_MIN; capture = &s->capture; output = &s->output; -- 2.43.0 From 77e8a7c929864f7aa4362db7f0dfe93f9ac09254 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 21 Oct 2022 13:48:07 +0000 Subject: [PATCH 082/157] pthread_frame: Fix MT hwaccel. Recent change broke it. Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the hwaccel is marked MT_SAFE. 
--- libavcodec/pthread_frame.c | 48 ++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c index 2cc89a41f5..b14f8e9360 100644 --- a/libavcodec/pthread_frame.c +++ b/libavcodec/pthread_frame.c @@ -231,7 +231,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg) p->hwaccel_serializing = 0; pthread_mutex_unlock(&p->parent->hwaccel_mutex); } - av_assert0(!avctx->hwaccel); + av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); if (p->async_serializing) { p->async_serializing = 0; @@ -319,6 +319,12 @@ FF_ENABLE_DEPRECATION_WARNINGS } dst->hwaccel_flags = src->hwaccel_flags; + if (src->hwaccel && + (src->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + dst->hwaccel = src->hwaccel; + dst->hwaccel_context = src->hwaccel_context; + dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data; + } err = av_buffer_replace(&dst->internal->pool, src->internal->pool); if (err < 0) @@ -434,10 +440,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx, } /* transfer the stashed hwaccel state, if any */ - av_assert0(!p->avctx->hwaccel); - FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); - FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); - FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + av_assert0(!p->avctx->hwaccel || (p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); + if (p->avctx->hwaccel && + !(p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); + FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); + FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + } av_packet_unref(p->avpkt); ret = av_packet_ref(p->avpkt, avpkt); @@ -610,9 +619,12 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { * this is done here so that this worker thread can wipe its own hwaccel * state after decoding, without requiring synchronization */ av_assert0(!p->parent->stash_hwaccel); - p->parent->stash_hwaccel = avctx->hwaccel; - p->parent->stash_hwaccel_context = avctx->hwaccel_context; - p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + p->parent->stash_hwaccel = avctx->hwaccel; + p->parent->stash_hwaccel_context = avctx->hwaccel_context; + p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; + } pthread_mutex_lock(&p->progress_mutex); if(atomic_load(&p->state) == STATE_SETUP_FINISHED){ @@ -667,6 +679,15 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) park_frame_worker_threads(fctx, thread_count); + if (fctx->prev_thread && + avctx->hwaccel && (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && + avctx->internal->hwaccel_priv_data != + fctx->prev_thread->avctx->internal->hwaccel_priv_data) { + if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n"); + } + } + for (i = 0; i < thread_count; i++) { PerThreadContext *p = &fctx->threads[i]; AVCodecContext *ctx = p->avctx; @@ -710,10 +731,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) /* if we have stashed hwaccel state, move it to the user-facing context, * so it will be freed in avcodec_close() */ - av_assert0(!avctx->hwaccel); - FFSWAP(const 
AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); - FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); - FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); + FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); + FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + } av_freep(&avctx->internal->thread_ctx); } -- 2.43.0 From d4fee8420f0bf78ef24b0874973f999dda46d0a0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 13:18:27 +0000 Subject: [PATCH 083/157] v4l2_req: Add swfmt to init logging (cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf) --- libavcodec/v4l2_request_hevc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index 614a1b4d99..767ecb036a 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -26,6 +26,7 @@ #include "v4l2_request_hevc.h" #include "libavutil/hwcontext_drm.h" +#include "libavutil/pixdesc.h" #include "v4l2_req_devscan.h" #include "v4l2_req_dmabufs.h" @@ -306,10 +307,11 @@ retry_src_memtype: // Set our s/w format avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; - av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n", + av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n", ctx->fns->name, decdev_media_path(decdev), decdev_video_path(decdev), - mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype)); + mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype), + av_get_pix_fmt_name(avctx->sw_pix_fmt)); return 0; -- 2.43.0 From dc347192fe47e1256c68d7c45d8834398cdfec82 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 13:39:54 +0000 Subject: [PATCH 084/157] v4l2_m2m: Avoid polling on a queue that is streamoff (cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b) --- libavcodec/v4l2_context.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 4a359bf45e..b296dc111c 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -578,6 +578,11 @@ get_event(V4L2m2mContext * const m) return 0; } +static inline int +dq_ok(const V4L2Context * const c) +{ + return c->streamon && atomic_load(&c->q_count) != 0; +} // Get a buffer // If output then just gets the buffer in the expected way @@ -613,13 +618,13 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout } // If capture && timeout == -1 then also wait for rx buffer free - if (is_cap && timeout == -1 && m->output.streamon && !m->draining) + if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining) pfd.events |= poll_out; // If nothing Qed all we will get is POLLERR - avoid that - if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || - (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || - (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + if ((pfd.events == poll_out && !dq_ok(&m->output)) || + (pfd.events == poll_cap && !dq_ok(&m->capture)) || + (pfd.events == (poll_cap | 
poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) { av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); return AVERROR(ENOSPC); } -- 2.43.0 From 43b92358bc6f2f563c6d7c211239153badddda44 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 14:07:04 +0000 Subject: [PATCH 085/157] v4l2_m2m: Add function to get number of queued buffers (cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4) --- libavcodec/v4l2_context.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 523c53e97d..8e4f681643 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -220,4 +220,15 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); void ff_v4l2_dq_all(V4L2Context *const ctx); +/** + * Returns the number of buffers currently queued + * + * @param[in] ctx The V4L2Context to evaluate + */ +static inline int +ff_v4l2_context_q_count(const V4L2Context* const ctx) +{ + return atomic_load(&ctx->q_count); +} + #endif // AVCODEC_V4L2_CONTEXT_H -- 2.43.0 From c122e985b7b3cfd78a5896d9a365715a887fea65 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 14:48:20 +0000 Subject: [PATCH 086/157] v4l2_m2m: Add timeouts to dq_all and dequeue_packet Add timeouts and use them to have better flow control in encode (cherry picked from commit c6173cad7f21697e12887982bda796de9719bb32) --- libavcodec/v4l2_context.c | 16 +++++++++++----- libavcodec/v4l2_context.h | 15 +++++++++++++-- libavcodec/v4l2_m2m_enc.c | 28 +++++++++++++++++++--------- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index b296dc111c..7031f3d340 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -712,13 +712,19 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf) return avbuf; } -void -ff_v4l2_dq_all(V4L2Context *const ctx) +int +ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1) { V4L2Buffer * avbuf; + if (timeout1 != 0) { + int rv = get_qbuf(ctx, &avbuf, timeout1); + if (rv != 0) + return rv; + } do { get_qbuf(ctx, &avbuf, 0); } while (avbuf); + return 0; } static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) @@ -727,7 +733,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) /* get back as many output buffers as possible */ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - ff_v4l2_dq_all(ctx); + ff_v4l2_dq_all(ctx, 0); for (i = 0; i < ctx->num_buffers; i++) { V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; @@ -1047,7 +1053,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) return 0; } -int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) +int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); AVCodecContext *const avctx = s->avctx; @@ -1055,7 +1061,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) int rv; do { - if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) + if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) return rv; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 8e4f681643..5afed3e6ec 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -179,7 +179,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); * @param[inout] pkt The AVPacket to dequeue to. 
* @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. */ -int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); +int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout); /** * Dequeues a buffer from a V4L2Context to an AVFrame. @@ -218,7 +218,18 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const */ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); -void ff_v4l2_dq_all(V4L2Context *const ctx); +/** + * Dequeue all buffers on this queue + * + * Used to recycle output buffers + * + * @param[in] ctx The V4L2Context to dequeue from. + * @param[in] timeout1 A timeout on dequeuing the 1st buffer, + * all others have a timeout of zero + * @return AVERROR(EAGAIN) if timeout1 non-zero then the return + * of the first dequeue operation, 0 otherwise. + */ +int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1); /** * Returns the number of buffers currently queued diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index a992a3cccc..d0d27e5bc2 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -420,16 +420,24 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) { V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; V4L2Context *const output = &s->output; + int rv; + int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers; - ff_v4l2_dq_all(output); + av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); - // Signal EOF if needed + // Signal EOF if needed (doesn't need q slot) if (!frame) { return ff_v4l2_context_enqueue_frame(output, frame); } + if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) { + // We should be able to return AVERROR(EAGAIN) to indicate buffer + // exhaustion, but ffmpeg currently treats that as fatal. + av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv)); + return rv; + } + if (s->input_drm && !output->streamon) { - int rv; struct v4l2_format req_format = {.type = output->format.type}; // Set format when we first get a buffer @@ -494,7 +502,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) AVFrame *frame = s->frame; int ret; - ff_v4l2_dq_all(output); + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + + ff_v4l2_dq_all(output, 0); if (s->draining) goto dequeue; @@ -532,10 +542,10 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) } dequeue: - ret = ff_v4l2_context_dequeue_packet(capture, avpkt); - ff_v4l2_dq_all(output); + ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0); + ff_v4l2_dq_all(output, 0); if (ret) - return ret; + return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; if (capture->first_buf == 1) { uint8_t * data; @@ -566,8 +576,8 @@ dequeue: s->extdata_size = len; } - ret = ff_v4l2_context_dequeue_packet(capture, avpkt); - ff_v4l2_dq_all(output); + ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0); + ff_v4l2_dq_all(output, 0); if (ret) return ret; } -- 2.43.0 From 6e86d7e50f86ee8e32f15b2d78507e65d58e13f6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 14:23:32 +0000 Subject: [PATCH 087/157] v4l2_m2m_enc: Improve debug trace (cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5) --- libavcodec/v4l2_m2m_enc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index d0d27e5bc2..c8c2de3d47 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -427,6 +427,7 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) // Signal EOF if needed (doesn't need q slot) if (!frame) { + av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__); return ff_v4l2_context_enqueue_frame(output, frame); } @@ -491,7 +492,12 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); #endif - return ff_v4l2_context_enqueue_frame(output, frame); + rv = ff_v4l2_context_enqueue_frame(output, frame); + if (rv) { + av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv)); + } + + return rv; } static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) @@ -502,7 +508,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) AVFrame *frame = s->frame; int ret; - av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__, + ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture)); ff_v4l2_dq_all(output, 0); @@ -615,11 +622,11 @@ dequeue: avpkt->size = newlen; } -// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); capture->first_buf = 0; return 0; fail_no_mem: + av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n"); ret = AVERROR(ENOMEM); av_packet_unref(avpkt); return ret; -- 2.43.0 From 8c7a87f44738b35242ce0e67fbc1611715adfa58 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 13:22:36 +0000 Subject: [PATCH 088/157] v4l2_m2m_enc: Copy dest packets to memory if short of v4l2 buffers (cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5) --- libavcodec/v4l2_m2m_enc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index c8c2de3d47..c23187e6e6 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -621,6 +621,22 @@ dequeue: avpkt->data = buf->data; avpkt->size = newlen; } + else if (ff_v4l2_context_q_count(capture) < 2) { + // Avoid running out of capture buffers + // In most cases the buffers will be returned quickly in which case + // we don't copy and can use the v4l2 buffers directly but sometimes + // ffmpeg seems to hold onto all of them for a long time (.mkv + // creation?) so avoid deadlock in those cases. 
+ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE); + if (buf == NULL) + goto fail_no_mem; + + memcpy(buf->data, avpkt->data, avpkt->size); + av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer + + avpkt->buf = buf; + avpkt->data = buf->data; + } capture->first_buf = 0; return 0; -- 2.43.0 From 95faf2fb7da5c7c2f6060fd737836b2143d6502d Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 11:00:16 +0000 Subject: [PATCH 089/157] v4l2_m2m_dec: Fix pts_best_effort guessing for initial pts (cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67) --- libavcodec/v4l2_m2m_dec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index bec9b22fcf..47b2735f82 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -113,6 +113,8 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len) static int64_t pts_stats_guess(const pts_stats_t * const stats) { + if (stats->last_count <= 1) + return stats->last_pts; if (stats->last_pts == AV_NOPTS_VALUE || stats->last_interval == 0 || stats->last_count >= STATS_LAST_COUNT_MAX) -- 2.43.0 From f244a22b89417917a7feee962e253fa1eb94246a Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 14:47:04 +0000 Subject: [PATCH 090/157] v4l2_m2m_enc: Wait for frame or space in src Q in rx_pkt If receive_packet we should ensure that there is space in the source Q if we return EAGAIN so wait for either an output packet or space if the source Q is currently full. (cherry picked from commit 82f0c55782a67a8cc665d937647706c2a75f5548) --- libavcodec/v4l2_m2m_enc.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c index c23187e6e6..524e9424a5 100644 --- a/libavcodec/v4l2_m2m_enc.c +++ b/libavcodec/v4l2_m2m_enc.c @@ -415,13 +415,17 @@ static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * return 1; } +static inline int q_full(const V4L2Context *const output) +{ + return ff_v4l2_context_q_count(output) == output->num_buffers; +} static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) { V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; V4L2Context *const output = &s->output; int rv; - int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers; + const int needs_slot = q_full(output); av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); @@ -549,8 +553,20 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) } dequeue: - ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0); - ff_v4l2_dq_all(output, 0); + // Dequeue a frame + for (;;) { + int t = q_full(output) ? -1 : s->draining ? 300 : 0; + int rv2; + + // If output is full wait for either a packet or output to become not full + ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t); + + // If output was full retry packet dequeue + t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300; + rv2 = ff_v4l2_dq_all(output, t); + if (t == 0 || rv2 != 0) + break; + } if (ret) return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; -- 2.43.0 From cc147fd26f95618d75a910962c9b1e95884db82f Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 14:54:29 +0000 Subject: [PATCH 091/157] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS in trace (cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a) --- libavfilter/vf_deinterlace_v4l2m2m.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index ce875c2c61..7c6751b69c 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -1668,8 +1668,8 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) V4L2Queue *output = &ctx->output; int ret; - av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", - __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); + av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n", + __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); -- 2.43.0 From 1649075f8620190eec7ad5b8e721afa5c72e0d81 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 14:55:21 +0000 Subject: [PATCH 092/157] vf_deinterlace_v4l2m2m: Ignore "wanted" when processing input If we gate send a frame to the outlink on its frame_wanted flag then we will sometimes stall as the flag may not get set by ffmpeg's filter processing. So stuff the output whether or not it wants it which works much better. (cherry picked from commit 808254cc04e5e6574cbab9af254b6c2f3d4142e3) --- libavfilter/vf_deinterlace_v4l2m2m.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c index 7c6751b69c..a173a291f8 100644 --- a/libavfilter/vf_deinterlace_v4l2m2m.c +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -1812,10 +1812,7 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) ack_inlink(avctx, s, inlink); - if (!ff_outlink_frame_wanted(outlink)) { - av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); - } - else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! + if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! 
{ AVFrame * frame = av_frame_alloc(); int rv; -- 2.43.0 From f143352c0d9571df67bbc833b911be8edef1f658 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 15 Nov 2022 13:33:00 +0000 Subject: [PATCH 093/157] egl_vout: Make formatting consistent - no code changes --- libavdevice/egl_vout.c | 741 ++++++++++++++++++++--------------------- 1 file changed, 369 insertions(+), 372 deletions(-) diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c index 7b9c610ace..a52cabb082 100644 --- a/libavdevice/egl_vout.c +++ b/libavdevice/egl_vout.c @@ -48,20 +48,20 @@ #define TRACE_ALL 0 struct egl_setup { - int conId; - - Display *dpy; - EGLDisplay egl_dpy; - EGLContext ctx; - EGLSurface surf; - Window win; - - uint32_t crtcId; - int crtcIdx; - uint32_t planeId; - struct { - int x, y, width, height; - } compose; + int conId; + + Display *dpy; + EGLDisplay egl_dpy; + EGLContext ctx; + EGLSurface surf; + Window win; + + uint32_t crtcId; + int crtcIdx; + uint32_t planeId; + struct { + int x, y, width, height; + } compose; }; typedef struct egl_aux_s { @@ -70,8 +70,7 @@ typedef struct egl_aux_s { } egl_aux_t; -typedef struct egl_display_env_s -{ +typedef struct egl_display_env_s { AVClass *class; struct egl_setup setup; @@ -89,8 +88,8 @@ typedef struct egl_display_env_s sem_t display_start_sem; sem_t q_sem; int q_terminate; - AVFrame * q_this; - AVFrame * q_next; + AVFrame *q_this; + AVFrame *q_next; } egl_display_env_t; @@ -99,45 +98,44 @@ typedef struct egl_display_env_s * Remove window border/decorations. */ static void -no_border( Display *dpy, Window w) +no_border(Display *dpy, Window w) { - static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); - static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; - - typedef struct - { - unsigned long flags; - unsigned long functions; - unsigned long decorations; - long inputMode; - unsigned long status; - } PropMotifWmHints; - - PropMotifWmHints motif_hints; - Atom prop, proptype; - unsigned long flags = 0; - - /* setup the property */ - motif_hints.flags = MWM_HINTS_DECORATIONS; - motif_hints.decorations = flags; - - /* get the atom for the property */ - prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); - if (!prop) { - /* something went wrong! */ - return; - } - - /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ - proptype = prop; - - XChangeProperty( dpy, w, /* display, window */ + static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); + static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; + + typedef struct { + unsigned long flags; + unsigned long functions; + unsigned long decorations; + long inputMode; + unsigned long status; + } PropMotifWmHints; + + PropMotifWmHints motif_hints; + Atom prop, proptype; + unsigned long flags = 0; + + /* setup the property */ + motif_hints.flags = MWM_HINTS_DECORATIONS; + motif_hints.decorations = flags; + + /* get the atom for the property */ + prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True); + if (!prop) { + /* something went wrong! */ + return; + } + + /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ + proptype = prop; + + XChangeProperty(dpy, w, /* display, window */ prop, proptype, /* property, type */ 32, /* format: 32-bit datums */ PropModeReplace, /* mode */ - (unsigned char *) &motif_hints, /* data */ + (unsigned char *)&motif_hints, /* data */ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ - ); + ); } @@ -146,247 +144,247 @@ no_border( Display *dpy, Window w) * Return the window and context handles. 
*/ static int -make_window(struct AVFormatContext * const s, - egl_display_env_t * const de, +make_window(struct AVFormatContext *const s, + egl_display_env_t *const de, Display *dpy, EGLDisplay egl_dpy, const char *name, Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) { - int scrnum = DefaultScreen( dpy ); - XSetWindowAttributes attr; - unsigned long mask; - Window root = RootWindow( dpy, scrnum ); - Window win; - EGLContext ctx; - const int fullscreen = de->fullscreen; - EGLConfig config; - int x = de->window_x; - int y = de->window_y; - int width = de->window_width ? de->window_width : 1280; - int height = de->window_height ? de->window_height : 720; - - - if (fullscreen) { - int scrnum = DefaultScreen(dpy); - - x = 0; y = 0; - width = DisplayWidth(dpy, scrnum); - height = DisplayHeight(dpy, scrnum); - } - - { - EGLint num_configs; - static const EGLint attribs[] = { - EGL_RED_SIZE, 1, - EGL_GREEN_SIZE, 1, - EGL_BLUE_SIZE, 1, - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, - EGL_NONE - }; - - if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { - av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); - return -1; - } - } - - { - EGLint vid; - if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { - av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); - return -1; - } - - { - XVisualInfo visTemplate = { - .visualid = vid, - }; - int num_visuals; - XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, - &visTemplate, &num_visuals); - - /* window attributes */ - attr.background_pixel = 0; - attr.border_pixel = 0; - attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); - attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; - /* XXX this is a bad way to get a borderless window! */ - mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; - - win = XCreateWindow( dpy, root, x, y, width, height, - 0, visinfo->depth, InputOutput, - visinfo->visual, mask, &attr ); - XFree(visinfo); - } - } - - if (fullscreen) - no_border(dpy, win); - - /* set hints and properties */ - { - XSizeHints sizehints; - sizehints.x = x; - sizehints.y = y; - sizehints.width = width; - sizehints.height = height; - sizehints.flags = USSize | USPosition; - XSetNormalHints(dpy, win, &sizehints); - XSetStandardProperties(dpy, win, name, name, - None, (char **)NULL, 0, &sizehints); - } - - eglBindAPI(EGL_OPENGL_ES_API); - - { - static const EGLint ctx_attribs[] = { - EGL_CONTEXT_CLIENT_VERSION, 2, - EGL_NONE - }; - ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); - if (!ctx) { - av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); - return -1; - } - } - - - XMapWindow(dpy, win); - - { - EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); - if (!surf) { - av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); - return -1; - } - - if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { - av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); - return -1; - } - - *winRet = win; - *ctxRet = ctx; - *surfRet = surf; - } - - return 0; + int scrnum = DefaultScreen(dpy); + XSetWindowAttributes attr; + unsigned long mask; + Window root = RootWindow(dpy, scrnum); + Window win; + EGLContext ctx; + const int fullscreen = de->fullscreen; + EGLConfig config; + int x = de->window_x; + int y = de->window_y; + int width = de->window_width ? de->window_width : 1280; + int height = de->window_height ? 
de->window_height : 720; + + + if (fullscreen) { + int scrnum = DefaultScreen(dpy); + + x = 0; y = 0; + width = DisplayWidth(dpy, scrnum); + height = DisplayHeight(dpy, scrnum); + } + + { + EGLint num_configs; + static const EGLint attribs[] = { + EGL_RED_SIZE, 1, + EGL_GREEN_SIZE, 1, + EGL_BLUE_SIZE, 1, + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL_NONE + }; + + if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { + av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); + return -1; + } + } + + { + EGLint vid; + if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { + av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); + return -1; + } + + { + XVisualInfo visTemplate = { + .visualid = vid, + }; + int num_visuals; + XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, + &visTemplate, &num_visuals); + + /* window attributes */ + attr.background_pixel = 0; + attr.border_pixel = 0; + attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone); + attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; + /* XXX this is a bad way to get a borderless window! */ + mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; + + win = XCreateWindow(dpy, root, x, y, width, height, + 0, visinfo->depth, InputOutput, + visinfo->visual, mask, &attr); + XFree(visinfo); + } + } + + if (fullscreen) + no_border(dpy, win); + + /* set hints and properties */ + { + XSizeHints sizehints; + sizehints.x = x; + sizehints.y = y; + sizehints.width = width; + sizehints.height = height; + sizehints.flags = USSize | USPosition; + XSetNormalHints(dpy, win, &sizehints); + XSetStandardProperties(dpy, win, name, name, + None, (char **)NULL, 0, &sizehints); + } + + eglBindAPI(EGL_OPENGL_ES_API); + + { + static const EGLint ctx_attribs[] = { + EGL_CONTEXT_CLIENT_VERSION, 2, + EGL_NONE + }; + ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs); + if (!ctx) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); + return -1; + } + } + + + XMapWindow(dpy, win); + + { + EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); + if (!surf) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); + return -1; + } + + if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); + return -1; + } + + *winRet = win; + *ctxRet = ctx; + *surfRet = surf; + } + + return 0; } static GLint -compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) +compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source) { - GLuint s = glCreateShader(target); + GLuint s = glCreateShader(target); - if (s == 0) { - av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); - return 0; - } + if (s == 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); + return 0; + } - glShaderSource(s, 1, (const GLchar **) &source, NULL); - glCompileShader(s); + glShaderSource(s, 1, (const GLchar **)&source, NULL); + glCompileShader(s); - { - GLint ok; - glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + { + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); - if (!ok) { - GLchar *info; - GLint size; + if (!ok) { + GLchar *info; + GLint size; - glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); - info = malloc(size); + glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); + info = malloc(size); - glGetShaderInfoLog(s, size, NULL, info); - av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: 
%ssource:\n%s\n", info, source); + glGetShaderInfoLog(s, size, NULL, info); + av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); - return 0; - } - } + return 0; + } + } - return s; + return s; } -static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) +static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs) { - GLuint prog = glCreateProgram(); - - if (prog == 0) { - av_log(s, AV_LOG_ERROR, "Failed to create program\n"); - return 0; - } - - glAttachShader(prog, vs); - glAttachShader(prog, fs); - glLinkProgram(prog); - - { - GLint ok; - glGetProgramiv(prog, GL_LINK_STATUS, &ok); - if (!ok) { - /* Some drivers return a size of 1 for an empty log. This is the size - * of a log that contains only a terminating NUL character. - */ - GLint size; - GLchar *info = NULL; - glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); - if (size > 1) { - info = malloc(size); - glGetProgramInfoLog(prog, size, NULL, info); - } + GLuint prog = glCreateProgram(); - av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", - (info != NULL) ? info : ""); - return 0; - } - } + if (prog == 0) { + av_log(s, AV_LOG_ERROR, "Failed to create program\n"); + return 0; + } + + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glLinkProgram(prog); + + { + GLint ok; + glGetProgramiv(prog, GL_LINK_STATUS, &ok); + if (!ok) { + /* Some drivers return a size of 1 for an empty log. This is the size + * of a log that contains only a terminating NUL character. + */ + GLint size; + GLchar *info = NULL; + glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); + if (size > 1) { + info = malloc(size); + glGetProgramInfoLog(prog, size, NULL, info); + } - return prog; + av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", + (info != NULL) ? 
info : ""); + return 0; + } + } + + return prog; } static int -gl_setup(struct AVFormatContext * const s) +gl_setup(struct AVFormatContext *const s) { - const char *vs = - "attribute vec4 pos;\n" - "varying vec2 texcoord;\n" - "\n" - "void main() {\n" - " gl_Position = pos;\n" - " texcoord.x = (pos.x + 1.0) / 2.0;\n" - " texcoord.y = (-pos.y + 1.0) / 2.0;\n" - "}\n"; - const char *fs = - "#extension GL_OES_EGL_image_external : enable\n" - "precision mediump float;\n" - "uniform samplerExternalOES s;\n" - "varying vec2 texcoord;\n" - "void main() {\n" - " gl_FragColor = texture2D(s, texcoord);\n" - "}\n"; - - GLuint vs_s; - GLuint fs_s; - GLuint prog; - - if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || - !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || - !(prog = link_program(s, vs_s, fs_s))) - return -1; - - glUseProgram(prog); - - { - static const float verts[] = { - -1, -1, - 1, -1, - 1, 1, - -1, 1, - }; - glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); - } - - glEnableVertexAttribArray(0); - return 0; + const char *vs = + "attribute vec4 pos;\n" + "varying vec2 texcoord;\n" + "\n" + "void main() {\n" + " gl_Position = pos;\n" + " texcoord.x = (pos.x + 1.0) / 2.0;\n" + " texcoord.y = (-pos.y + 1.0) / 2.0;\n" + "}\n"; + const char *fs = + "#extension GL_OES_EGL_image_external : enable\n" + "precision mediump float;\n" + "uniform samplerExternalOES s;\n" + "varying vec2 texcoord;\n" + "void main() {\n" + " gl_FragColor = texture2D(s, texcoord);\n" + "}\n"; + + GLuint vs_s; + GLuint fs_s; + GLuint prog; + + if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || + !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || + !(prog = link_program(s, vs_s, fs_s))) + return -1; + + glUseProgram(prog); + + { + static const float verts[] = { + -1, -1, + 1, -1, + 1, 1, + -1, 1, + }; + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); + } + + glEnableVertexAttribArray(0); + return 0; } static int egl_vout_write_trailer(AVFormatContext *s) @@ -400,12 +398,12 @@ static int egl_vout_write_trailer(AVFormatContext *s) static int egl_vout_write_header(AVFormatContext *s) { - const AVCodecParameters * const par = s->streams[0]->codecpar; + const AVCodecParameters *const par = s->streams[0]->codecpar; #if TRACE_ALL av_log(s, AV_LOG_INFO, "%s\n", __func__); #endif - if ( s->nb_streams > 1 + if (s->nb_streams > 1 || par->codec_type != AVMEDIA_TYPE_VIDEO || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); @@ -416,10 +414,10 @@ static int egl_vout_write_header(AVFormatContext *s) } -static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) +static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame) { - const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; - egl_aux_t * da = NULL; + const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0]; + egl_aux_t *da = NULL; unsigned int i; #if TRACE_ALL @@ -440,26 +438,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A if (da->texture == 0) { EGLint attribs[50]; - EGLint * a = attribs; + EGLint *a = attribs; int i, j; static const EGLint anames[] = { - EGL_DMA_BUF_PLANE0_FD_EXT, - EGL_DMA_BUF_PLANE0_OFFSET_EXT, - EGL_DMA_BUF_PLANE0_PITCH_EXT, - EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, - EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, - EGL_DMA_BUF_PLANE1_FD_EXT, - EGL_DMA_BUF_PLANE1_OFFSET_EXT, - EGL_DMA_BUF_PLANE1_PITCH_EXT, - 
EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, - EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, - EGL_DMA_BUF_PLANE2_FD_EXT, - EGL_DMA_BUF_PLANE2_OFFSET_EXT, - EGL_DMA_BUF_PLANE2_PITCH_EXT, - EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, - EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE0_FD_EXT, + EGL_DMA_BUF_PLANE0_OFFSET_EXT, + EGL_DMA_BUF_PLANE0_PITCH_EXT, + EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE1_FD_EXT, + EGL_DMA_BUF_PLANE1_OFFSET_EXT, + EGL_DMA_BUF_PLANE1_PITCH_EXT, + EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE2_FD_EXT, + EGL_DMA_BUF_PLANE2_OFFSET_EXT, + EGL_DMA_BUF_PLANE2_PITCH_EXT, + EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, }; - const EGLint * b = anames; + const EGLint *b = anames; *a++ = EGL_WIDTH; *a++ = av_frame_cropped_width(frame); @@ -470,8 +468,8 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A for (i = 0; i < desc->nb_layers; ++i) { for (j = 0; j < desc->layers[i].nb_planes; ++j) { - const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; - const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; + const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j; + const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index; *a++ = *b++; *a++ = obj->fd; *a++ = *b++; @@ -479,13 +477,13 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A *a++ = *b++; *a++ = p->pitch; if (obj->format_modifier == 0) { - b += 2; + b += 2; } else { - *a++ = *b++; - *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); - *a++ = *b++; - *a++ = (EGLint)(obj->format_modifier >> 32); + *a++ = *b++; + *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); + *a++ = *b++; + *a++ = (EGLint)(obj->format_modifier >> 32); } } } @@ -494,26 +492,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A #if TRACE_ALL for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { - av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); + av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); } #endif { - const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, - EGL_NO_CONTEXT, - EGL_LINUX_DMA_BUF_EXT, - NULL, attribs); - if (!image) { - av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); - return -1; - } - - glGenTextures(1, &da->texture); - glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); - glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); - - eglDestroyImageKHR(de->setup.egl_dpy, image); + const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, + EGL_NO_CONTEXT, + EGL_LINUX_DMA_BUF_EXT, + NULL, attribs); + if (!image) { + av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); + return -1; + } + + glGenTextures(1, &da->texture); + glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); + glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); + + eglDestroyImageKHR(de->setup.egl_dpy, image); } da->fd = desc->objects[0].fd; @@ -540,7 +538,7 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A (long long)modifiers[1], (long long)modifiers[2], (long long)modifiers[3] - 
); + ); #endif } @@ -558,55 +556,55 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A return 0; } -static void * display_thread(void * v) +static void* display_thread(void *v) { - AVFormatContext * const s = v; - egl_display_env_t * const de = s->priv_data; + AVFormatContext *const s = v; + egl_display_env_t *const de = s->priv_data; #if TRACE_ALL av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); #endif { - EGLint egl_major, egl_minor; - - de->setup.dpy = XOpenDisplay(NULL); - if (!de->setup.dpy) { - av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); - goto fail; - } - - de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); - if (!de->setup.egl_dpy) { - av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); - goto fail; - } - - if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { - av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); - goto fail; - } - - av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); - - if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { - av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); - goto fail; - } + EGLint egl_major, egl_minor; + + de->setup.dpy = XOpenDisplay(NULL); + if (!de->setup.dpy) { + av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); + goto fail; + } + + de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); + if (!de->setup.egl_dpy) { + av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); + goto fail; + } + + if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { + av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); + goto fail; + } + + av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); + + if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { + av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); + goto fail; + } } if (!de->window_width || !de->window_height) { - de->window_width = 1280; - de->window_height = 720; + de->window_width = 1280; + de->window_height = 720; } if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", &de->setup.win, &de->setup.ctx, &de->setup.surf)) { - av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); - goto fail; + av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); + goto fail; } if (gl_setup(s)) { - av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); - goto fail; + av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); + goto fail; } #if TRACE_ALL @@ -615,7 +613,7 @@ static void * display_thread(void * v) sem_post(&de->display_start_sem); for (;;) { - AVFrame * frame; + AVFrame *frame; while (sem_wait(&de->q_sem) != 0) { av_assert0(errno == EINTR); @@ -653,9 +651,9 @@ fail: static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) { - const AVFrame * const src_frame = (AVFrame *)pkt->data; - AVFrame * frame; - egl_display_env_t * const de = s->priv_data; + const AVFrame *const src_frame = (AVFrame *)pkt->data; + AVFrame *frame; + egl_display_env_t *const de = s->priv_data; #if TRACE_ALL av_log(s, AV_LOG_INFO, "%s\n", __func__); @@ -668,8 +666,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) else if (src_frame->format == AV_PIX_FMT_VAAPI) { frame = av_frame_alloc(); frame->format = AV_PIX_FMT_DRM_PRIME; - if (av_hwframe_map(frame, src_frame, 0) != 0) - { + if (av_hwframe_map(frame, src_frame, 0) != 0) { av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); av_frame_free(&frame); return AVERROR(EINVAL); @@ -682,12 +679,12 @@ static int 
egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) // Really hacky sync while (de->show_all && de->q_next) { - usleep(3000); + usleep(3000); } pthread_mutex_lock(&de->q_lock); { - AVFrame * const t = de->q_next; + AVFrame *const t = de->q_next; de->q_next = frame; frame = t; } @@ -702,7 +699,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) } static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, - unsigned flags) + unsigned flags) { av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); return AVERROR_PATCHWELCOME; @@ -713,7 +710,7 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si #if TRACE_ALL av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); #endif - switch(type) { + switch (type) { case AV_APP_TO_DEV_WINDOW_REPAINT: return 0; default: @@ -723,14 +720,14 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si } // deinit is called if init fails so no need to clean up explicity here -static int egl_vout_init(struct AVFormatContext * s) +static int egl_vout_init(struct AVFormatContext *s) { - egl_display_env_t * const de = s->priv_data; + egl_display_env_t *const de = s->priv_data; unsigned int i; av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); - de->setup = (struct egl_setup){0}; + de->setup = (struct egl_setup) { 0 }; for (i = 0; i != 32; ++i) { de->aux[i].fd = -1; @@ -744,8 +741,8 @@ static int egl_vout_init(struct AVFormatContext * s) sem_wait(&de->display_start_sem); if (de->q_terminate) { - av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); - return -1; + av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); + return -1; } av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); @@ -753,9 +750,9 @@ static int egl_vout_init(struct AVFormatContext * s) return 0; } -static void egl_vout_deinit(struct AVFormatContext * s) +static void egl_vout_deinit(struct AVFormatContext *s) { - egl_display_env_t * const de = s->priv_data; + egl_display_env_t *const de = s->priv_data; av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); @@ -773,11 +770,11 @@ static void egl_vout_deinit(struct AVFormatContext * s) #define OFFSET(x) offsetof(egl_display_env_t, x) static const AVOption options[] = { - { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, - { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, - { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, - { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, - { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, + { "fullscreen", "set fullscreen display", 
OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, { NULL } }; -- 2.43.0 From 473aeeba8aaef8a9efa2d308a6ce8d5f207554f1 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 16:49:43 +0000 Subject: [PATCH 094/157] v4l2m2m: reporganise get_raw_format for loop logic --- libavcodec/v4l2_context.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 7031f3d340..79a31cf930 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -828,28 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) return 0; } - for (;;) { + for (;; ++fdesc.index) { ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc); if (ret) return AVERROR(EINVAL); if (priv->pix_fmt != AV_PIX_FMT_NONE) { - if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { - fdesc.index++; + if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) continue; - } } pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); - if (ret){ - fdesc.index++; - continue; + if (ret == 0) { + *p = pixfmt; + return 0; } - - *p = pixfmt; - - return 0; } return AVERROR(EINVAL); -- 2.43.0 From 2cb8705ad59171a8aeffa18611a516435f4e3516 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 17:49:12 +0000 Subject: [PATCH 095/157] drm_vout: Set zpos on the plane we pick to ensure it is at the front --- libavdevice/drm_vout.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c index cfb33ce7c3..9bd9e04421 100644 --- a/libavdevice/drm_vout.c +++ b/libavdevice/drm_vout.c @@ -115,9 +115,11 @@ static int find_plane(struct AVFormatContext * const avctx, { drmModePlaneResPtr planes; drmModePlanePtr plane; + drmModeObjectPropertiesPtr props = NULL; + drmModePropertyPtr prop = NULL; unsigned int i; unsigned int j; - int ret = 0; + int ret = -1; planes = drmModeGetPlaneResources(drmfd); if (!planes) @@ -154,11 +156,37 @@ static int find_plane(struct AVFormatContext * const avctx, break; } - if (i == planes->count_planes) - ret = -1; + if (i == planes->count_planes) { + ret = -1; + goto fail; + } - drmModeFreePlaneResources(planes); - return ret; + props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE); + if (!props) + goto fail; + for (i = 0; i != props->count_props; ++i) { + if (prop) + drmModeFreeProperty(prop); + prop = drmModeGetProperty(drmfd, props->props[i]); + if (!prop) + goto fail; + if (strcmp("zpos", prop->name) == 0) { + if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0) + av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]); + else + av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n"); + break; + } + } + + ret = 0; +fail: + if (props) + drmModeFreeObjectProperties(props); + if (prop) + drmModeFreeProperty(prop); + drmModeFreePlaneResources(planes); + return ret; } static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) -- 2.43.0 From fe2bdcc5fe821a58165b546ae93a2512e09bfb83 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 17:51:46 +0000 Subject: [PATCH 096/157] drm_vout: Only set modifier flag and pass modifiers if there are some --- libavdevice/drm_vout.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git 
a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c index 9bd9e04421..a56adea866 100644 --- a/libavdevice/drm_vout.c +++ b/libavdevice/drm_vout.c @@ -34,6 +34,7 @@ #include #include +#include #define TRACE_ALL 0 @@ -249,6 +250,7 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A uint32_t offsets[4] = {0}; uint64_t modifiers[4] = {0}; uint32_t bo_handles[4] = {0}; + int has_mods = 0; int i, j, n; da->frame = frame; @@ -258,6 +260,9 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); return -1; } + if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR && + desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID) + has_mods = 1; } n = 0; @@ -299,11 +304,13 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A #endif if (drmModeAddFB2WithModifiers(de->drm_fd, - av_frame_cropped_width(frame), - av_frame_cropped_height(frame), - desc->layers[0].format, bo_handles, - pitches, offsets, modifiers, - &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { + av_frame_cropped_width(frame), + av_frame_cropped_height(frame), + desc->layers[0].format, bo_handles, + pitches, offsets, + has_mods ? modifiers : NULL, + &da->fb_handle, + has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) { av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); return -1; } -- 2.43.0 From 3d49f163d74c363441ba817db4750e02fbe2700a Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 17:52:58 +0000 Subject: [PATCH 097/157] drm_vout: Fix typo in error message --- libavdevice/drm_vout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c index a56adea866..351abf1d60 100644 --- a/libavdevice/drm_vout.c +++ b/libavdevice/drm_vout.c @@ -596,7 +596,7 @@ static int drm_vout_init(struct AVFormatContext * s) sem_init(&de->q_sem_out, 0, 0); if (pthread_create(&de->q_thread, NULL, display_thread, s)) { rv = AVERROR(errno); - av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv)); + av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv)); goto fail_close; } -- 2.43.0 From 3d51577cad3cd66fd9ed2dd2623a1cc80d279b9a Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 18:00:41 +0000 Subject: [PATCH 098/157] drm_vout: Add option to name the drm_module to use --- libavdevice/drm_vout.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c index 351abf1d60..491e1dc608 100644 --- a/libavdevice/drm_vout.c +++ b/libavdevice/drm_vout.c @@ -70,7 +70,9 @@ typedef struct drm_display_env_s uint32_t con_id; struct drm_setup setup; enum AVPixelFormat avfmt; + int show_all; + const char * drm_module; unsigned int ano; drm_aux_t aux[AUX_SIZE]; @@ -569,7 +571,6 @@ static int drm_vout_init(struct AVFormatContext * s) { drm_display_env_t * const de = s->priv_data; int rv; - const char * drm_module = DRM_MODULE; av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); @@ -578,10 +579,10 @@ static int drm_vout_init(struct AVFormatContext * s) de->setup = (struct drm_setup){0}; de->q_terminate = 0; - if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0) + if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0) { rv = AVERROR(errno); - av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv)); + av_log(s, AV_LOG_ERROR, "Failed to 
drmOpen %s: %s\n", de->drm_module, av_err2str(rv)); return rv; } @@ -641,6 +642,7 @@ static void drm_vout_deinit(struct AVFormatContext * s) #define OFFSET(x) offsetof(drm_display_env_t, x) static const AVOption options[] = { { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, { NULL } }; -- 2.43.0 From 744b104ca181d6aa6c9452bf4cc5760a6e606c5e Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 13:01:00 +0000 Subject: [PATCH 099/157] dmabufs: Rework to allow for non-CMA backends --- libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 45 deletions(-) diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c index c4bbed18c6..1c3a5e861f 100644 --- a/libavcodec/v4l2_req_dmabufs.c +++ b/libavcodec/v4l2_req_dmabufs.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -19,9 +20,21 @@ #define TRACE_ALLOC 0 +struct dmabufs_ctl; +struct dmabuf_h; + +struct dmabuf_fns { + int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size); + void (*buf_free)(struct dmabuf_h * dh); + int (*ctl_new)(struct dmabufs_ctl * dbsc); + void (*ctl_free)(struct dmabufs_ctl * dbsc); +}; + struct dmabufs_ctl { int fd; size_t page_size; + void * v; + const struct dmabuf_fns * fns; }; struct dmabuf_h { @@ -29,6 +42,8 @@ struct dmabuf_h { size_t size; size_t len; void * mapptr; + void * v; + const struct dmabuf_fns * fns; }; #if TRACE_ALLOC @@ -88,15 +103,8 @@ struct dmabuf_h * dmabuf_import(int fd, size_t size) struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) { struct dmabuf_h * dh; - struct dma_heap_allocation_data data = { - .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), - .fd = 0, - .fd_flags = O_RDWR, - .heap_flags = 0 - }; - if (old != NULL) { - if (old->size == data.len) { + if (old->size >= size) { return old; } dmabuf_free(old); @@ -106,24 +114,16 @@ struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * ol (dh = malloc(sizeof(*dh))) == NULL) return NULL; - while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { - int err = errno; - request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", - (uint64_t)data.len, - dbsc->fd, - err, - strerror(err)); - if (err == EINTR) - continue; - goto fail; - } - *dh = (struct dmabuf_h){ - .fd = data.fd, - .size = (size_t)data.len, - .mapptr = MAP_FAILED + .fd = -1, + .mapptr = MAP_FAILED, + .fns = dbsc->fns }; + if (dh->fns->buf_alloc(dbsc, dh, size) != 0) + goto fail; + + #if TRACE_ALLOC ++total_bufs; total_size += dh->size; @@ -220,8 +220,6 @@ void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) dh->len = len; } - - void dmabuf_free(struct dmabuf_h * dh) { if (!dh) @@ -233,20 +231,63 @@ void dmabuf_free(struct dmabuf_h * dh) request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); #endif - if (dh->mapptr != MAP_FAILED) + dh->fns->buf_free(dh); + + if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL) munmap(dh->mapptr, dh->size); - while (close(dh->fd) == -1 && errno == EINTR) - /* loop */; + if (dh->fd != -1) + while (close(dh->fd) == -1 && errno == EINTR) + /* loop */; free(dh); } -struct dmabufs_ctl * dmabufs_ctl_new(void) +static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * 
const fns) { - struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); + struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc)); if (!dbsc) return NULL; + dbsc->fd = -1; + dbsc->fns = fns; + dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + if (fns->ctl_new(dbsc) != 0) + goto fail; + + return dbsc; + +fail: + free(dbsc); + return NULL; +} + +static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) +{ + request_debug(NULL, "Free dmabuf ctl\n"); + + dbsc->fns->ctl_free(dbsc); + + free(dbsc); +} + +void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) +{ + struct dmabufs_ctl * const dbsc = *pDbsc; + + if (!dbsc) + return; + *pDbsc = NULL; + + dmabufs_ctl_free(dbsc); +} + +//----------------------------------------------------------------------------- +// +// Alloc dmabuf via CMA + +static int ctl_cma_new(struct dmabufs_ctl * dbsc) +{ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && errno == EINTR) /* Loop */; @@ -258,31 +299,61 @@ struct dmabufs_ctl * dmabufs_ctl_new(void) if (dbsc->fd == -1) { request_log("Unable to open either %s or %s\n", DMABUF_NAME1, DMABUF_NAME2); - goto fail; + return -1; } } + return 0; +} - dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); - - return dbsc; +static void ctl_cma_free(struct dmabufs_ctl * dbsc) +{ + if (dbsc->fd != -1) + while (close(dbsc->fd) == -1 && errno == EINTR) + /* loop */; -fail: - free(dbsc); - return NULL; } -void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) +static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size) { - struct dmabufs_ctl * const dbsc = *pDbsc; + struct dma_heap_allocation_data data = { + .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), + .fd = 0, + .fd_flags = O_RDWR, + .heap_flags = 0 + }; - if (!dbsc) - return; - *pDbsc = NULL; + while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { + int err = errno; + request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", + (uint64_t)data.len, + dbsc->fd, + err, + strerror(err)); + if (err == EINTR) + continue; + return -err; + } - while (close(dbsc->fd) == -1 && errno == EINTR) - /* loop */; + dh->fd = data.fd; + dh->size = (size_t)data.len; + return 0; +} - free(dbsc); +static void buf_cma_free(struct dmabuf_h * dh) +{ + // Nothing needed } +static const struct dmabuf_fns dmabuf_cma_fns = { + .buf_alloc = buf_cma_alloc, + .buf_free = buf_cma_free, + .ctl_new = ctl_cma_new, + .ctl_free = ctl_cma_free, +}; + +struct dmabufs_ctl * dmabufs_ctl_new(void) +{ + request_debug(NULL, "Dmabufs using CMA\n");; + return dmabufs_ctl_new2(&dmabuf_cma_fns); +} -- 2.43.0 From 59caa0f9e1399f397bb6b4bdc1e1bc889432d4dc Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 13:07:58 +0000 Subject: [PATCH 100/157] dmabufs: Use unref rather than delete on dmabufs_ctl --- libavcodec/v4l2_req_dmabufs.c | 12 +++++++++++- libavcodec/v4l2_req_dmabufs.h | 3 ++- libavcodec/v4l2_request_hevc.c | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c index 1c3a5e861f..acc0366e76 100644 --- a/libavcodec/v4l2_req_dmabufs.c +++ b/libavcodec/v4l2_req_dmabufs.c @@ -31,6 +31,7 @@ struct dmabuf_fns { }; struct dmabufs_ctl { + atomic_int ref_count; int fd; size_t page_size; void * v; @@ -271,7 +272,7 @@ static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) free(dbsc); } -void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) +void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc) { struct dmabufs_ctl * const dbsc = *pDbsc; @@ -279,9 
+280,18 @@ void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) return; *pDbsc = NULL; + if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0) + return; + dmabufs_ctl_free(dbsc); } +struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc) +{ + atomic_fetch_add(&dbsc->ref_count, 1); + return dbsc; +} + //----------------------------------------------------------------------------- // // Alloc dmabuf via CMA diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h index c1d3d8c8d7..381ba2708d 100644 --- a/libavcodec/v4l2_req_dmabufs.h +++ b/libavcodec/v4l2_req_dmabufs.h @@ -7,7 +7,8 @@ struct dmabufs_ctl; struct dmabuf_h; struct dmabufs_ctl * dmabufs_ctl_new(void); -void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); +void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc); +struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc); // Need not preserve old contents // On NULL return old buffer is freed diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c index 767ecb036a..db7ed13b6d 100644 --- a/libavcodec/v4l2_request_hevc.c +++ b/libavcodec/v4l2_request_hevc.c @@ -105,7 +105,7 @@ static int v4l2_request_hevc_uninit(AVCodecContext *avctx) mediabufs_ctl_unref(&ctx->mbufs); media_pool_delete(&ctx->mpool); pollqueue_unref(&ctx->pq); - dmabufs_ctl_delete(&ctx->dbufs); + dmabufs_ctl_unref(&ctx->dbufs); devscan_delete(&ctx->devscan); decode_q_uninit(&ctx->decode_q); @@ -324,7 +324,7 @@ fail3: fail2: pollqueue_unref(&ctx->pq); fail1: - dmabufs_ctl_delete(&ctx->dbufs); + dmabufs_ctl_unref(&ctx->dbufs); fail0: devscan_delete(&ctx->devscan); return ret; -- 2.43.0 From ef86ce20f2941dc6a0539bed7c6fb6abec3ddc0d Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 14:21:40 +0000 Subject: [PATCH 101/157] egl_vout: Remove redundant & completely broken debug --- libavdevice/egl_vout.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c index a52cabb082..afc7afd13e 100644 --- a/libavdevice/egl_vout.c +++ b/libavdevice/egl_vout.c @@ -515,31 +515,6 @@ static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVF } da->fd = desc->objects[0].fd; - -#if 0 - av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," - " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", - av_frame_cropped_width(frame), - av_frame_cropped_height(frame), - desc->layers[0].format, - bo_plane_handles[0], - bo_plane_handles[1], - bo_plane_handles[2], - bo_plane_handles[3], - pitches[0], - pitches[1], - pitches[2], - pitches[3], - offsets[0], - offsets[1], - offsets[2], - offsets[3], - (long long)modifiers[0], - (long long)modifiers[1], - (long long)modifiers[2], - (long long)modifiers[3] - ); -#endif } glClearColor(0.5, 0.5, 0.5, 0.5); -- 2.43.0 From 8845615c8b262b7bd074f649c8bcadffcdaaacb4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 16:12:12 +0000 Subject: [PATCH 102/157] v4l2m2m: Use offset from querybuf rather than always 0 --- libavcodec/v4l2_buffers.c | 4 +++- libavcodec/v4l2_buffers.h | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 9ef2f40e39..5ca58ea593 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -379,7 +379,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) for (int i = 0; i < avbuf->num_planes; i++) { layer->planes[i].object_index = i; - layer->planes[i].offset = 0; + layer->planes[i].offset = 
avbuf->plane_info[i].offset; layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; } @@ -934,6 +934,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; + avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset; if (want_mmap) avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, @@ -941,6 +942,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); } else { avbuf->plane_info[i].length = avbuf->buf.length; + avbuf->plane_info[i].offset = 0; if (want_mmap) avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 1ac32c5989..d91d5d1dd0 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -66,7 +66,8 @@ typedef struct V4L2Buffer { /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { - int bytesperline; + size_t bytesperline; + size_t offset; void * mm_addr; size_t length; } plane_info[VIDEO_MAX_PLANES]; -- 2.43.0 From eb9059b633a9a12ad1fa7a3353dbacd0140c00db Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 17:57:27 +0000 Subject: [PATCH 103/157] v4l2m2m: Fix crash if init errors out before setting avctx --- libavcodec/v4l2_m2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index 1e30d15fd8..ac6bae0dc3 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -278,7 +278,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); - if (av_codec_is_decoder(s->avctx->codec)) + if (s->avctx && av_codec_is_decoder(s->avctx->codec)) av_packet_unref(&s->buf_pkt); if (s->fd >= 0) { -- 2.43.0 From 01041ea5e13a516108959aaef15921670c387967 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 18:10:30 +0000 Subject: [PATCH 104/157] v4l2_buffers: Add and use ctx_to_m2mctx + error debug --- libavcodec/v4l2_buffers.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 5ca58ea593..e28ef2d1e8 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -41,11 +41,16 @@ #define USEC_PER_SEC 1000000 static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; +static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) +{ + return V4L2_TYPE_IS_OUTPUT(ctx->type) ? + container_of(ctx, V4L2m2mContext, output) : + container_of(ctx, V4L2m2mContext, capture); +} + static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) { - return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? 
- container_of(buf->context, V4L2m2mContext, output) : - container_of(buf->context, V4L2m2mContext, capture); + return ctx_to_m2mctx(buf->context); } static inline AVCodecContext *logger(const V4L2Buffer * const buf) @@ -883,6 +888,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct int ret, i; V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); AVBufferRef * bufref; + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); *pbufref = NULL; if (avbuf == NULL) @@ -910,7 +916,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct avbuf->buf.m.planes = avbuf->planes; } - ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); + ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf); if (ret < 0) goto fail; @@ -969,10 +975,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct } if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { - if (buf_to_m2mctx(avbuf)->output_drm) { + if (s->output_drm) { ret = v4l2_buffer_export_drm(avbuf); - if (ret) - goto fail; + if (ret) { + av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n"); + goto fail; + } } } -- 2.43.0 From ca4fc662c6fb9b929c42e8247aef9ebe256e1166 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 18:53:22 +0000 Subject: [PATCH 105/157] v4l2m2m: Add ability to use cma alloced dmabufs as well as v4l2 mmap --- libavcodec/Makefile | 2 +- libavcodec/v4l2_buffers.c | 65 ++++++++++++++++++++++++++------------- libavcodec/v4l2_buffers.h | 2 ++ libavcodec/v4l2_m2m.c | 6 +++- libavcodec/v4l2_m2m.h | 4 +++ libavcodec/v4l2_m2m_dec.c | 16 ++++++++++ 6 files changed, 71 insertions(+), 24 deletions(-) diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 11f183c9b9..8b1d669834 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -170,7 +170,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o OBJS-$(CONFIG_VP56DSP) += vp56dsp.o OBJS-$(CONFIG_VP8DSP) += vp8dsp.o OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ - weak_link.o + weak_link.o v4l2_req_dmabufs.o OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ v4l2_req_devscan.o weak_link.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index e28ef2d1e8..8d80d19788 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -36,6 +36,7 @@ #include "v4l2_context.h" #include "v4l2_buffers.h" #include "v4l2_m2m.h" +#include "v4l2_req_dmabufs.h" #include "weak_link.h" #define USEC_PER_SEC 1000000 @@ -477,33 +478,46 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data) av_buffer_unref(&bufref); } +static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i) +{ + return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? 
b->m.planes[i].length : b->length; +} + static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) { - struct v4l2_exportbuffer expbuf; int i, ret; + const V4L2m2mContext * const s = buf_to_m2mctx(avbuf); for (i = 0; i < avbuf->num_planes; i++) { - memset(&expbuf, 0, sizeof(expbuf)); - - expbuf.index = avbuf->buf.index; - expbuf.type = avbuf->buf.type; - expbuf.plane = i; + int dma_fd = -1; + const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i); + + if (s->db_ctl != NULL) { + if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL) + return AVERROR(ENOMEM); + dma_fd = dmabuf_fd(avbuf->dmabuf[i]); + if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) + avbuf->buf.m.planes[i].m.fd = dma_fd; + else + avbuf->buf.m.fd = dma_fd; + } + else { + struct v4l2_exportbuffer expbuf; + memset(&expbuf, 0, sizeof(expbuf)); - ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); - if (ret < 0) - return AVERROR(errno); + expbuf.index = avbuf->buf.index; + expbuf.type = avbuf->buf.type; + expbuf.plane = i; - if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { - /* drm frame */ - avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; - avbuf->drm_frame.objects[i].fd = expbuf.fd; - avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; - } else { - /* drm frame */ - avbuf->drm_frame.objects[0].size = avbuf->buf.length; - avbuf->drm_frame.objects[0].fd = expbuf.fd; - avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf); + if (ret < 0) + return AVERROR(errno); + dma_fd = expbuf.fd; } + + avbuf->drm_frame.objects[i].size = blen; + avbuf->drm_frame.objects[i].fd = dma_fd; + avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; } return 0; @@ -870,9 +884,16 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) munmap(p->mm_addr, p->length); } - for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { - if (avbuf->drm_frame.objects[i].fd != -1) - close(avbuf->drm_frame.objects[i].fd); + if (avbuf->dmabuf[0] == NULL) { + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + if (avbuf->drm_frame.objects[i].fd != -1) + close(avbuf->drm_frame.objects[i].fd); + } + } + else { + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) { + dmabuf_free(avbuf->dmabuf[i]); + } } av_buffer_unref(&avbuf->ref_buf); diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index d91d5d1dd0..444ad94b14 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -46,6 +46,7 @@ enum V4L2Buffer_status { */ struct V4L2Context; struct ff_weak_link_client; +struct dmabuf_h; typedef struct V4L2Buffer { /* each buffer needs to have a reference to its context @@ -80,6 +81,7 @@ typedef struct V4L2Buffer { enum V4L2Buffer_status status; + struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here } V4L2Buffer; /** diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index ac6bae0dc3..f802687b1b 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -34,6 +34,7 @@ #include "v4l2_context.h" #include "v4l2_fmt.h" #include "v4l2_m2m.h" +#include "v4l2_req_dmabufs.h" static void xlat_init(xlat_track_t * const x) @@ -75,7 +76,7 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) s->capture.done = s->output.done = 0; s->capture.name = "capture"; - s->capture.buf_mem = V4L2_MEMORY_MMAP; + s->capture.buf_mem = s->db_ctl != NULL ? 
V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; s->output.name = "output"; s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; atomic_init(&s->refcount, 0); @@ -94,12 +95,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) if (v4l2_mplane_video(&cap)) { s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + s->output.format.type = s->output.type; return 0; } if (v4l2_splane_video(&cap)) { s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; + s->output.format.type = s->output.type; return 0; } @@ -293,6 +296,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) ff_v4l2_context_release(&s->output); + dmabufs_ctl_unref(&s->db_ctl); close(s->fd); s->fd = -1; diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 26a7161042..0f41f94694 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -71,6 +71,8 @@ typedef struct xlat_track_s { V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; } xlat_track_t; +struct dmabufs_ctl; + typedef struct V4L2m2mContext { char devname[PATH_MAX]; int fd; @@ -124,6 +126,7 @@ typedef struct V4L2m2mContext { /* Quirks */ unsigned int quirks; + struct dmabufs_ctl * db_ctl; } V4L2m2mContext; typedef struct V4L2m2mPriv { @@ -134,6 +137,7 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; + const char * dmabuf_alloc; enum AVPixelFormat pix_fmt; } V4L2m2mPriv; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 47b2735f82..4d17057298 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -41,6 +41,7 @@ #include "v4l2_context.h" #include "v4l2_m2m.h" #include "v4l2_fmt.h" +#include "v4l2_req_dmabufs.h" // Pick 64 for max last count - that is >1sec at 60fps #define STATS_LAST_COUNT_MAX 64 @@ -896,6 +897,20 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) s->output_drm = 0; } + s->db_ctl = NULL; + if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { + if (strcmp(priv->dmabuf_alloc, "cma") == 0) + s->db_ctl = dmabufs_ctl_new(); + else { + av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc); + return AVERROR(EINVAL); + } + if (!s->db_ctl) { + av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc); + return AVERROR(ENOMEM); + } + } + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); if (!s->device_ref) { ret = AVERROR(ENOMEM); @@ -1000,6 +1015,7 @@ static const AVOption options[] = { { "num_capture_buffers", "Number of buffers in the capture context", OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, + { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, { NULL}, }; -- 2.43.0 From 7ef160063de94b66a9c0bd5ad4cdd1a9ff66adec Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 5 Jan 2023 14:39:30 +0000 Subject: [PATCH 106/157] pixfmt: Add a #define to indicate presence of SAND formats --- libavutil/pixfmt.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h index 22f70007c3..5cc780e7d5 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -378,6 +378,8 @@ enum AVPixelFormat { AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian 
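// A minimal sketch (not part of the patch) of how client code can use the
// AVUTIL_HAVE_PIX_FMT_SAND define added just below: the SAND enum values are
// always compiled in (no ifdef), so the define simply lets callers
// feature-test for them at build time. handle_sand_frame() is a hypothetical
// helper, not an FFmpeg API.
//
//     #include "libavutil/pixfmt.h"
//     #ifdef AVUTIL_HAVE_PIX_FMT_SAND
//         if (frame->format == AV_PIX_FMT_SAND128)
//             handle_sand_frame(frame);
//     #endif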
AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian // RPI - not on ifdef so can be got at by calling progs +// #define so code that uses this can know it is there +#define AVUTIL_HAVE_PIX_FMT_SAND 1 AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -- 2.43.0 From 3067ef324c8571730ca74588858aea7a685802e3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 11 Jan 2023 16:30:37 +0000 Subject: [PATCH 107/157] v4l2_m2m_dec: Fix initial pkt send if no extradata --- libavcodec/v4l2_m2m_dec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 4d17057298..9daf05adfe 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -240,7 +240,7 @@ copy_extradata(AVCodecContext * const avctx, else len = src_len < 0 ? AVERROR(EINVAL) : src_len; - // Zero length is OK but we swant to stop - -ve is error val + // Zero length is OK but we want to stop - -ve is error val if (len <= 0) return len; @@ -525,7 +525,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const if (s->extdata_sent) ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); - else if (s->extdata_data) + else ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); if (ret == AVERROR(EAGAIN)) { -- 2.43.0 From f8e1392e6b4e45a5fb74eec11cb304f6813e2f9c Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 16 Jan 2023 16:05:09 +0000 Subject: [PATCH 108/157] v4l2m2m_dec: Make capture timeout long once pending count > 31 For some applications (ffmpeg command line) the current heuristic of adding a short timeout and preferring DQ over Q once we think we have buffers Qed in V4L2 is insufficient to prevent arbitrary buffer growth. Unfortunately the current method of guessing the number of Qed buffers isn't reliable enough to allow for a long timeout with only a few buffers believed pending so only do so once the number of buffers believed pending exceeds plausible inaccuracies caused by buffer reordering. The limit could be optimised by codec or apparent latency but a simple number should reduce the unexpected consequences. --- libavcodec/v4l2_m2m.h | 3 ++- libavcodec/v4l2_m2m_dec.c | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index 0f41f94694..ded1478a49 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -66,7 +66,7 @@ typedef struct pts_stats_s typedef struct xlat_track_s { unsigned int track_no; - int64_t last_pts; + int64_t last_pts; // Last valid PTS decoded int64_t last_opaque; V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; } xlat_track_t; @@ -88,6 +88,7 @@ typedef struct V4L2m2mContext { /* null frame/packet received */ int draining; + int running; AVPacket buf_pkt; /* Reference to a frame. 
Only used during encoding */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 9daf05adfe..c8ab883d7e 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -582,7 +582,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) do { const int pending = xlat_pending(&s->xlat); - const int prefer_dq = (pending > 3); + const int prefer_dq = (pending > 4); const int last_src_rv = src_rv; av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); @@ -611,10 +611,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) // (b) enqueue returned a status indicating that decode should be attempted if (dst_rv != 0 && TRY_DQ(src_rv)) { // Pick a timeout depending on state + // The pending count isn't completely reliable so it is good enough + // hint that we want a frame but not good enough to require it in + // all cases; however if it has got > 31 that exceeds its margin of + // error so require a frame to prevent ridiculous levels of latency const int t = src_rv == NQ_Q_FULL ? -1 : src_rv == NQ_DRAINING ? 300 : - prefer_dq ? 5 : 0; + prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0; // Dequeue frame will unref any previous contents of frame // if it returns success so we don't need an explicit unref @@ -631,8 +635,13 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } } - if (dst_rv == 0) + if (dst_rv == 0) { set_best_effort_pts(avctx, &s->pts_stat, frame); + if (!s->running) { + s->running = 1; + av_log(avctx, AV_LOG_VERBOSE, "Decode running\n"); + } + } if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); @@ -998,7 +1007,8 @@ static void v4l2_decode_flush(AVCodecContext *avctx) // resend extradata s->extdata_sent = 0; - // clear EOS status vars + // clear status vars + s->running = 0; s->draining = 0; output->done = 0; capture->done = 0; -- 2.43.0 From 0c339afb8d74908f531eed477eb965820823bca9 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 6 Feb 2023 19:23:16 +0000 Subject: [PATCH 109/157] Initial buffersink alloc callback code (cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb) --- libavfilter/buffersink.c | 44 ++++++++++++++++++++++++++++++++++++++++ libavfilter/buffersink.h | 3 +++ 2 files changed, 47 insertions(+) diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c index 9ab83696ce..837579946d 100644 --- a/libavfilter/buffersink.c +++ b/libavfilter/buffersink.c @@ -62,6 +62,11 @@ typedef struct BufferSinkContext { int sample_rates_size; AVFrame *peeked_frame; + + union { + av_buffersink_alloc_video_frame * video; + } alloc_cb; + void * alloc_v; } BufferSinkContext; #define NB_ITEMS(list) (list ## _size / sizeof(*list)) @@ -154,6 +159,44 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx, return get_frame_internal(ctx, frame, 0, nb_samples); } +static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h) +{ + AVFilterContext * const ctx = link->dst; + BufferSinkContext * const bs = ctx->priv; + return bs->alloc_cb.video ? 
bs->alloc_cb.video(ctx, bs->alloc_v, w, h) : + ff_default_get_video_buffer(link, w, h); +} + +int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v) +{ + BufferSinkContext * const bs = ctx->priv; + bs->alloc_cb.video = cb; + bs->alloc_v = v; + return 0; +} + +#if FF_API_BUFFERSINK_ALLOC +AVBufferSinkParams *av_buffersink_params_alloc(void) +{ + static const int pixel_fmts[] = { AV_PIX_FMT_NONE }; + AVBufferSinkParams *params = av_malloc(sizeof(AVBufferSinkParams)); + if (!params) + return NULL; + + params->pixel_fmts = pixel_fmts; + return params; +} + +AVABufferSinkParams *av_abuffersink_params_alloc(void) +{ + AVABufferSinkParams *params = av_mallocz(sizeof(AVABufferSinkParams)); + + if (!params) + return NULL; + return params; +} +#endif + static av_cold int common_init(AVFilterContext *ctx) { BufferSinkContext *buf = ctx->priv; @@ -381,6 +424,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = { { .name = "default", .type = AVMEDIA_TYPE_VIDEO, + .get_buffer = {.video = alloc_video_buffer}, }, }; diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h index 64e08de53e..09737d322f 100644 --- a/libavfilter/buffersink.h +++ b/libavfilter/buffersink.h @@ -166,6 +166,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame); */ int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples); +typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h); +int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v); + /** * @} */ -- 2.43.0 From 5f1a6227f339022e477afd94d7f49fe1c5720e96 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 30 Jan 2023 17:23:12 +0000 Subject: [PATCH 110/157] v4l2_m2m_dec: Add a profile check Check the profile in avctx against what the v4l2 driver advertises. If the driver doesn't support the check then just accept anything. 
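A minimal standalone sketch of the idea (assuming an open V4L2 decoder fd, an H.264 stream and the FF_PROFILE_* value already mapped to its V4L2 menu index; fd and v4l2_profile are placeholders, this is an illustration rather than the patch code):

    struct v4l2_queryctrl qc = { .id = V4L2_CID_MPEG_VIDEO_H264_PROFILE };
    struct v4l2_querymenu qm;
    if (ioctl(fd, VIDIOC_QUERYCTRL, &qc) != 0)
        return 0;                      /* driver cannot say - accept anything */
    qm = (struct v4l2_querymenu){ .id = qc.id, .index = v4l2_profile };
    if (qm.index < qc.minimum || qm.index > qc.maximum ||
        ioctl(fd, VIDIOC_QUERYMENU, &qm) != 0)
        return AVERROR(ENOENT);        /* profile not advertised by the driver */
    return 0;

The full check_profile() in the diff below additionally maps the FF_PROFILE_* values onto V4L2 menu indices and picks the control id per codec.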
(cherry picked from commit 6dd83dead9ebce419fdea152db0c9f5e9a94e9ef) --- libavcodec/v4l2_m2m_dec.c | 125 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index c8ab883d7e..098adf4821 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -715,6 +715,127 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) } #endif +static uint32_t +avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile) +{ + switch (codec_id) { + case AV_CODEC_ID_H264: + switch (avprofile) { + case FF_PROFILE_H264_BASELINE: + return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE; + case FF_PROFILE_H264_CONSTRAINED_BASELINE: + return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE; + case FF_PROFILE_H264_MAIN: + return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN; + case FF_PROFILE_H264_EXTENDED: + return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED; + case FF_PROFILE_H264_HIGH: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH; + case FF_PROFILE_H264_HIGH_10: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10; + case FF_PROFILE_H264_HIGH_10_INTRA: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA; + case FF_PROFILE_H264_MULTIVIEW_HIGH: + case FF_PROFILE_H264_HIGH_422: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422; + case FF_PROFILE_H264_HIGH_422_INTRA: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA; + case FF_PROFILE_H264_STEREO_HIGH: + return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH; + case FF_PROFILE_H264_HIGH_444_PREDICTIVE: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE; + case FF_PROFILE_H264_HIGH_444_INTRA: + return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA; + case FF_PROFILE_H264_CAVLC_444: + return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA; + case FF_PROFILE_H264_HIGH_444: + default: + break; +// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE = 12, +// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH = 13, +// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14, +// V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16, +// V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17, + } + break; + case AV_CODEC_ID_MPEG2VIDEO: + case AV_CODEC_ID_MPEG4: + case AV_CODEC_ID_VC1: + case AV_CODEC_ID_VP8: + case AV_CODEC_ID_VP9: + case AV_CODEC_ID_AV1: + // Most profiles are a simple number that matches the V4L2 enum + return avprofile; + default: + break; + } + return ~(uint32_t)0; +} + +// This check mirrors Chrome's profile check by testing to see if the profile +// exists as a possible value for the V4L2 profile control +static int +check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s) +{ + struct v4l2_queryctrl query_ctrl; + struct v4l2_querymenu query_menu; + uint32_t profile_id; + + // An unset profile is almost certainly zero or -99 - do not reject + if (avctx->profile <= 0) { + av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n"); + return 0; + } + + memset(&query_ctrl, 0, sizeof(query_ctrl)); + switch (avctx->codec_id) { + case AV_CODEC_ID_MPEG2VIDEO: + profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE; + break; + case AV_CODEC_ID_MPEG4: + profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE; + break; + case AV_CODEC_ID_H264: + profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE; + break; + case AV_CODEC_ID_VP8: + profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE; + break; + case AV_CODEC_ID_VP9: + profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE; + break; +#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE + case AV_CODEC_ID_AV1: + profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE; + break; +#endif + default: + 
av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id); + return 0; + } + + query_ctrl = (struct v4l2_queryctrl){.id = profile_id}; + if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) { + av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id); + } + else { + av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id); + + query_menu = (struct v4l2_querymenu){ + .id = query_ctrl.id, + .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile), + }; + + if (query_menu.index > query_ctrl.maximum || + query_menu.index < query_ctrl.minimum || + ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) { + return AVERROR(ENOENT); + } + } + + return 0; +}; + static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) { @@ -955,6 +1076,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) if ((ret = check_size(avctx, s)) != 0) return ret; + if ((ret = check_profile(avctx, s)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile); + return ret; + } return 0; } -- 2.43.0 From f4b9eb7af69782fc5d8c0ea3f21180193a9755e8 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 1 Feb 2023 17:24:39 +0000 Subject: [PATCH 111/157] v4l2_m2m_dec: Add extradata parse for h264 & hevc If we have extradata we can extract profile & level and potentailly other useful info from it. Use the codec parser to get it if the decoder is configured. (cherry picked from commit 6d431e79adeb246c2ed8cebce9011d81175a3906) --- libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 098adf4821..e64bc707d3 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -21,6 +21,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config.h" + #include #include @@ -43,6 +45,13 @@ #include "v4l2_fmt.h" #include "v4l2_req_dmabufs.h" +#if CONFIG_H264_DECODER +#include "h264_parse.h" +#endif +#if CONFIG_HEVC_DECODER +#include "hevc_parse.h" +#endif + // Pick 64 for max last count - that is >1sec at 60fps #define STATS_LAST_COUNT_MAX 64 #define STATS_INTERVAL_MAX (1 << 30) @@ -956,6 +965,78 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx) return size + (1 << 16); } +static void +parse_extradata(AVCodecContext *avctx) +{ + if (!avctx->extradata || !avctx->extradata_size) + return; + + switch (avctx->codec_id) { +#if CONFIG_H264_DECODER + case AV_CODEC_ID_H264: + { + H264ParamSets ps = {{NULL}}; + int is_avc = 0; + int nal_length_size = 0; + int ret; + + ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size, + &ps, &is_avc, &nal_length_size, + avctx->err_recognition, avctx); + if (ret > 0) { + const SPS * sps = NULL; + unsigned int i; + for (i = 0; i != MAX_SPS_COUNT; ++i) { + if (ps.sps_list[i]) { + sps = (const SPS *)ps.sps_list[i]->data; + break; + } + } + if (sps) { + avctx->profile = ff_h264_get_profile(sps); + avctx->level = sps->level_idc; + } + } + ff_h264_ps_uninit(&ps); + break; + } +#endif +#if CONFIG_HEVC_DECODER + case AV_CODEC_ID_HEVC: + { + HEVCParamSets ps = {{NULL}}; + HEVCSEI sei = {{{{0}}}}; + int is_nalff = 0; + int nal_length_size = 0; + int ret; + + ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, + &ps, &sei, &is_nalff, &nal_length_size, + avctx->err_recognition, 0, avctx); + if (ret > 0) 
{ + const HEVCSPS * sps = NULL; + unsigned int i; + for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) { + if (ps.sps_list[i]) { + sps = (const HEVCSPS *)ps.sps_list[i]->data; + break; + } + } + if (sps) { + avctx->profile = sps->ptl.general_ptl.profile_idc; + avctx->level = sps->ptl.general_ptl.level_idc; + } + } + ff_hevc_ps_uninit(&ps); + ff_hevc_reset_sei(&sei); + break; + } +#endif + default: + break; + } +} + static av_cold int v4l2_decode_init(AVCodecContext *avctx) { V4L2Context *capture, *output; @@ -976,7 +1057,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) avctx->ticks_per_frame = 2; } - av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); + parse_extradata(avctx); + ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; -- 2.43.0 From 2e3d4e17572fce6b3f61c116debf75421339cd06 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 20 Mar 2023 18:15:08 +0000 Subject: [PATCH 112/157] vulkan: Add missing decode extension defines When building on bookworm the video decode extension names were missing. This adds them. I expect this patch will be obsolete shortly but it solves a current problem. --- libavutil/hwcontext_vulkan.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c index ffd4f5dec4..d59f9409dd 100644 --- a/libavutil/hwcontext_vulkan.c +++ b/libavutil/hwcontext_vulkan.c @@ -57,6 +57,14 @@ #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x) #endif +// Sometimes missing definitions +#ifndef VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME +#define VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME "VK_EXT_video_decode_h264" +#endif +#ifndef VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME +#define VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME "VK_EXT_video_decode_h265" +#endif + typedef struct VulkanQueueCtx { VkFence fence; VkQueue queue; -- 2.43.0 From 440ac5e5c78f73aed117c0e27c1853903bfea5b0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 21 Mar 2023 14:20:05 +0000 Subject: [PATCH 113/157] v4l2_m2m_dec: Fix config file for finding if decoder enabled Fixes parsing of extradata for profile testing. 5.x changed where that info is defined. 
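A short illustration of the fix (assuming an FFmpeg 5.x tree, where the per-component CONFIG_* values such as CONFIG_H264_DECODER come from config_components.h rather than config.h):

    #include "config_components.h"  /* CONFIG_H264_DECODER, CONFIG_HEVC_DECODER */
    #if CONFIG_H264_DECODER
    #include "h264_parse.h"         /* parser used to pull profile/level from extradata */
    #endif

With only the old config.h include the CONFIG_*_DECODER tests evaluate as unset, so the extradata parse added in the earlier patch never runs.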
--- libavcodec/v4l2_m2m_dec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index e64bc707d3..91136f03da 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "config.h" +#include "config_components.h" #include #include -- 2.43.0 From 120727de71722901fec8f7f92cf6e34e0b162143 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 21 Mar 2023 14:23:20 +0000 Subject: [PATCH 114/157] v4l2_m2m_dec: Display profile given if skipped in debug --- libavcodec/v4l2_m2m_dec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 91136f03da..d124c7b1fc 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -792,7 +792,7 @@ check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s) // An unset profile is almost certainly zero or -99 - do not reject if (avctx->profile <= 0) { - av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n"); + av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile); return 0; } -- 2.43.0 From 89cca0859d2caff72f1a86e19bdb9710b9925f41 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Apr 2023 10:47:58 +0000 Subject: [PATCH 115/157] swscale: Add explicit bgr24->yv12 conversion (cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434) --- libswscale/rgb2rgb.c | 5 +++++ libswscale/rgb2rgb.h | 7 +++++++ libswscale/rgb2rgb_template.c | 36 ++++++++++++++++++++++++++++++----- libswscale/swscale_unscaled.c | 22 +++++++++++++++++++++ 4 files changed, 65 insertions(+), 5 deletions(-) diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c index e98fdac8ea..84bb56e60e 100644 --- a/libswscale/rgb2rgb.c +++ b/libswscale/rgb2rgb.c @@ -83,6 +83,11 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv); +void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, + uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride); void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h index f3951d523e..0028ab345f 100644 --- a/libswscale/rgb2rgb.h +++ b/libswscale/rgb2rgb.h @@ -79,6 +79,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size); void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv); +void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); /** * Height should be a multiple of 2 and width should be a multiple of 16. 
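// The template change further down implements both entry points with one
// helper that takes a 9-entry coefficient-index table, so the BGR variant is
// just the RGB variant with the R and B coefficient slots swapped - a rough
// sketch of the idea (not the actual code):
//
//     static const uint8_t x_rgb[9] = { RY_IDX, GY_IDX, BY_IDX, RU_IDX, GU_IDX, BU_IDX, RV_IDX, GV_IDX, BV_IDX };
//     static const uint8_t x_bgr[9] = { BY_IDX, GY_IDX, RY_IDX, BU_IDX, GU_IDX, RU_IDX, BV_IDX, GV_IDX, RV_IDX };
//     Y = ((c[x[0]]*s0 + c[x[1]]*s1 + c[x[2]]*s2) >> RGB2YUV_SHIFT) + 16;
//     U = ((c[x[3]]*s0 + c[x[4]]*s1 + c[x[5]]*s2) >> RGB2YUV_SHIFT) + 128;
//     V = ((c[x[6]]*s0 + c[x[7]]*s1 + c[x[8]]*s2) >> RGB2YUV_SHIFT) + 128;
//
// where s0..s2 are the three bytes of a pixel in memory order, c[] is the
// rgb2yuv coefficient table, and chroma is taken from one pixel per 2x2 block.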
@@ -128,6 +131,10 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv); +extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride); diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index 42c69801ba..e2437826dd 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst, * others are ignored in the C version. * FIXME: Write HQ version. */ -void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, +static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *rgb2yuv) + int chromStride, int srcStride, int32_t *rgb2yuv, + const uint8_t x[9]) { - int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; - int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; - int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; + int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; + int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; int y; const int chromWidth = width >> 1; @@ -707,6 +708,30 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, } } +void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + static const uint8_t x[9] = { + RY_IDX, GY_IDX, BY_IDX, + RU_IDX, GU_IDX, BU_IDX, + RV_IDX, GV_IDX, BV_IDX, + }; + rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); +} + +void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + static const uint8_t x[9] = { + BY_IDX, GY_IDX, RY_IDX, + BU_IDX, GU_IDX, RU_IDX, + BV_IDX, GV_IDX, RV_IDX, + }; + rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); +} + static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride) @@ -980,6 +1005,7 @@ static av_cold void rgb2rgb_init_c(void) yuy2toyv12 = yuy2toyv12_c; planar2x = planar2x_c; ff_rgb24toyv12 = ff_rgb24toyv12_c; + ff_bgr24toyv12 = ff_bgr24toyv12_c; interleaveBytes = interleaveBytes_c; deinterleaveBytes = deinterleaveBytes_c; vu9_to_vu12 = vu9_to_vu12_c; diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index 9af2e7ecc3..9047030ae4 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -1654,6 +1654,23 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } +static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + ff_bgr24toyv12( + src[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY >> 1) * 
dstStride[1], + dst[2] + (srcSliceY >> 1) * dstStride[2], + c->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + c->input_rgb2yuv_table); + if (dst[3]) + fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); + return srcSliceH; +} + static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) @@ -2037,6 +2054,11 @@ void ff_get_unscaled_swscale(SwsContext *c) (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && !(flags & SWS_ACCURATE_RND) && !(dstW&1)) c->convert_unscaled = bgr24ToYv12Wrapper; + /* rgb24toYV12 */ + if (srcFormat == AV_PIX_FMT_RGB24 && + (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && + !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + c->convert_unscaled = rgb24ToYv12Wrapper; /* RGB/BGR -> RGB/BGR (no dither needed forms) */ if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c) -- 2.43.0 From 72cb864f7ce0948b9d06bf13913cf14797d36145 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 11:26:10 +0000 Subject: [PATCH 116/157] swscale: Add unscaled XRGB->YUV420P functions (cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8) --- libswscale/rgb2rgb.c | 20 ++++++ libswscale/rgb2rgb.h | 16 +++++ libswscale/rgb2rgb_template.c | 123 ++++++++++++++++++++++++++++++---- libswscale/swscale_unscaled.c | 89 ++++++++++++++++++++++++ 4 files changed, 236 insertions(+), 12 deletions(-) diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c index 84bb56e60e..c3b9079d2b 100644 --- a/libswscale/rgb2rgb.c +++ b/libswscale/rgb2rgb.c @@ -88,6 +88,26 @@ void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv); +void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, + uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); +void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, + uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); +void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, + uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); +void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, + uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride); void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h index 0028ab345f..a0dd3ffb79 100644 --- a/libswscale/rgb2rgb.h +++ b/libswscale/rgb2rgb.h @@ -135,6 +135,22 @@ extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv); +extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); +extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); +extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int 
width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); +extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride); diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index e2437826dd..703de90690 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -708,30 +708,125 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, } } +static const uint8_t x_rgb[9] = { + RY_IDX, GY_IDX, BY_IDX, + RU_IDX, GU_IDX, BU_IDX, + RV_IDX, GV_IDX, BV_IDX, +}; + +static const uint8_t x_bgr[9] = { + BY_IDX, GY_IDX, RY_IDX, + BU_IDX, GU_IDX, RU_IDX, + BV_IDX, GV_IDX, RV_IDX, +}; + void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv) { - static const uint8_t x[9] = { - RY_IDX, GY_IDX, BY_IDX, - RU_IDX, GU_IDX, BU_IDX, - RV_IDX, GV_IDX, BV_IDX, - }; - rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); + rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); } void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv) { - static const uint8_t x[9] = { - BY_IDX, GY_IDX, RY_IDX, - BU_IDX, GU_IDX, RU_IDX, - BV_IDX, GV_IDX, RV_IDX, - }; - rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x); + rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); } +static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv, + const uint8_t x[9]) +{ + int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; + int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; + int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; + int y; + const int chromWidth = width >> 1; + + for (y = 0; y < height; y += 2) { + int i; + for (i = 0; i < chromWidth; i++) { + unsigned int b = src[8 * i + 2]; + unsigned int g = src[8 * i + 1]; + unsigned int r = src[8 * i + 0]; + + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; + unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; + + udst[i] = U; + vdst[i] = V; + ydst[2 * i] = Y; + + b = src[8 * i + 6]; + g = src[8 * i + 5]; + r = src[8 * i + 4]; + + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } + ydst += lumStride; + src += srcStride; + + if (y+1 == height) + break; + + for (i = 0; i < chromWidth; i++) { + unsigned int b = src[8 * i + 2]; + unsigned int g = src[8 * i + 1]; + unsigned int r = src[8 * i + 0]; + + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + + ydst[2 * i] = Y; + + b = src[8 * i + 6]; + g = src[8 * i + 5]; + r = src[8 * i + 4]; + + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } + udst += chromStride; + vdst += chromStride; + ydst += lumStride; + src += srcStride; + } +} 
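// Worked example of the fixed offsets in the arithmetic above (assuming the
// default limited-range BT.601 coefficient table that swscale normally passes
// in as rgb2yuv): the three luma coefficients sum to roughly (219/255) of
// full scale, so
//     white (255,255,255) -> Y ~ 219 + 16 = 235
//     black (0,0,0)       -> Y =    0 + 16 =  16
//     any grey            -> U = V = 128 (each chroma coefficient row sums to 0)
// i.e. the "+ 16" / "+ 128" terms centre the result in studio-swing YUV.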
+ +static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); +} + +static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); +} + +// As the general code does no SIMD-like ops simply adding 1 to the src address +// will fix the ignored alpha position +static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); +} + +static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); +} + + static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride) @@ -1006,6 +1101,10 @@ static av_cold void rgb2rgb_init_c(void) planar2x = planar2x_c; ff_rgb24toyv12 = ff_rgb24toyv12_c; ff_bgr24toyv12 = ff_bgr24toyv12_c; + ff_rgbxtoyv12 = ff_rgbxtoyv12_c; + ff_bgrxtoyv12 = ff_bgrxtoyv12_c; + ff_xrgbtoyv12 = ff_xrgbtoyv12_c; + ff_xbgrtoyv12 = ff_xbgrtoyv12_c; interleaveBytes = interleaveBytes_c; deinterleaveBytes = deinterleaveBytes_c; vu9_to_vu12 = vu9_to_vu12_c; diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index 9047030ae4..053c06adf5 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -1671,6 +1671,74 @@ static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } +static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + ff_bgrxtoyv12( + src[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY >> 1) * dstStride[1], + dst[2] + (srcSliceY >> 1) * dstStride[2], + c->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + c->input_rgb2yuv_table); + if (dst[3]) + fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); + return srcSliceH; +} + +static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + ff_rgbxtoyv12( + src[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY >> 1) * dstStride[1], + dst[2] + (srcSliceY >> 1) * dstStride[2], + c->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + c->input_rgb2yuv_table); + if (dst[3]) + fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); + return srcSliceH; +} + +static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + ff_xbgrtoyv12( + src[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY >> 1) * dstStride[1], + dst[2] + (srcSliceY >> 1) * dstStride[2], + c->srcW, srcSliceH, + dstStride[0], 
dstStride[1], srcStride[0], + c->input_rgb2yuv_table); + if (dst[3]) + fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); + return srcSliceH; +} + +static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + ff_xrgbtoyv12( + src[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY >> 1) * dstStride[1], + dst[2] + (srcSliceY >> 1) * dstStride[2], + c->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + c->input_rgb2yuv_table); + if (dst[3]) + fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); + return srcSliceH; +} + static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) @@ -2060,6 +2128,27 @@ void ff_get_unscaled_swscale(SwsContext *c) !(flags & SWS_ACCURATE_RND) && !(dstW&1)) c->convert_unscaled = rgb24ToYv12Wrapper; + /* bgrxtoYV12 */ + if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) || + (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && + !(flags & SWS_ACCURATE_RND)) + c->convert_unscaled = bgrxToYv12Wrapper; + /* rgbx24toYV12 */ + if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || + (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && + !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + c->convert_unscaled = rgbxToYv12Wrapper; + /* xbgrtoYV12 */ + if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || + (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && + !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + c->convert_unscaled = xbgrToYv12Wrapper; + /* xrgb24toYV12 */ + if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || + (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && + !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + c->convert_unscaled = xrgbToYv12Wrapper; + /* RGB/BGR -> RGB/BGR (no dither needed forms) */ if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c) && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)))) -- 2.43.0 From 8a81aa6a58912f4e9bb73e0d89f614b64895c3e8 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 11:35:44 +0000 Subject: [PATCH 117/157] swscale: Add aarch64 unscaled RGB24->YUV420P (cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160) --- libswscale/aarch64/rgb2rgb.c | 40 +++++++ libswscale/aarch64/rgb2rgb_neon.S | 181 ++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c index a9bf6ff9e0..6d3e0000dc 100644 --- a/libswscale/aarch64/rgb2rgb.c +++ b/libswscale/aarch64/rgb2rgb.c @@ -30,6 +30,44 @@ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride); +void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); +void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); + +// RGB to YUV asm fns process 16 pixels at once so ensure that the output +// will 
fit into the stride. ARM64 should cope with unaligned SIMD r/w so +// don't test for that +// Fall back to C if we cannot use asm + +static inline int chkw(const int width, const int lumStride, const int chromStride) +{ + const int aw = FFALIGN(width, 16); + return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; +} + +static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv) +{ + if (chkw(width, lumStride, chromStride)) + ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); + else + ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); +} + +static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *bgr2yuv) +{ + if (chkw(width, lumStride, chromStride)) + ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); + else + ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); +} + av_cold void rgb2rgb_init_aarch64(void) { @@ -37,5 +75,7 @@ av_cold void rgb2rgb_init_aarch64(void) if (have_neon(cpu_flags)) { interleaveBytes = ff_interleave_bytes_neon; + ff_rgb24toyv12 = rgb24toyv12_check; + ff_bgr24toyv12 = bgr24toyv12_check; } } diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index d81110ec57..8cf40b65f5 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -77,3 +77,184 @@ function ff_interleave_bytes_neon, export=1 0: ret endfunc + +// void ff_rgb24toyv12_aarch64( +// const uint8_t *src, // x0 +// uint8_t *ydst, // x1 +// uint8_t *udst, // x2 +// uint8_t *vdst, // x3 +// int width, // w4 +// int height, // w5 +// int lumStride, // w6 +// int chromStride, // w7 +// int srcStr, // [sp, #0] +// int32_t *rgb2yuv); // [sp, #8] + +function ff_rgb24toyv12_aarch64, export=1 + ldr x15, [sp, #8] + ld1 {v3.s}[2], [x15], #4 + ld1 {v3.s}[1], [x15], #4 + ld1 {v3.s}[0], [x15], #4 + ld1 {v4.s}[2], [x15], #4 + ld1 {v4.s}[1], [x15], #4 + ld1 {v4.s}[0], [x15], #4 + ld1 {v5.s}[2], [x15], #4 + ld1 {v5.s}[1], [x15], #4 + ld1 {v5.s}[0], [x15] + b 99f +endfunc + +// void ff_bgr24toyv12_aarch64( +// const uint8_t *src, // x0 +// uint8_t *ydst, // x1 +// uint8_t *udst, // x2 +// uint8_t *vdst, // x3 +// int width, // w4 +// int height, // w5 +// int lumStride, // w6 +// int chromStride, // w7 +// int srcStr, // [sp, #0] +// int32_t *rgb2yuv); // [sp, #8] + +function ff_bgr24toyv12_aarch64, export=1 + ldr x15, [sp, #8] + ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[2], [x15] +99: + ldr w14, [sp, #0] + movi v18.8b, #128 + uxtl v17.8h, v18.8b + + // Even line - YUV +1: + mov x10, x0 + mov x11, x1 + mov x12, x2 + mov x13, x3 + mov w9, w4 + +0: + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + + uxtl2 v20.8h, v0.16b + uxtl2 v21.8h, v1.16b + uxtl2 v22.8h, v2.16b + + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + // Y0 + smull v6.4s, v0.4h, v3.h[0] + smull2 v7.4s, v0.8h, v3.h[0] + smlal v6.4s, v1.4h, v4.h[0] + smlal2 v7.4s, v1.8h, v4.h[0] + smlal v6.4s, v2.4h, v5.h[0] + smlal2 v7.4s, v2.8h, v5.h[0] + shrn v6.4h, v6.4s, #12 + shrn2 v6.8h, v7.4s, #12 + add v6.8h, v6.8h, v17.8h // +128 (>> 3 = 16) + uqrshrn v16.8b, v6.8h, #3 + // Y1 + smull v6.4s, 
v20.4h, v3.h[0] + smull2 v7.4s, v20.8h, v3.h[0] + smlal v6.4s, v21.4h, v4.h[0] + smlal2 v7.4s, v21.8h, v4.h[0] + smlal v6.4s, v22.4h, v5.h[0] + smlal2 v7.4s, v22.8h, v5.h[0] + shrn v6.4h, v6.4s, #12 + shrn2 v6.8h, v7.4s, #12 + add v6.8h, v6.8h, v17.8h + uqrshrn2 v16.16b, v6.8h, #3 + // Y0/Y1 + st1 {v16.16b}, [x11], #16 + + uzp1 v0.8h, v0.8h, v20.8h + uzp1 v1.8h, v1.8h, v21.8h + uzp1 v2.8h, v2.8h, v22.8h + + // U + // Vector subscript *2 as we loaded into S but are only using H + smull v6.4s, v0.4h, v3.h[2] + smull2 v7.4s, v0.8h, v3.h[2] + smlal v6.4s, v1.4h, v4.h[2] + smlal2 v7.4s, v1.8h, v4.h[2] + smlal v6.4s, v2.4h, v5.h[2] + smlal2 v7.4s, v2.8h, v5.h[2] + shrn v6.4h, v6.4s, #14 + shrn2 v6.8h, v7.4s, #14 + sqrshrn v6.8b, v6.8h, #1 + add v6.8b, v6.8b, v18.8b // +128 + st1 {v6.8b}, [x12], #8 + + // V + smull v6.4s, v0.4h, v3.h[4] + smull2 v7.4s, v0.8h, v3.h[4] + smlal v6.4s, v1.4h, v4.h[4] + smlal2 v7.4s, v1.8h, v4.h[4] + smlal v6.4s, v2.4h, v5.h[4] + smlal2 v7.4s, v2.8h, v5.h[4] + shrn v6.4h, v6.4s, #14 + shrn2 v6.8h, v7.4s, #14 + sqrshrn v6.8b, v6.8h, #1 + add v6.8b, v6.8b, v18.8b // +128 + st1 {v6.8b}, [x13], #8 + + subs w9, w9, #16 + b.gt 0b + + // Odd line - Y only + + add x0, x0, w14, SXTX + add x1, x1, w6, SXTX + mov x10, x0 + mov x11, x1 + mov w9, w4 + +0: + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + + uxtl2 v20.8h, v0.16b + uxtl2 v21.8h, v1.16b + uxtl2 v22.8h, v2.16b + + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + // Y0 + smull v6.4s, v0.4h, v3.h[0] + smull2 v7.4s, v0.8h, v3.h[0] + smlal v6.4s, v1.4h, v4.h[0] + smlal2 v7.4s, v1.8h, v4.h[0] + smlal v6.4s, v2.4h, v5.h[0] + smlal2 v7.4s, v2.8h, v5.h[0] + shrn v6.4h, v6.4s, #12 + shrn2 v6.8h, v7.4s, #12 + add v6.8h, v6.8h, v17.8h + uqrshrn v16.8b, v6.8h, #3 + // Y1 + smull v6.4s, v20.4h, v3.h[0] + smull2 v7.4s, v20.8h, v3.h[0] + smlal v6.4s, v21.4h, v4.h[0] + smlal2 v7.4s, v21.8h, v4.h[0] + smlal v6.4s, v22.4h, v5.h[0] + smlal2 v7.4s, v22.8h, v5.h[0] + shrn v6.4h, v6.4s, #12 + shrn2 v6.8h, v7.4s, #12 + add v6.8h, v6.8h, v17.8h + uqrshrn2 v16.16b, v6.8h, #3 + // Y0/Y1 + st1 {v16.16b}, [x11], #16 + + subs w9, w9, #16 + b.gt 0b + + add x0, x0, w14, SXTX + add x1, x1, w6, SXTX + add x2, x2, w7, SXTX + add x3, x3, w7, SXTX + subs w5, w5, #2 + b.gt 1b + + ret +endfunc -- 2.43.0 From 0bf32c5314f8c96a203c3250138640fa6f664f6a Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 27 Apr 2023 13:03:52 +0000 Subject: [PATCH 118/157] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh (cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d) --- libswscale/aarch64/rgb2rgb.c | 5 +- libswscale/aarch64/rgb2rgb_neon.S | 440 ++++++++++++++++++++++++------ 2 files changed, 355 insertions(+), 90 deletions(-) diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c index 6d3e0000dc..f10c4ef2de 100644 --- a/libswscale/aarch64/rgb2rgb.c +++ b/libswscale/aarch64/rgb2rgb.c @@ -44,8 +44,9 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, static inline int chkw(const int width, const int lumStride, const int chromStride) { - const int aw = FFALIGN(width, 16); - return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; +// const int aw = FFALIGN(width, 16); +// return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; + return 1; } static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 8cf40b65f5..978ab443ea 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ 
b/libswscale/aarch64/rgb2rgb_neon.S @@ -116,6 +116,25 @@ endfunc // int srcStr, // [sp, #0] // int32_t *rgb2yuv); // [sp, #8] +// regs +// v0-2 Src bytes - reused as chroma src +// v3-5 Coeffs (packed very inefficiently - could be squashed) +// v6 128b +// v7 128h +// v8-15 Reserved +// v16-18 Lo Src expanded as H +// v19 - +// v20-22 Hi Src expanded as H +// v23 - +// v24 U out +// v25 U tmp +// v26 Y out +// v27-29 Y tmp +// v30 V out +// v31 V tmp + +// Assumes Little Endian in tail stores & conversion matrix + function ff_bgr24toyv12_aarch64, export=1 ldr x15, [sp, #8] ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 @@ -123,138 +142,383 @@ function ff_bgr24toyv12_aarch64, export=1 ld3 {v3.s, v4.s, v5.s}[2], [x15] 99: ldr w14, [sp, #0] - movi v18.8b, #128 - uxtl v17.8h, v18.8b - - // Even line - YUV + movi v7.8b, #128 + uxtl v6.8h, v7.8b + // Ensure if nothing to do then we do nothing + cmp w4, #0 + b.le 90f + cmp w5, #0 + b.le 90f + // If w % 16 != 0 then -16 so we do main loop 1 fewer times with + // the remainder done in the tail + tst w4, #15 + b.eq 1f + sub w4, w4, #16 1: + +// -------------------- Even line body - YUV +11: + subs w9, w4, #0 mov x10, x0 mov x11, x1 mov x12, x2 mov x13, x3 - mov w9, w4 + b.lt 12f -0: ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + subs w9, w9, #16 + b.le 13f + +10: + uxtl v16.8h, v0.8b + uxtl v17.8h, v1.8b + uxtl v18.8h, v2.8b uxtl2 v20.8h, v0.16b uxtl2 v21.8h, v1.16b uxtl2 v22.8h, v2.16b - uxtl v0.8h, v0.8b - uxtl v1.8h, v1.8b - uxtl v2.8h, v2.8b + bic v0.8h, #0xff, LSL #8 + bic v1.8h, #0xff, LSL #8 + bic v2.8h, #0xff, LSL #8 + + // Testing shows it is faster to stack the smull/smlal ops together + // rather than interleave them between channels and indeed even the + // shift/add sections seem happier not interleaved + // Y0 - smull v6.4s, v0.4h, v3.h[0] - smull2 v7.4s, v0.8h, v3.h[0] - smlal v6.4s, v1.4h, v4.h[0] - smlal2 v7.4s, v1.8h, v4.h[0] - smlal v6.4s, v2.4h, v5.h[0] - smlal2 v7.4s, v2.8h, v5.h[0] - shrn v6.4h, v6.4s, #12 - shrn2 v6.8h, v7.4s, #12 - add v6.8h, v6.8h, v17.8h // +128 (>> 3 = 16) - uqrshrn v16.8b, v6.8h, #3 + smull v26.4s, v16.4h, v3.h[0] + smlal v26.4s, v17.4h, v4.h[0] + smlal v26.4s, v18.4h, v5.h[0] + smull2 v27.4s, v16.8h, v3.h[0] + smlal2 v27.4s, v17.8h, v4.h[0] + smlal2 v27.4s, v18.8h, v5.h[0] // Y1 - smull v6.4s, v20.4h, v3.h[0] - smull2 v7.4s, v20.8h, v3.h[0] - smlal v6.4s, v21.4h, v4.h[0] - smlal2 v7.4s, v21.8h, v4.h[0] - smlal v6.4s, v22.4h, v5.h[0] - smlal2 v7.4s, v22.8h, v5.h[0] - shrn v6.4h, v6.4s, #12 - shrn2 v6.8h, v7.4s, #12 - add v6.8h, v6.8h, v17.8h - uqrshrn2 v16.16b, v6.8h, #3 + smull v28.4s, v20.4h, v3.h[0] + smlal v28.4s, v21.4h, v4.h[0] + smlal v28.4s, v22.4h, v5.h[0] + smull2 v29.4s, v20.8h, v3.h[0] + smlal2 v29.4s, v21.8h, v4.h[0] + smlal2 v29.4s, v22.8h, v5.h[0] + shrn v26.4h, v26.4s, #12 + shrn2 v26.8h, v27.4s, #12 + add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) + uqrshrn v26.8b, v26.8h, #3 + shrn v28.4h, v28.4s, #12 + shrn2 v28.8h, v29.4s, #12 + add v28.8h, v28.8h, v6.8h + uqrshrn2 v26.16b, v28.8h, #3 // Y0/Y1 - st1 {v16.16b}, [x11], #16 - - uzp1 v0.8h, v0.8h, v20.8h - uzp1 v1.8h, v1.8h, v21.8h - uzp1 v2.8h, v2.8h, v22.8h // U // Vector subscript *2 as we loaded into S but are only using H - smull v6.4s, v0.4h, v3.h[2] - smull2 v7.4s, v0.8h, v3.h[2] - smlal v6.4s, v1.4h, v4.h[2] - smlal2 v7.4s, v1.8h, v4.h[2] - smlal v6.4s, v2.4h, v5.h[2] - smlal2 v7.4s, v2.8h, v5.h[2] - shrn v6.4h, v6.4s, #14 - shrn2 v6.8h, v7.4s, #14 - sqrshrn v6.8b, v6.8h, #1 - add v6.8b, v6.8b, v18.8b // +128 - st1 {v6.8b}, [x12], #8 + smull 
v24.4s, v0.4h, v3.h[2] + smlal v24.4s, v1.4h, v4.h[2] + smlal v24.4s, v2.4h, v5.h[2] + smull2 v25.4s, v0.8h, v3.h[2] + smlal2 v25.4s, v1.8h, v4.h[2] + smlal2 v25.4s, v2.8h, v5.h[2] // V - smull v6.4s, v0.4h, v3.h[4] - smull2 v7.4s, v0.8h, v3.h[4] - smlal v6.4s, v1.4h, v4.h[4] - smlal2 v7.4s, v1.8h, v4.h[4] - smlal v6.4s, v2.4h, v5.h[4] - smlal2 v7.4s, v2.8h, v5.h[4] - shrn v6.4h, v6.4s, #14 - shrn2 v6.8h, v7.4s, #14 - sqrshrn v6.8b, v6.8h, #1 - add v6.8b, v6.8b, v18.8b // +128 - st1 {v6.8b}, [x13], #8 + smull v30.4s, v0.4h, v3.h[4] + smlal v30.4s, v1.4h, v4.h[4] + smlal v30.4s, v2.4h, v5.h[4] + smull2 v31.4s, v0.8h, v3.h[4] + smlal2 v31.4s, v1.8h, v4.h[4] + smlal2 v31.4s, v2.8h, v5.h[4] + + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + + shrn v24.4h, v24.4s, #14 + shrn2 v24.8h, v25.4s, #14 + sqrshrn v24.8b, v24.8h, #1 + add v24.8b, v24.8b, v7.8b // +128 + shrn v30.4h, v30.4s, #14 + shrn2 v30.8h, v31.4s, #14 + sqrshrn v30.8b, v30.8h, #1 + add v30.8b, v30.8b, v7.8b // +128 subs w9, w9, #16 - b.gt 0b - // Odd line - Y only + st1 {v26.16b}, [x11], #16 + st1 {v24.8b}, [x12], #8 + st1 {v30.8b}, [x13], #8 + + b.gt 10b + +// -------------------- Even line tail - YUV +// If width % 16 == 0 then simply runs once with preloaded RGB +// If other then deals with preload & then does remaining tail + +13: + // Body is simple copy of main loop body minus preload + + uxtl v16.8h, v0.8b + uxtl v17.8h, v1.8b + uxtl v18.8h, v2.8b + + uxtl2 v20.8h, v0.16b + uxtl2 v21.8h, v1.16b + uxtl2 v22.8h, v2.16b + + bic v0.8h, #0xff, LSL #8 + bic v1.8h, #0xff, LSL #8 + bic v2.8h, #0xff, LSL #8 + + // Y0 + smull v26.4s, v16.4h, v3.h[0] + smlal v26.4s, v17.4h, v4.h[0] + smlal v26.4s, v18.4h, v5.h[0] + smull2 v27.4s, v16.8h, v3.h[0] + smlal2 v27.4s, v17.8h, v4.h[0] + smlal2 v27.4s, v18.8h, v5.h[0] + // Y1 + smull v28.4s, v20.4h, v3.h[0] + smlal v28.4s, v21.4h, v4.h[0] + smlal v28.4s, v22.4h, v5.h[0] + smull2 v29.4s, v20.8h, v3.h[0] + smlal2 v29.4s, v21.8h, v4.h[0] + smlal2 v29.4s, v22.8h, v5.h[0] + shrn v26.4h, v26.4s, #12 + shrn2 v26.8h, v27.4s, #12 + add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) + uqrshrn v26.8b, v26.8h, #3 + shrn v28.4h, v28.4s, #12 + shrn2 v28.8h, v29.4s, #12 + add v28.8h, v28.8h, v6.8h + uqrshrn2 v26.16b, v28.8h, #3 + // Y0/Y1 + + // U + // Vector subscript *2 as we loaded into S but are only using H + smull v24.4s, v0.4h, v3.h[2] + smlal v24.4s, v1.4h, v4.h[2] + smlal v24.4s, v2.4h, v5.h[2] + smull2 v25.4s, v0.8h, v3.h[2] + smlal2 v25.4s, v1.8h, v4.h[2] + smlal2 v25.4s, v2.8h, v5.h[2] + // V + smull v30.4s, v0.4h, v3.h[4] + smlal v30.4s, v1.4h, v4.h[4] + smlal v30.4s, v2.4h, v5.h[4] + smull2 v31.4s, v0.8h, v3.h[4] + smlal2 v31.4s, v1.8h, v4.h[4] + smlal2 v31.4s, v2.8h, v5.h[4] + + cmp w9, #-16 + + shrn v24.4h, v24.4s, #14 + shrn2 v24.8h, v25.4s, #14 + sqrshrn v24.8b, v24.8h, #1 + add v24.8b, v24.8b, v7.8b // +128 + shrn v30.4h, v30.4s, #14 + shrn2 v30.8h, v31.4s, #14 + sqrshrn v30.8b, v30.8h, #1 + add v30.8b, v30.8b, v7.8b // +128 + + // Here: + // w9 == 0 width % 16 == 0, tail done + // w9 > -16 1st tail done (16 pels), remainder still to go + // w9 == -16 shouldn't happen + // w9 > -32 2nd tail done + // w9 <= -32 shouldn't happen + + b.lt 2f + st1 {v26.16b}, [x11], #16 + st1 {v24.8b}, [x12], #8 + st1 {v30.8b}, [x13], #8 + cbz w9, 3f + +12: + sub w9, w9, #16 + + tbz w9, #3, 1f + ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 +1: tbz w9, #2, 1f + ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 +1: tbz w9, 
#1, 1f + ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 +1: tbz w9, #0, 13b + ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 + b 13b + +2: + tbz w9, #3, 1f + st1 {v26.8b}, [x11], #8 + st1 {v24.s}[0], [x12], #4 + st1 {v30.s}[0], [x13], #4 +1: tbz w9, #2, 1f + st1 {v26.s}[2], [x11], #4 + st1 {v24.h}[2], [x12], #2 + st1 {v30.h}[2], [x13], #2 +1: tbz w9, #1, 1f + st1 {v26.h}[6], [x11], #2 + st1 {v24.b}[6], [x12], #1 + st1 {v30.b}[6], [x13], #1 +1: tbz w9, #0, 1f + st1 {v26.b}[14], [x11] + st1 {v24.b}[7], [x12] + st1 {v30.b}[7], [x13] +1: +3: + +// -------------------- Odd line body - Y only + + subs w5, w5, #1 + b.eq 90f + + subs w9, w4, #0 add x0, x0, w14, SXTX add x1, x1, w6, SXTX mov x10, x0 mov x11, x1 - mov w9, w4 + b.lt 12f -0: ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + subs w9, w9, #16 + b.le 13f + +10: + uxtl v16.8h, v0.8b + uxtl v17.8h, v1.8b + uxtl v18.8h, v2.8b uxtl2 v20.8h, v0.16b uxtl2 v21.8h, v1.16b uxtl2 v22.8h, v2.16b - uxtl v0.8h, v0.8b - uxtl v1.8h, v1.8b - uxtl v2.8h, v2.8b + // Testing shows it is faster to stack the smull/smlal ops together + // rather than interleave them between channels and indeed even the + // shift/add sections seem happier not interleaved + // Y0 - smull v6.4s, v0.4h, v3.h[0] - smull2 v7.4s, v0.8h, v3.h[0] - smlal v6.4s, v1.4h, v4.h[0] - smlal2 v7.4s, v1.8h, v4.h[0] - smlal v6.4s, v2.4h, v5.h[0] - smlal2 v7.4s, v2.8h, v5.h[0] - shrn v6.4h, v6.4s, #12 - shrn2 v6.8h, v7.4s, #12 - add v6.8h, v6.8h, v17.8h - uqrshrn v16.8b, v6.8h, #3 + smull v26.4s, v16.4h, v3.h[0] + smlal v26.4s, v17.4h, v4.h[0] + smlal v26.4s, v18.4h, v5.h[0] + smull2 v27.4s, v16.8h, v3.h[0] + smlal2 v27.4s, v17.8h, v4.h[0] + smlal2 v27.4s, v18.8h, v5.h[0] // Y1 - smull v6.4s, v20.4h, v3.h[0] - smull2 v7.4s, v20.8h, v3.h[0] - smlal v6.4s, v21.4h, v4.h[0] - smlal2 v7.4s, v21.8h, v4.h[0] - smlal v6.4s, v22.4h, v5.h[0] - smlal2 v7.4s, v22.8h, v5.h[0] - shrn v6.4h, v6.4s, #12 - shrn2 v6.8h, v7.4s, #12 - add v6.8h, v6.8h, v17.8h - uqrshrn2 v16.16b, v6.8h, #3 + smull v28.4s, v20.4h, v3.h[0] + smlal v28.4s, v21.4h, v4.h[0] + smlal v28.4s, v22.4h, v5.h[0] + smull2 v29.4s, v20.8h, v3.h[0] + smlal2 v29.4s, v21.8h, v4.h[0] + smlal2 v29.4s, v22.8h, v5.h[0] + + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + + shrn v26.4h, v26.4s, #12 + shrn2 v26.8h, v27.4s, #12 + add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) + uqrshrn v26.8b, v26.8h, #3 + shrn v28.4h, v28.4s, #12 + shrn2 v28.8h, v29.4s, #12 + add v28.8h, v28.8h, v6.8h + uqrshrn2 v26.16b, v28.8h, #3 // Y0/Y1 - st1 {v16.16b}, [x11], #16 subs w9, w9, #16 - b.gt 0b + + st1 {v26.16b}, [x11], #16 + + b.gt 10b + +// -------------------- Odd line tail - Y +// If width % 16 == 0 then simply runs once with preloaded RGB +// If other then deals with preload & then does remaining tail + +13: + // Body is simple copy of main loop body minus preload + + uxtl v16.8h, v0.8b + uxtl v17.8h, v1.8b + uxtl v18.8h, v2.8b + + uxtl2 v20.8h, v0.16b + uxtl2 v21.8h, v1.16b + uxtl2 v22.8h, v2.16b + + // Y0 + smull v26.4s, v16.4h, v3.h[0] + smlal v26.4s, v17.4h, v4.h[0] + smlal v26.4s, v18.4h, v5.h[0] + smull2 v27.4s, v16.8h, v3.h[0] + smlal2 v27.4s, v17.8h, v4.h[0] + smlal2 v27.4s, v18.8h, v5.h[0] + // Y1 + smull v28.4s, v20.4h, v3.h[0] + smlal v28.4s, v21.4h, v4.h[0] + smlal v28.4s, v22.4h, v5.h[0] + smull2 v29.4s, v20.8h, v3.h[0] + smlal2 v29.4s, v21.8h, v4.h[0] + smlal2 v29.4s, v22.8h, v5.h[0] + + cmp w9, #-16 + + shrn v26.4h, v26.4s, #12 + shrn2 v26.8h, v27.4s, #12 + add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) + uqrshrn v26.8b, v26.8h, #3 + shrn 
v28.4h, v28.4s, #12 + shrn2 v28.8h, v29.4s, #12 + add v28.8h, v28.8h, v6.8h + uqrshrn2 v26.16b, v28.8h, #3 + // Y0/Y1 + + // Here: + // w9 == 0 width % 16 == 0, tail done + // w9 > -16 1st tail done (16 pels), remainder still to go + // w9 == -16 shouldn't happen + // w9 > -32 2nd tail done + // w9 <= -32 shouldn't happen + + b.lt 2f + st1 {v26.16b}, [x11], #16 + cbz w9, 3f + +12: + sub w9, w9, #16 + + tbz w9, #3, 1f + ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 +1: tbz w9, #2, 1f + ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 +1: tbz w9, #1, 1f + ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 +1: tbz w9, #0, 13b + ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 + b 13b + +2: + tbz w9, #3, 1f + st1 {v26.8b}, [x11], #8 +1: tbz w9, #2, 1f + st1 {v26.s}[2], [x11], #4 +1: tbz w9, #1, 1f + st1 {v26.h}[6], [x11], #2 +1: tbz w9, #0, 1f + st1 {v26.b}[14], [x11] +1: +3: + +// ------------------- Loop to start add x0, x0, w14, SXTX add x1, x1, w6, SXTX add x2, x2, w7, SXTX add x3, x3, w7, SXTX - subs w5, w5, #2 - b.gt 1b - + subs w5, w5, #1 + b.gt 11b +90: ret endfunc -- 2.43.0 From 528ade3e9c31a9be7551fd524e986a425901e055 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 26 Apr 2023 15:36:07 +0000 Subject: [PATCH 119/157] rgb2rgb: Use asm unconditionally (cherry picked from commit 7c216c0804836b31c0ea093bb1dde5ab387724b1) --- libswscale/aarch64/rgb2rgb.c | 37 ++---------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c index f10c4ef2de..6a0e2dcc09 100644 --- a/libswscale/aarch64/rgb2rgb.c +++ b/libswscale/aarch64/rgb2rgb.c @@ -37,46 +37,13 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv); -// RGB to YUV asm fns process 16 pixels at once so ensure that the output -// will fit into the stride. 
ARM64 should cope with unaligned SIMD r/w so -// don't test for that -// Fall back to C if we cannot use asm - -static inline int chkw(const int width, const int lumStride, const int chromStride) -{ -// const int aw = FFALIGN(width, 16); -// return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2; - return 1; -} - -static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *rgb2yuv) -{ - if (chkw(width, lumStride, chromStride)) - ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); - else - ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv); -} - -static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst, - uint8_t *vdst, int width, int height, int lumStride, - int chromStride, int srcStride, int32_t *bgr2yuv) -{ - if (chkw(width, lumStride, chromStride)) - ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); - else - ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv); -} - - av_cold void rgb2rgb_init_aarch64(void) { int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { interleaveBytes = ff_interleave_bytes_neon; - ff_rgb24toyv12 = rgb24toyv12_check; - ff_bgr24toyv12 = bgr24toyv12_check; + ff_rgb24toyv12 = ff_rgb24toyv12_aarch64; + ff_bgr24toyv12 = ff_bgr24toyv12_aarch64; } } -- 2.43.0 From 50599f9ca10c2ee6a78eb1397fdd8745c025b28b Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 27 Apr 2023 13:01:43 +0000 Subject: [PATCH 120/157] tests/swscale: Add options for width and height on the command line (cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271) --- libswscale/tests/swscale.c | 84 ++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c index 6c38041ddb..4cf41d9f64 100644 --- a/libswscale/tests/swscale.c +++ b/libswscale/tests/swscale.c @@ -355,56 +355,71 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4], return 0; } -#define W 96 -#define H 96 - int main(int argc, char **argv) { + unsigned int W = 96; + unsigned int H = 96; + unsigned int W2; + unsigned int H2; + unsigned int S; enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE; enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE; - uint8_t *rgb_data = av_malloc(W * H * 4); - const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL }; - int rgb_stride[4] = { 4 * W, 0, 0, 0 }; - uint8_t *data = av_malloc(4 * W * H); - const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 }; - int stride[4] = { W, W, W, W }; int x, y; struct SwsContext *sws; AVLFG rand; int res = -1; int i; FILE *fp = NULL; - - if (!rgb_data || !data) - return -1; + uint8_t *rgb_data; + uint8_t * rgb_src[4] = { NULL }; + int rgb_stride[4] = { 0 }; + uint8_t *data; + uint8_t * src[4] = { NULL }; + int stride[4] = { 0 }; for (i = 1; i < argc; i += 2) { + const char * const arg2 = argv[i+1]; + if (argv[i][0] != '-' || i + 1 == argc) goto bad_option; if (!strcmp(argv[i], "-ref")) { - fp = fopen(argv[i + 1], "r"); + fp = fopen(arg2, "r"); if (!fp) { - fprintf(stderr, "could not open '%s'\n", argv[i + 1]); + fprintf(stderr, "could not open '%s'\n", arg2); goto error; } } else if (!strcmp(argv[i], "-cpuflags")) { unsigned flags = av_get_cpu_flags(); - int ret 
= av_parse_cpu_caps(&flags, argv[i + 1]); + int ret = av_parse_cpu_caps(&flags, arg2); if (ret < 0) { - fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]); + fprintf(stderr, "invalid cpu flags %s\n", arg2); return ret; } av_force_cpu_flags(flags); } else if (!strcmp(argv[i], "-src")) { - srcFormat = av_get_pix_fmt(argv[i + 1]); + srcFormat = av_get_pix_fmt(arg2); if (srcFormat == AV_PIX_FMT_NONE) { - fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); + fprintf(stderr, "invalid pixel format %s\n", arg2); return -1; } } else if (!strcmp(argv[i], "-dst")) { - dstFormat = av_get_pix_fmt(argv[i + 1]); + dstFormat = av_get_pix_fmt(arg2); if (dstFormat == AV_PIX_FMT_NONE) { - fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); + fprintf(stderr, "invalid pixel format %s\n", arg2); + return -1; + } + } else if (!strcmp(argv[i], "-w")) { + char * p = NULL; + W = strtoul(arg2, &p, 0); + if (!W || *p) { + fprintf(stderr, "bad width %s\n", arg2); + return -1; + } + } else if (!strcmp(argv[i], "-h")) { + char * p = NULL; + H = strtoul(arg2, &p, 0); + if (!H || *p) { + fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p); return -1; } } else { @@ -414,15 +429,34 @@ bad_option: } } - sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, + S = (W + 15) & ~15; + rgb_data = av_mallocz(S * H * 4); + rgb_src[0] = rgb_data; + rgb_stride[0] = 4 * S; + data = av_mallocz(4 * S * H); + src[0] = data; + src[1] = data + S * H; + src[2] = data + S * H * 2; + src[3] = data + S * H * 3; + stride[0] = S; + stride[1] = S; + stride[2] = S; + stride[3] = S; + H2 = H < 96 ? 8 : H / 12; + W2 = W < 96 ? 8 : W / 12; + + if (!rgb_data || !data) + return -1; + + sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H, AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); av_lfg_init(&rand, 1); for (y = 0; y < H; y++) for (x = 0; x < W * 4; x++) - rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); - res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride); + rgb_data[ x + y * 4 * S] = av_lfg_get(&rand); + res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride); if (res < 0 || res != H) { res = -1; goto error; @@ -431,10 +465,10 @@ bad_option: av_free(rgb_data); if(fp) { - res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); + res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat); fclose(fp); } else { - selfTest(src, stride, W, H, srcFormat, dstFormat); + selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat); res = 0; } error: -- 2.43.0 From 4c7e996b5c223978043924fe14525ddeadbbe2a2 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 26 Apr 2023 16:31:23 +0000 Subject: [PATCH 121/157] tests/swscale: Add a timing option -t Where n is the number of time to loop the scale op. 
Often useful to do it 10 times or so for better resolution (cherry picked from commit 50cd60a23a66254f911376602d07b30fcafbde96) --- libswscale/tests/swscale.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c index 4cf41d9f64..12776ffec7 100644 --- a/libswscale/tests/swscale.c +++ b/libswscale/tests/swscale.c @@ -23,6 +23,7 @@ #include #include #include +#include #undef HAVE_AV_CONFIG_H #include "libavutil/cpu.h" @@ -78,6 +79,15 @@ struct Results { uint32_t crc; }; +static int time_rep = 0; + +static uint64_t utime(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000; +} + // test by ref -> src -> dst -> out & compare out against ref // ref & out are YV12 static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, @@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, goto end; } - printf(" %s %dx%d -> %s %3dx%3d flags=%2d", + printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d", desc_src->name, srcW, srcH, desc_dst->name, dstW, dstH, flags); @@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); + if (time_rep != 0) + { + const uint64_t now = utime(); + uint64_t done; + for (i = 1; i != time_rep; ++i) { + sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); + } + done = utime(); + printf(" T=%7"PRId64"us ", done-now); + } + for (i = 0; i < 4 && dstStride[i]; i++) crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], dstStride[i] * dstH); @@ -419,7 +440,14 @@ int main(int argc, char **argv) char * p = NULL; H = strtoul(arg2, &p, 0); if (!H || *p) { - fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p); + fprintf(stderr, "bad height '%s'\n", arg2); + return -1; + } + } else if (!strcmp(argv[i], "-t")) { + char * p = NULL; + time_rep = (int)strtol(arg2, &p, 0); + if (*p) { + fprintf(stderr, "bad time repetitions '%s'\n", arg2); return -1; } } else { -- 2.43.0 From 7596d5c8316f58cc0275139328ab69bd2a3ebdfd Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 13:40:36 +0000 Subject: [PATCH 122/157] swscale: RGB->YUV420 fix C template to allow odd widths (cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8) --- libswscale/rgb2rgb_template.c | 44 +++++++++++++++++++++++++++++++++++ libswscale/swscale_unscaled.c | 11 ++++----- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index 703de90690..e711589e1e 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -679,6 +679,19 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i + 1] = Y; } + if ((width & 1) != 0) { + unsigned int b = src[6 * i + 0]; + unsigned int g = src[6 * i + 1]; + unsigned int r = src[6 * i + 2]; + + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; + unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; + + udst[i] = U; + vdst[i] = V; + ydst[2 * i] = Y; + } ydst += lumStride; src += srcStride; @@ -701,6 +714,15 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, 
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i + 1] = Y; } + if ((width & 1) != 0) { + unsigned int b = src[6 * i + 0]; + unsigned int g = src[6 * i + 1]; + unsigned int r = src[6 * i + 2]; + + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + + ydst[2 * i] = Y; + } udst += chromStride; vdst += chromStride; ydst += lumStride; @@ -767,6 +789,19 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i + 1] = Y; } + if ((width & 1) != 0) { + unsigned int b = src[8 * i + 2]; + unsigned int g = src[8 * i + 1]; + unsigned int r = src[8 * i + 0]; + + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; + unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; + + udst[i] = U; + vdst[i] = V; + ydst[2 * i] = Y; + } ydst += lumStride; src += srcStride; @@ -789,6 +824,15 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ydst[2 * i + 1] = Y; } + if ((width & 1) != 0) { + unsigned int b = src[8 * i + 2]; + unsigned int g = src[8 * i + 1]; + unsigned int r = src[8 * i + 0]; + + unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + + ydst[2 * i] = Y; + } udst += chromStride; vdst += chromStride; ydst += lumStride; diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index 053c06adf5..52469b2e4a 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -2062,7 +2062,6 @@ void ff_get_unscaled_swscale(SwsContext *c) const enum AVPixelFormat dstFormat = c->dstFormat; const int flags = c->flags; const int dstH = c->dstH; - const int dstW = c->dstW; int needsDither; needsDither = isAnyRGB(dstFormat) && @@ -2120,12 +2119,12 @@ void ff_get_unscaled_swscale(SwsContext *c) /* bgr24toYV12 */ if (srcFormat == AV_PIX_FMT_BGR24 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + !(flags & SWS_ACCURATE_RND)) c->convert_unscaled = bgr24ToYv12Wrapper; /* rgb24toYV12 */ if (srcFormat == AV_PIX_FMT_RGB24 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + !(flags & SWS_ACCURATE_RND)) c->convert_unscaled = rgb24ToYv12Wrapper; /* bgrxtoYV12 */ @@ -2136,17 +2135,17 @@ void ff_get_unscaled_swscale(SwsContext *c) /* rgbx24toYV12 */ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + !(flags & SWS_ACCURATE_RND)) c->convert_unscaled = rgbxToYv12Wrapper; /* xbgrtoYV12 */ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + !(flags & SWS_ACCURATE_RND)) c->convert_unscaled = xbgrToYv12Wrapper; /* xrgb24toYV12 */ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && - !(flags & SWS_ACCURATE_RND) && !(dstW&1)) + !(flags & SWS_ACCURATE_RND)) c->convert_unscaled = xrgbToYv12Wrapper; /* RGB/BGR -> RGB/BGR (no dither needed forms) */ -- 2.43.0 From 
4b1ec4de9e461504cff54acd57c351fdd041d510 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 4 May 2023 14:26:14 +0000 Subject: [PATCH 123/157] rtpenc: Add code to send H264 new extradata in sidedata Fixes issue with pi V4L2 H264 encode which cannot create extradata at init time. (cherry picked from commit 4f852b4b093f841b64b4934a6f1720e98e4e0f2c) --- libavformat/rtpenc.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c index a8d296a154..f67dc2a15a 100644 --- a/libavformat/rtpenc.c +++ b/libavformat/rtpenc.c @@ -19,6 +19,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "avc.h" #include "avformat.h" #include "mpegts.h" #include "internal.h" @@ -585,8 +586,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt) ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0); break; case AV_CODEC_ID_H264: + { + uint8_t *side_data; + int side_data_size = 0; + + side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, + &side_data_size); + + if (side_data_size != 0) { + int ps_size = side_data_size; + uint8_t * ps_buf = NULL; + + ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size); + av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size); + ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size); + av_free(ps_buf); + } ff_rtp_send_h264_hevc(s1, pkt->data, size); break; + } case AV_CODEC_ID_H261: ff_rtp_send_h261(s1, pkt->data, size); break; -- 2.43.0 From 0decf019069696cc940eedce36070ddbd58847ad Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 5 Jun 2023 08:34:38 +0000 Subject: [PATCH 124/157] rgb2rgb: Fix luma narrow+saturation instruction (cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a) --- libswscale/aarch64/rgb2rgb_neon.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 978ab443ea..476ca723a0 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -203,11 +203,11 @@ function ff_bgr24toyv12_aarch64, export=1 shrn v26.4h, v26.4s, #12 shrn2 v26.8h, v27.4s, #12 add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - uqrshrn v26.8b, v26.8h, #3 + sqrshrun v26.8b, v26.8h, #3 shrn v28.4h, v28.4s, #12 shrn2 v28.8h, v29.4s, #12 add v28.8h, v28.8h, v6.8h - uqrshrn2 v26.16b, v28.8h, #3 + sqrshrun2 v26.16b, v28.8h, #3 // Y0/Y1 // U @@ -282,11 +282,11 @@ function ff_bgr24toyv12_aarch64, export=1 shrn v26.4h, v26.4s, #12 shrn2 v26.8h, v27.4s, #12 add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - uqrshrn v26.8b, v26.8h, #3 + sqrshrun v26.8b, v26.8h, #3 shrn v28.4h, v28.4s, #12 shrn2 v28.8h, v29.4s, #12 add v28.8h, v28.8h, v6.8h - uqrshrn2 v26.16b, v28.8h, #3 + sqrshrun2 v26.16b, v28.8h, #3 // Y0/Y1 // U @@ -416,11 +416,11 @@ function ff_bgr24toyv12_aarch64, export=1 shrn v26.4h, v26.4s, #12 shrn2 v26.8h, v27.4s, #12 add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - uqrshrn v26.8b, v26.8h, #3 + sqrshrun v26.8b, v26.8h, #3 shrn v28.4h, v28.4s, #12 shrn2 v28.8h, v29.4s, #12 add v28.8h, v28.8h, v6.8h - uqrshrn2 v26.16b, v28.8h, #3 + sqrshrun2 v26.16b, v28.8h, #3 // Y0/Y1 subs w9, w9, #16 @@ -464,11 +464,11 @@ function ff_bgr24toyv12_aarch64, export=1 shrn v26.4h, v26.4s, #12 shrn2 v26.8h, v27.4s, #12 add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - uqrshrn v26.8b, v26.8h, #3 + sqrshrun v26.8b, v26.8h, #3 shrn v28.4h, v28.4s, #12 shrn2 v28.8h, v29.4s, #12 add v28.8h, 
v28.8h, v6.8h - uqrshrn2 v26.16b, v28.8h, #3 + sqrshrun2 v26.16b, v28.8h, #3 // Y0/Y1 // Here: -- 2.43.0 From 47ea04bc69618f537ccd6265972da7126864a3a3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sun, 4 Jun 2023 13:37:59 +0000 Subject: [PATCH 125/157] v4l2_m2m_dec: Tweak pending count to use dts & reorder size (cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a) --- libavcodec/v4l2_m2m.h | 1 + libavcodec/v4l2_m2m_dec.c | 53 +++++++++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index ded1478a49..a506e69d67 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -115,6 +115,7 @@ typedef struct V4L2m2mContext { /* req pkt */ int req_pkt; + int reorder_size; /* Ext data sent */ int extdata_sent; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index d124c7b1fc..13af62e819 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -121,13 +121,18 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len) } #endif -static int64_t pts_stats_guess(const pts_stats_t * const stats) +static unsigned int pts_stats_interval(const pts_stats_t * const stats) +{ + return stats->last_interval; +} + +static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess) { if (stats->last_count <= 1) return stats->last_pts; if (stats->last_pts == AV_NOPTS_VALUE || - stats->last_interval == 0 || - stats->last_count >= STATS_LAST_COUNT_MAX) + fail_bad_guess && (stats->last_interval == 0 || + stats->last_count >= STATS_LAST_COUNT_MAX)) return AV_NOPTS_VALUE; return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; } @@ -345,7 +350,7 @@ set_best_effort_pts(AVCodecContext *const avctx, { pts_stats_add(ps, frame->pts); - frame->best_effort_timestamp = pts_stats_guess(ps); + frame->best_effort_timestamp = pts_stats_guess(ps, 1); // If we can't guess from just PTS - try DTS if (frame->best_effort_timestamp == AV_NOPTS_VALUE) frame->best_effort_timestamp = frame->pkt_dts; @@ -380,15 +385,25 @@ xlat_init(xlat_track_t * const x) } static int -xlat_pending(const xlat_track_t * const x) +xlat_pending(const V4L2m2mContext * const s) { + const xlat_track_t *const x = &s->xlat; unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; int i; - const int64_t now = x->last_pts; + const int64_t now = pts_stats_guess(&s->pts_stat, 0); + int64_t first_dts = AV_NOPTS_VALUE; + int no_dts_count = 0; + unsigned int interval = pts_stats_interval(&s->pts_stat); for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { const V4L2m2mTrackEl * const t = x->track_els + n; + if (first_dts == AV_NOPTS_VALUE) + if (t->dts == AV_NOPTS_VALUE) + ++no_dts_count; + else + first_dts = t->dts; + // Discard only set on never-set or flushed entries // So if we get here we've never successfully decoded a frame so allow // more frames into the buffer before stalling @@ -408,6 +423,18 @@ xlat_pending(const xlat_track_t * const x) break; } + if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) { + const int iframes = (first_dts - now) / (int)interval; + const int t = iframes - s->reorder_size + no_dts_count; + +// av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n", +// x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count); + + if (iframes > 0 && iframes < 64 && t < i) { + 
return t; + } + } + return i; } @@ -585,12 +612,12 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) { V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; - int src_rv = NQ_OK; + int src_rv = -1; int dst_rv = 1; // Non-zero (done), non-negative (error) number unsigned int i = 0; do { - const int pending = xlat_pending(&s->xlat); + const int pending = xlat_pending(s); const int prefer_dq = (pending > 4); const int last_src_rv = src_rv; @@ -966,8 +993,10 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx) } static void -parse_extradata(AVCodecContext *avctx) +parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) { + s->reorder_size = 0; + if (!avctx->extradata || !avctx->extradata_size) return; @@ -996,6 +1025,7 @@ parse_extradata(AVCodecContext *avctx) avctx->profile = ff_h264_get_profile(sps); avctx->level = sps->level_idc; } + s->reorder_size = sps->num_reorder_frames; } ff_h264_ps_uninit(&ps); break; @@ -1025,6 +1055,7 @@ parse_extradata(AVCodecContext *avctx) if (sps) { avctx->profile = sps->ptl.general_ptl.profile_idc; avctx->level = sps->ptl.general_ptl.level_idc; + s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering; } } ff_hevc_ps_uninit(&ps); @@ -1057,12 +1088,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) avctx->ticks_per_frame = 2; } - parse_extradata(avctx); - ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; + parse_extradata(avctx, s); + xlat_init(&s->xlat); pts_stats_init(&s->pts_stat, avctx, "decoder"); -- 2.43.0 From 1da8444db4cd670b8425818847fb5fdfcf6fe063 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 7 Jun 2023 11:14:52 +0000 Subject: [PATCH 126/157] v4l2_m2m: Add encode size check Previously an out of bounds size would fail whilst trying to copy the buffer with an unhelpful message. This produces a better error at init time. 
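The check is the standard V4L2 negotiation pattern: offer the requested raw-frame format on the OUTPUT queue with VIDIOC_TRY_FMT and treat a size that comes back smaller than asked for as a refusal. A stand-alone sketch of the same idea, independent of the FFmpeg context structures (the helper name is hypothetical and fd is assumed to be an already-open M2M device):

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

// Return 0 if the device accepts w x h raw frames of the given pixel format
// on its OUTPUT (to-be-encoded) queue, a negative errno value otherwise.
static int try_output_size(int fd, uint32_t pixelformat,
                           unsigned int w, unsigned int h, int multiplanar)
{
    struct v4l2_format fmt = {
        .type = multiplanar ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE
                            : V4L2_BUF_TYPE_VIDEO_OUTPUT,
    };

    if (multiplanar) {
        fmt.fmt.pix_mp.pixelformat = pixelformat;
        fmt.fmt.pix_mp.width       = w;
        fmt.fmt.pix_mp.height      = h;
    } else {
        fmt.fmt.pix.pixelformat = pixelformat;
        fmt.fmt.pix.width        = w;
        fmt.fmt.pix.height       = h;
    }

    if (ioctl(fd, VIDIOC_TRY_FMT, &fmt) != 0)
        return -errno;

    // Drivers adjust an unsupported size rather than erroring, so compare
    // what comes back against what was requested.
    if (multiplanar ? (fmt.fmt.pix_mp.width < w || fmt.fmt.pix_mp.height < h)
                    : (fmt.fmt.pix.width    < w || fmt.fmt.pix.height    < h))
        return -EINVAL;
    return 0;
}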
(cherry picked from commit 0b61c4617e26f043d28d44c8767f7b9fd4882f97) --- libavcodec/v4l2_m2m.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index f802687b1b..28d9ed4988 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -109,6 +109,44 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) return AVERROR(EINVAL); } +static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ + struct v4l2_format fmt = {.type = s->output.type}; + int rv; + uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt); + unsigned int w; + unsigned int h; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { + fmt.fmt.pix_mp.pixelformat = pixfmt; + fmt.fmt.pix_mp.width = avctx->width; + fmt.fmt.pix_mp.height = avctx->height; + } + else { + fmt.fmt.pix.pixelformat = pixfmt; + fmt.fmt.pix.width = avctx->width; + fmt.fmt.pix.height = avctx->height; + } + + rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt); + + if (rv != 0) { + rv = AVERROR(errno); + av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv)); + return rv; + } + + w = ff_v4l2_get_format_width(&fmt); + h = ff_v4l2_get_format_height(&fmt); + + if (w < avctx->width || h < avctx->height) { + av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, avctx->width, avctx->height, w, h); + return AVERROR(EINVAL); + } + + return 0; +} + static int v4l2_probe_driver(V4L2m2mContext *s) { void *log_ctx = s->avctx; @@ -128,6 +166,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s) goto done; } + // If being given frames (encode) check that V4L2 can cope with the size + if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO && + (ret = check_size(s->avctx, s)) != 0) + goto done; + ret = ff_v4l2_context_get_format(&s->capture, 1); if (ret) { av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n"); -- 2.43.0 From 031c81388ead3a0e5188a9869d0ef07981e7c616 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 9 Jun 2023 10:28:12 +0000 Subject: [PATCH 127/157] vf_bwdif: Add attributes to ask for vectorization (cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7) --- libavfilter/vf_bwdif.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 65c617ebb3..09e68523bb 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -74,10 +74,10 @@ typedef struct ThreadData { int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ - \ + {/*\ if (!diff) { \ dst[0] = d; \ - } else { + } else {*/ #define SPAT_CHECK() \ int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ @@ -89,15 +89,16 @@ typedef struct ThreadData { diff = FFMAX3(diff, min, -max); #define FILTER_LINE() \ + int i1, i2; \ SPAT_CHECK() \ - if (FFABS(c - e) > temporal_diff0) { \ - interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \ + /*if (FFABS(c - e) > temporal_diff0)*/ { \ + i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ - } else { \ - interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] 
+ cur[prefs3])) >> 13; \ - } + } /*else*/ { \ + i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ + }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ #define FILTER_EDGE() \ if (spat) { \ @@ -111,7 +112,7 @@ typedef struct ThreadData { else if (interpol < d - diff) \ interpol = d - diff; \ \ - dst[0] = av_clip(interpol, 0, clip_max); \ + dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \ } \ \ dst++; \ @@ -122,7 +123,7 @@ typedef struct ThreadData { next2++; \ } -static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, +static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max) { uint8_t *dst = dst1; @@ -132,7 +133,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, FILTER_INTRA() } -static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, +static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max) @@ -150,7 +151,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } -static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, +static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int parity, int clip_max, int spat) { @@ -167,7 +168,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } -static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, +static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max) { uint16_t *dst = dst1; @@ -177,7 +178,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre FILTER_INTRA() } -static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, +static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max) @@ -195,7 +196,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1 FILTER2() } -static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, +static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int parity, int clip_max, int spat) { -- 2.43.0 From cb02e2f13450ebccb4a71d596a68903a1a77c538 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Jun 2023 13:07:55 +0000 Subject: [PATCH 128/157] v4l2m2m_dec: Fix h264 reorder size if no sps initially (cherry picked from commit 8832f7924bf47cbca0de251d7b406917f958ebf4) --- libavcodec/v4l2_m2m_dec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 13af62e819..11c83b2d66 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ 
b/libavcodec/v4l2_m2m_dec.c @@ -1024,8 +1024,8 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) if (sps) { avctx->profile = ff_h264_get_profile(sps); avctx->level = sps->level_idc; + s->reorder_size = sps->num_reorder_frames; } - s->reorder_size = sps->num_reorder_frames; } ff_h264_ps_uninit(&ps); break; -- 2.43.0 From 973f03ebac935f9c5abc96172f45b7696f1170ed Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 30 Jun 2023 18:03:29 +0000 Subject: [PATCH 129/157] sand_fns: Add missing uxtw for neon stride --- libavutil/aarch64/rpi_sand_neon.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S index 2f07d9674c..19411cf3f1 100644 --- a/libavutil/aarch64/rpi_sand_neon.S +++ b/libavutil/aarch64/rpi_sand_neon.S @@ -469,6 +469,7 @@ endfunc function ff_rpi_sand30_lines_to_planar_y16, export=1 lsl w4, w4, #7 sub w4, w4, #64 + uxtw x4, w4 sub w1, w1, w7, lsl #1 uxtw x6, w6 add x8, x2, x6, lsl #7 @@ -634,6 +635,7 @@ endfunc function ff_rpi_sand30_lines_to_planar_y8, export=1 lsl w4, w4, #7 sub w4, w4, #64 + uxtw x4, w4 sub w1, w1, w7 uxtw x6, w6 add x8, x2, x6, lsl #7 -- 2.43.0 From 22024a1907135ffdb613d5c8965d589024d3ac59 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 30 Jun 2023 18:12:16 +0000 Subject: [PATCH 130/157] sand_fns: Rework aarch64 neon sand30_lines_to_planar_c16 Previous version could overflow its write buffer on small buffers which sometimes crashed WPP_F_ericsson_MAIN10_2. This version is probably faster too --- libavutil/aarch64/rpi_sand_neon.S | 329 ++++++++++++++---------------- 1 file changed, 151 insertions(+), 178 deletions(-) diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S index 19411cf3f1..af7e2a88c4 100644 --- a/libavutil/aarch64/rpi_sand_neon.S +++ b/libavutil/aarch64/rpi_sand_neon.S @@ -248,199 +248,172 @@ incomplete_block_loop_end_c8: ret endfunc -//void ff_rpi_sand30_lines_to_planar_c16( -// uint8_t * dst_u, // [x0] -// unsigned int dst_stride_u, // [w1] == _w*2 -// uint8_t * dst_v, // [x2] -// unsigned int dst_stride_v, // [w3] == _w*2 -// const uint8_t * src, // [x4] -// unsigned int stride1, // [w5] == 128 -// unsigned int stride2, // [w6] -// unsigned int _x, // [w7] == 0 -// unsigned int y, // [sp, #0] == 0 -// unsigned int _w, // [sp, #8] -> w3 -// unsigned int h); // [sp, #16] -> w7 - -.macro rpi_sand30_lines_to_planar_c16_block_half - ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 - - xtn v4.4h, v0.4s - ushr v0.4s, v0.4s, #10 - xtn v5.4h, v0.4s - ushr v0.4s, v0.4s, #10 - xtn v6.4h, v0.4s - xtn2 v4.8h, v1.4s - ushr v1.4s, v1.4s, #10 - xtn2 v5.8h, v1.4s - ushr v1.4s, v1.4s, #10 - xtn2 v6.8h, v1.4s - and v4.16b, v4.16b, v16.16b - and v5.16b, v5.16b, v16.16b - and v6.16b, v6.16b, v16.16b - st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 - - xtn v4.4h, v2.4s - ushr v2.4s, v2.4s, #10 - xtn v5.4h, v2.4s - ushr v2.4s, v2.4s, #10 - xtn v6.4h, v2.4s - xtn2 v4.8h, v3.4s - ushr v3.4s, v3.4s, #10 - xtn2 v5.8h, v3.4s - ushr v3.4s, v3.4s, #10 - xtn2 v6.8h, v3.4s - and v4.16b, v4.16b, v16.16b - and v5.16b, v5.16b, v16.16b - and v6.16b, v6.16b, v16.16b - st3 { v4.8h, v5.8h, v6.8h }, [sp] - sub sp, sp, #48 -.endm - -function ff_rpi_sand30_lines_to_planar_c16, export=1 - stp x19, x20, [sp, #-48]! - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - - ldr w3, [sp, #48+8] // w3 = width - ldr w7, [sp, #48+16] // w7 = height - - // reserve space on the stack for intermediate results - sub sp, sp, #256 +// Unzip chroma +// +// On entry: +// a0 = V0, U2, ... 
+// a1 = U0, V1, ... +// a2 = U1, V2, ... +// b0 = V8, U10, ... +// b1 = U8, V9, ... +// b2 = U9, V10, ... +// +// On exit: +// d0 = U0, U3, ... +// ... +// a0 = V0, V3, .. +// ... +// +// Reg order for USAND is a1, a0, a2 (i.e. swap natural order of 1st 2 dest regs) - // number of 128byte blocks per row, w8 = width / 48 - mov w9, #48 - udiv w8, w3, w9 +.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2 + uzp1 \d0\().8h, \a1\().8h, \b1\().8h + uzp1 \d1\().8h, \a2\().8h, \b2\().8h + uzp2 \d2\().8h, \a0\().8h, \b0\().8h - // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 - mul w9, w8, w9 - sub w9, w3, w9 + uzp1 \a0\().8h, \a0\().8h, \b0\().8h + uzp2 \a1\().8h, \a1\().8h, \b1\().8h + uzp2 \a2\().8h, \a2\().8h, \b2\().8h +.endm - // row offset, the beginning of the next row to process - eor w10, w10, w10 +// SAND30 -> 10bit +.macro USAND10 d0, d1, d2, a0, a1 + shrn \d2\().4h, \a0\().4s, #14 + xtn \d0\().4h, \a0\().4s + shrn \d1\().4h, \a0\().4s, #10 - // offset to the beginning of the next block, w11 = stride2 * 128 - 128 - lsl w11, w6, #7 - sub w11, w11, #128 + shrn2 \d2\().8h, \a1\().4s, #14 + xtn2 \d0\().8h, \a1\().4s + shrn2 \d1\().8h, \a1\().4s, #10 - // decrease the height by one and in case of remaining pixels increase the block count by one - sub w7, w7, #1 - cmp w9, #0 - cset w19, ne // w19 == 1 iff reamining pixels != 0 - add w8, w8, w19 + ushr \d2\().8h, \d2\().8h, #6 + bic \d0\().8h, #0xfc, lsl #8 + bic \d1\().8h, #0xfc, lsl #8 +.endm - // bytes we have to move dst back by at the end of every row - mov w21, #48*2 - mul w21, w21, w8 - sub w21, w1, w21 +// void ff_rpi_sand30_lines_to_planar_c16( +// uint8_t * dst_u, // [x0] +// unsigned int dst_stride_u, // [w1] +// uint8_t * dst_v, // [x2] +// unsigned int dst_stride_v, // [w3] +// const uint8_t * src, // [x4] +// unsigned int stride1, // [w5] 128 +// unsigned int stride2, // [w6] +// unsigned int _x, // [w7] 0 +// unsigned int y, // [sp, #0] +// unsigned int _w, // [sp, #8] w9 +// unsigned int h); // [sp, #16] w10 - mov w20, #0 // w20 = flag, last row processed +function ff_rpi_sand30_lines_to_planar_c16, export=1 + ldr w7, [sp, #0] // y + ldr w8, [sp, #8] // _w + ldr w10, [sp, #16] // h + lsl w6, w6, #7 // Fixup stride2 + sub w6, w6, #64 + uxtw x6, w6 + sub w1, w1, w8, LSL #1 // Fixup chroma strides + sub w3, w3, w8, LSL #1 + lsl w7, w7, #7 // Add y to src + add x4, x4, w7, UXTW +10: + mov w13, #0 + mov x5, x4 + mov w9, w8 +1: + ld1 {v0.4s-v3.4s}, [x5], #64 + ld1 {v4.4s-v7.4s}, [x5], x6 - mov x12, #0x03ff03ff03ff03ff - dup v16.2d, x12 + USAND10 v17, v16, v18, v0, v1 + USAND10 v20, v19, v21, v2, v3 + UZPH_C v0, v1, v2, v16, v17, v18, v19, v20, v21 + USAND10 v23, v22, v24, v4, v5 + USAND10 v26, v25, v27, v6, v7 + UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 - // iterate through rows, row counter = w12 = 0 - eor w12, w12, w12 -row_loop_c16: - cmp w12, w7 - bge row_loop_c16_fin + subs w9, w9, #48 + blt 2f - // address of row data = src + row_offset - mov x13, x4 - add x13, x13, x10 + st3 {v0.8h-v2.8h}, [x0], #48 + st3 {v4.8h-v6.8h}, [x0], #48 + st3 {v16.8h-v18.8h}, [x2], #48 + st3 {v22.8h-v24.8h}, [x2], #48 - eor w14, w14, w14 -block_loop_c16: - cmp w14, w8 - bge block_loop_c16_fin - - rpi_sand30_lines_to_planar_c16_block_half - - ld2 { v0.8h, v1.8h }, [sp], #32 - ld2 { v2.8h, v3.8h }, [sp], #32 - ld2 { v4.8h, v5.8h }, [sp] - sub sp, sp, #64 - - st1 { v0.8h }, [x0], #16 - st1 { v2.8h }, [x0], #16 - st1 { v4.8h }, [x0], #16 - st1 { v1.8h }, [x2], #16 - st1 { v3.8h }, [x2], #16 - st1 { v5.8h }, [x2], #16 - - 
rpi_sand30_lines_to_planar_c16_block_half - - ld2 { v0.8h, v1.8h }, [sp], #32 - ld2 { v2.8h, v3.8h }, [sp], #32 - ld2 { v4.8h, v5.8h }, [sp] - sub sp, sp, #64 - - st1 { v0.8h }, [x0], #16 - st1 { v2.8h }, [x0], #16 - st1 { v4.8h }, [x0], #16 - st1 { v1.8h }, [x2], #16 - st1 { v3.8h }, [x2], #16 - st1 { v5.8h }, [x2], #16 - - add x13, x13, x11 // offset to next block - add w14, w14, #1 - b block_loop_c16 -block_loop_c16_fin: + bne 1b +11: + subs w10, w10, #1 + add x4, x4, #128 + add x0, x0, w1, UXTW + add x2, x2, w3, UXTW + bne 10b +99: + ret - add w10, w10, #128 - add w12, w12, #1 - add x0, x0, w21, sxtw // move dst pointers back by x21 - add x2, x2, w21, sxtw - b row_loop_c16 -row_loop_c16_fin: - - cmp w20, #1 - beq row_loop_c16_fin2 - mov w20, #1 - sub w8, w8, w19 // decrease block count by w19 - add w7, w7, #1 // increase height - b row_loop_c16 - -row_loop_c16_fin2: - sub x0, x0, w21, sxtw // readd x21 in case of the last row - sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels - - // last incomplete block to be finished - // read operations are fine, stride2 is more than large enough even if rem_pix is 0 - rpi_sand30_lines_to_planar_c16_block_half - ld2 { v0.8h, v1.8h }, [sp], #32 - ld2 { v2.8h, v3.8h }, [sp], #32 - ld2 { v4.8h, v5.8h }, [sp], #32 - rpi_sand30_lines_to_planar_c16_block_half - ld2 { v0.8h, v1.8h }, [sp], #32 - ld2 { v2.8h, v3.8h }, [sp], #32 - ld2 { v4.8h, v5.8h }, [sp] - sub sp, sp, #160 - - mov x4, sp - eor w20, w20, w20 -rem_pix_c16_loop: - cmp w20, w9 - bge rem_pix_c16_fin - - ldr w22, [x4], #4 - str w22, [x0], #2 - lsr w22, w22, #16 - str w22, [x2], #2 - - add w20, w20, #1 - b rem_pix_c16_loop -rem_pix_c16_fin: - - add sp, sp, #256 - - ldp x23, x24, [sp, #32] - ldp x21, x22, [sp, #16] - ldp x19, x20, [sp], #48 - ret +// Partial final write +2: + cmp w9, #24-48 + blt 1f + st3 {v0.8h - v2.8h}, [x0], #48 + st3 {v16.8h - v18.8h}, [x2], #48 + beq 11b + mov v0.16b, v4.16b + mov v1.16b, v5.16b + sub w9, w9, #24 + mov v2.16b, v6.16b + mov v16.16b, v22.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b +1: + cmp w9, #12-48 + blt 1f + st3 {v0.4h - v2.4h}, [x0], #24 + st3 {v16.4h - v18.4h}, [x2], #24 + beq 11b + mov v0.2d[0], v0.2d[1] + sub w9, w9, #12 + mov v1.2d[0], v1.2d[1] + mov v2.2d[0], v2.2d[1] + mov v16.2d[0], v16.2d[1] + mov v17.2d[0], v17.2d[1] + mov v18.2d[0], v18.2d[1] +1: + cmp w9, #6-48 + blt 1f + st3 {v0.h - v2.h}[0], [x0], #6 + st3 {v0.h - v2.h}[1], [x0], #6 + st3 {v16.h - v18.h}[0], [x2], #6 + st3 {v16.h - v18.h}[1], [x2], #6 + beq 11b + mov v0.s[0], v0.s[1] + sub w9, w9, #6 + mov v1.s[0], v1.s[1] + mov v2.s[0], v2.s[1] + mov v16.s[0], v16.s[1] + mov v17.s[0], v17.s[1] + mov v18.s[0], v18.s[1] +1: + cmp w9, #3-48 + blt 1f + st3 {v0.h - v2.h}[0], [x0], #6 + st3 {v16.h - v18.h}[0], [x2], #6 + beq 11b + mov v0.h[0], v0.h[1] + sub w9, w9, #3 + mov v1.h[0], v1.h[1] + mov v16.h[0], v16.h[1] + mov v17.h[0], v17.h[1] +1: + cmp w9, #2-48 + blt 1f + st2 {v0.h - v1.h}[0], [x0], #4 + st2 {v16.h - v17.h}[0], [x2], #4 + b 11b +1: + st1 {v0.h}[0], [x0], #2 + st1 {v16.h}[0], [x2], #2 + b 11b endfunc - //void ff_rpi_sand30_lines_to_planar_p010( // uint8_t * dest, // unsigned int dst_stride, -- 2.43.0 From 9b07cf08ccafa02bbfdc6aacec5a6fc71c059124 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 30 Jun 2023 19:41:06 +0000 Subject: [PATCH 131/157] sand_fns: Minor optimisations to aarch64 neon --- libavutil/aarch64/rpi_sand_neon.S | 140 ++++++------------------------ 1 file changed, 28 insertions(+), 112 deletions(-) diff --git 
a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S index af7e2a88c4..11658de0c8 100644 --- a/libavutil/aarch64/rpi_sand_neon.S +++ b/libavutil/aarch64/rpi_sand_neon.S @@ -279,18 +279,37 @@ endfunc // SAND30 -> 10bit .macro USAND10 d0, d1, d2, a0, a1 shrn \d2\().4h, \a0\().4s, #14 - xtn \d0\().4h, \a0\().4s shrn \d1\().4h, \a0\().4s, #10 shrn2 \d2\().8h, \a1\().4s, #14 - xtn2 \d0\().8h, \a1\().4s shrn2 \d1\().8h, \a1\().4s, #10 + uzp1 \d0\().8h, \a0\().8h, \a1\().8h ushr \d2\().8h, \d2\().8h, #6 bic \d0\().8h, #0xfc, lsl #8 bic \d1\().8h, #0xfc, lsl #8 .endm +// SAND30 -> 8bit +.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2 + shrn \d1\().4h, \a0\().4s, #12 + shrn2 \d1\().8h, \a1\().4s, #12 + uzp1 \d0\().8h, \a0\().8h, \a1\().8h + uzp2 \d2\().8h, \a0\().8h, \a1\().8h + + shrn \t1\().4h, \a2\().4s, #12 + shrn2 \t1\().8h, \a3\().4s, #12 + uzp1 \t0\().8h, \a2\().8h, \a3\().8h + uzp2 \t2\().8h, \a2\().8h, \a3\().8h + + shrn \d0\().8b, \d0\().8h, #2 + shrn2 \d0\().16b, \t0\().8h, #2 + shrn \d2\().8b, \d2\().8h, #6 + shrn2 \d2\().16b, \t2\().8h, #6 + uzp1 \d1\().16b, \d1\().16b, \t1\().16b +.endm + + // void ff_rpi_sand30_lines_to_planar_c16( // uint8_t * dst_u, // [x0] // unsigned int dst_stride_u, // [w1] @@ -322,6 +341,7 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 1: ld1 {v0.4s-v3.4s}, [x5], #64 ld1 {v4.4s-v7.4s}, [x5], x6 + subs w9, w9, #48 USAND10 v17, v16, v18, v0, v1 USAND10 v20, v19, v21, v2, v3 @@ -330,7 +350,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 USAND10 v26, v25, v27, v6, v7 UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 - subs w9, w9, #48 blt 2f st3 {v0.8h-v2.8h}, [x0], #48 @@ -457,61 +476,10 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 subs w5, w5, #96 - // v0, v1 - - shrn v18.4h, v0.4s, #14 - xtn v16.4h, v0.4s - shrn v17.4h, v0.4s, #10 - - shrn2 v18.8h, v1.4s, #14 - xtn2 v16.8h, v1.4s - shrn2 v17.8h, v1.4s, #10 - - ushr v18.8h, v18.8h, #6 - bic v16.8h, #0xfc, lsl #8 - bic v17.8h, #0xfc, lsl #8 - - // v2, v3 - - shrn v21.4h, v2.4s, #14 - xtn v19.4h, v2.4s - shrn v20.4h, v2.4s, #10 - - shrn2 v21.8h, v3.4s, #14 - xtn2 v19.8h, v3.4s - shrn2 v20.8h, v3.4s, #10 - - ushr v21.8h, v21.8h, #6 - bic v19.8h, #0xfc, lsl #8 - bic v20.8h, #0xfc, lsl #8 - - // v4, v5 - - shrn v24.4h, v4.4s, #14 - xtn v22.4h, v4.4s - shrn v23.4h, v4.4s, #10 - - shrn2 v24.8h, v5.4s, #14 - xtn2 v22.8h, v5.4s - shrn2 v23.8h, v5.4s, #10 - - ushr v24.8h, v24.8h, #6 - bic v22.8h, #0xfc, lsl #8 - bic v23.8h, #0xfc, lsl #8 - - // v6, v7 - - shrn v27.4h, v6.4s, #14 - xtn v25.4h, v6.4s - shrn v26.4h, v6.4s, #10 - - shrn2 v27.8h, v7.4s, #14 - xtn2 v25.8h, v7.4s - shrn2 v26.8h, v7.4s, #10 - - ushr v27.8h, v27.8h, #6 - bic v25.8h, #0xfc, lsl #8 - bic v26.8h, #0xfc, lsl #8 + USAND10 v16, v17, v18, v0, v1 + USAND10 v19, v20, v21, v2, v3 + USAND10 v22, v23, v24, v4, v5 + USAND10 v25, v26, v27, v6, v7 blt 2f @@ -624,60 +592,8 @@ function ff_rpi_sand30_lines_to_planar_y8, export=1 subs w5, w5, #96 // v0, v1 - - shrn v18.4h, v0.4s, #16 - xtn v16.4h, v0.4s - shrn v17.4h, v0.4s, #12 - - shrn2 v18.8h, v1.4s, #16 - xtn2 v16.8h, v1.4s - shrn2 v17.8h, v1.4s, #12 - - shrn v18.8b, v18.8h, #6 - shrn v16.8b, v16.8h, #2 - xtn v17.8b, v17.8h - - // v2, v3 - - shrn v21.4h, v2.4s, #16 - xtn v19.4h, v2.4s - shrn v20.4h, v2.4s, #12 - - shrn2 v21.8h, v3.4s, #16 - xtn2 v19.8h, v3.4s - shrn2 v20.8h, v3.4s, #12 - - shrn2 v18.16b, v21.8h, #6 - shrn2 v16.16b, v19.8h, #2 - xtn2 v17.16b, v20.8h - - // v4, v5 - - shrn v24.4h, v4.4s, #16 - xtn v22.4h, v4.4s - shrn v23.4h, v4.4s, #12 - - shrn2 
v24.8h, v5.4s, #16 - xtn2 v22.8h, v5.4s - shrn2 v23.8h, v5.4s, #12 - - shrn v21.8b, v24.8h, #6 - shrn v19.8b, v22.8h, #2 - xtn v20.8b, v23.8h - - // v6, v7 - - shrn v27.4h, v6.4s, #16 - xtn v25.4h, v6.4s - shrn v26.4h, v6.4s, #12 - - shrn2 v27.8h, v7.4s, #16 - xtn2 v25.8h, v7.4s - shrn2 v26.8h, v7.4s, #12 - - shrn2 v21.16b, v27.8h, #6 - shrn2 v19.16b, v25.8h, #2 - xtn2 v20.16b, v26.8h + USAND8 v16, v17, v18, v0, v1, v2, v3, v22, v23, v24 + USAND8 v19, v20, v21, v4, v5, v6, v7, v22, v23, v24 blt 2f -- 2.43.0 From d1e05d0a620142a5d42e50d6f4c23ff503447563 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sat, 1 Jul 2023 18:43:32 +0000 Subject: [PATCH 132/157] sand_fns: Add test for neon to sand30 fns so they can be tested by checkasm --- libavutil/rpi_sand_fns.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c index b6071e2928..0626bb06cb 100644 --- a/libavutil/rpi_sand_fns.c +++ b/libavutil/rpi_sand_fns.c @@ -35,10 +35,12 @@ Authors: John Cox #include "frame.h" #if ARCH_ARM && HAVE_NEON -#include "arm/rpi_sand_neon.h" +#include "libavutil/arm/cpu.h" +#include "libavutil/arm/rpi_sand_neon.h" #define HAVE_SAND_ASM 1 #elif ARCH_AARCH64 && HAVE_NEON -#include "aarch64/rpi_sand_neon.h" +#include "libavutil/aarch64/cpu.h" +#include "libavutil/aarch64/rpi_sand_neon.h" #define HAVE_SAND_ASM 1 #else #define HAVE_SAND_ASM 0 @@ -97,7 +99,7 @@ void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words #if HAVE_SAND_ASM - if (_x == 0) { + if (_x == 0 && have_neon(av_get_cpu_flags())) { ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); return; } @@ -163,7 +165,7 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words #if HAVE_SAND_ASM - if (_x == 0) { + if (_x == 0 && have_neon(av_get_cpu_flags())) { ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, src, stride1, stride2, _x, y, _w, h); return; -- 2.43.0 From 00ae586c9954f5885a2449f4c0ba5fc1a7eadcbb Mon Sep 17 00:00:00 2001 From: John Cox Date: Sat, 1 Jul 2023 18:43:57 +0000 Subject: [PATCH 133/157] checkasm: Add tests for rpi_sand sand30 fns Something of a kludge for function selection as, at the moment, the rpi_sand fns don't have a jump table that we could use for selection. 
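For reference, the SAND30 layout these tests exercise packs three 10-bit samples into each 32-bit word, low bits first. Below is a minimal C sketch of that packing and its inverse; pack30() mirrors the helper the new test adds, while unpack30() is illustrative only (it is the scalar equivalent of what the USAND10 macro introduced earlier in this series does 16 samples at a time).

    #include <stdint.h>

    /* Three 10-bit samples per 32-bit word, low bits first. */
    static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c)
    {
        return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20);
    }

    /* Scalar inverse of pack30(); illustrative only. */
    static inline void unpack30(uint32_t v, unsigned int out[3])
    {
        out[0] =  v        & 0x3ff;
        out[1] = (v >> 10) & 0x3ff;
        out[2] = (v >> 20) & 0x3ff;
    }
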
--- tests/checkasm/Makefile | 3 +- tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/rpi_sand.c | 118 ++++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 tests/checkasm/rpi_sand.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index a6f06c7007..66291baf33 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -59,8 +59,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS) AVUTILOBJS += av_tx.o AVUTILOBJS += fixed_dsp.o AVUTILOBJS += float_dsp.o +AVUTILOBJS-$(CONFIG_SAND) += rpi_sand.o -CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) +CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index e96d84a7da..57e0091b80 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -210,6 +210,9 @@ static const struct { { "fixed_dsp", checkasm_check_fixed_dsp }, { "float_dsp", checkasm_check_float_dsp }, { "av_tx", checkasm_check_av_tx }, + #if CONFIG_SAND + { "rpi_sand", checkasm_check_rpi_sand }, + #endif #endif { NULL } }; diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 8744a81218..f4a0d20358 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -73,6 +73,7 @@ void checkasm_check_motion(void); void checkasm_check_nlmeans(void); void checkasm_check_opusdsp(void); void checkasm_check_pixblockdsp(void); +void checkasm_check_rpi_sand(void); void checkasm_check_sbrdsp(void); void checkasm_check_synth_filter(void); void checkasm_check_sw_gbrp(void); diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c new file mode 100644 index 0000000000..0888714c4c --- /dev/null +++ b/tests/checkasm/rpi_sand.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2023 John Cox + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include "checkasm.h" +#include "libavutil/common.h" +#include "libavutil/rpi_sand_fns.h" + +#if ARCH_ARM +#include "libavutil/arm/cpu.h" +#include "libavutil/arm/rpi_sand_neon.h" +#elif ARCH_AARCH64 +#include "libavutil/aarch64/cpu.h" +#include "libavutil/aarch64/rpi_sand_neon.h" +#endif + +static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c) +{ + return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20); +} + +void checkasm_check_rpi_sand(void) +{ + const unsigned int w = 1280; + const unsigned int h = 66; + const unsigned int stride1 = 128; + const unsigned int stride2 = h*3/2; + const unsigned int ssize = ((w+95)/96)*128*h*3/2; + const unsigned int ysize = ((w + 32) * (h + 32) * 2); + + uint8_t * sbuf0 = malloc(ssize); + uint8_t * sbuf1 = malloc(ssize); + uint8_t * ybuf0 = malloc(ysize); + uint8_t * ybuf1 = malloc(ysize); + uint8_t * vbuf0 = malloc(ysize); + uint8_t * vbuf1 = malloc(ysize); + uint8_t * yframe0 = (w + 32) * 16 + ybuf0; + uint8_t * yframe1 = (w + 32) * 16 + ybuf1; + uint8_t * vframe0 = (w + 32) * 16 + vbuf0; + uint8_t * vframe1 = (w + 32) * 16 + vbuf1; + unsigned int i; + + for (i = 0; i != ssize; i += 4) + *(uint32_t*)(sbuf0 + i) = rnd(); + memcpy(sbuf1, sbuf0, ssize); + + if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) { + declare_func(void, uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + + memset(ybuf0, 0xbb, ysize); + memset(ybuf1, 0xbb, ysize); + + call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h); + call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); + + if (memcmp(sbuf0, sbuf1, ssize) + || memcmp(ybuf0, ybuf1, ysize)) + fail(); + + bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); + } + + if (check_func(have_neon(av_get_cpu_flags()) ? 
ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) { + declare_func(void, uint8_t * u_dst, const unsigned int u_stride, + uint8_t * v_dst, const unsigned int v_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + + memset(ybuf0, 0xbb, ysize); + memset(ybuf1, 0xbb, ysize); + memset(vbuf0, 0xbb, ysize); + memset(vbuf1, 0xbb, ysize); + + call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2); + call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); + + if (memcmp(sbuf0, sbuf1, ssize) + || memcmp(ybuf0, ybuf1, ysize) + || memcmp(vbuf0, vbuf1, ysize)) + fail(); + + bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); + } + + + report("sand30"); + + free(sbuf0); + free(sbuf1); + free(ybuf0); + free(ybuf1); + free(vbuf0); + free(vbuf1); +} + diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index a4e95541f5..6fda6d227e 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -27,6 +27,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \ fate-checkasm-motion \ fate-checkasm-opusdsp \ fate-checkasm-pixblockdsp \ + fate-checkasm-rpi_sand \ fate-checkasm-sbrdsp \ fate-checkasm-synth_filter \ fate-checkasm-sw_gbrp \ -- 2.43.0 From 8e8625274281f673d82dac953172519121cf803c Mon Sep 17 00:00:00 2001 From: James Darnley Date: Mon, 20 Feb 2023 20:55:08 +0100 Subject: [PATCH 134/157] avfilter/bwdif: move filter_line init to a dedicated function (cherry picked from commit b503b5a0cf80f38ecf4737c012b621b7e94f242a) --- libavfilter/bwdif.h | 3 ++- libavfilter/vf_bwdif.c | 13 +++++++++---- libavfilter/x86/vf_bwdif_init.c | 4 +--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h index 889ff772ed..5749345f78 100644 --- a/libavfilter/bwdif.h +++ b/libavfilter/bwdif.h @@ -37,6 +37,7 @@ typedef struct BWDIFContext { int parity, int clip_max, int spat); } BWDIFContext; -void ff_bwdif_init_x86(BWDIFContext *bwdif); +void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); +void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 09e68523bb..539fabbd46 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -341,7 +341,14 @@ static int config_props(AVFilterLink *link) yadif->csp = av_pix_fmt_desc_get(link->format); yadif->filter = filter; - if (yadif->csp->comp[0].depth > 8) { + ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth); + + return 0; +} + +av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) +{ + if (bit_depth > 8) { s->filter_intra = filter_intra_16bit; s->filter_line = filter_line_c_16bit; s->filter_edge = filter_edge_16bit; @@ -352,10 +359,8 @@ static int config_props(AVFilterLink *link) } #if ARCH_X86 - ff_bwdif_init_x86(s); + ff_bwdif_init_x86(s, bit_depth); #endif - - return 0; } diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c index e24e5cd9b1..ba7bc40c3d 100644 --- a/libavfilter/x86/vf_bwdif_init.c +++ b/libavfilter/x86/vf_bwdif_init.c @@ -42,11 +42,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); -av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) +av_cold void ff_bwdif_init_x86(BWDIFContext 
*bwdif, int bit_depth) { - YADIFContext *yadif = &bwdif->yadif; int cpu_flags = av_get_cpu_flags(); - int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth; if (bit_depth <= 8) { if (EXTERNAL_SSE2(cpu_flags)) -- 2.43.0 From f38be95f03b06a4b70cd66a641f8d65ccef3cd5a Mon Sep 17 00:00:00 2001 From: James Darnley Date: Mon, 20 Feb 2023 20:55:08 +0100 Subject: [PATCH 135/157] checkasm: add test for bwdif (cherry picked from commit 087faf8cac51e5e20a5f41b36b8d4c2705a10039) --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_bwdif.c | 84 +++++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 90 insertions(+) create mode 100644 tests/checkasm/vf_bwdif.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 66291baf33..2c80d8e661 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) # libavfilter tests AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o +AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 57e0091b80..4f983d7fbc 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -179,6 +179,9 @@ static const struct { #if CONFIG_BLEND_FILTER { "vf_blend", checkasm_check_blend }, #endif + #if CONFIG_BWDIF_FILTER + { "vf_bwdif", checkasm_check_vf_bwdif }, + #endif #if CONFIG_COLORSPACE_FILTER { "vf_colorspace", checkasm_check_colorspace }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index f4a0d20358..d69bc43999 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -83,6 +83,7 @@ void checkasm_check_utvideodsp(void); void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); void checkasm_check_vc1dsp(void); +void checkasm_check_vf_bwdif(void); void checkasm_check_vf_eq(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c new file mode 100644 index 0000000000..46224bb575 --- /dev/null +++ b/tests/checkasm/vf_bwdif.c @@ -0,0 +1,84 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include "checkasm.h" +#include "libavcodec/internal.h" +#include "libavfilter/bwdif.h" + +#define WIDTH 256 + +#define randomize_buffers(buf0, buf1, mask, count) \ + for (size_t i = 0; i < count; i++) \ + buf0[i] = buf1[i] = rnd() & mask + +#define BODY(type, depth) \ + do { \ + type prev0[9*WIDTH], prev1[9*WIDTH]; \ + type next0[9*WIDTH], next1[9*WIDTH]; \ + type cur0[9*WIDTH], cur1[9*WIDTH]; \ + type dst0[WIDTH], dst1[WIDTH]; \ + const int stride = WIDTH; \ + const int mask = (1< Date: Thu, 6 Jul 2023 13:56:18 +0000 Subject: [PATCH 136/157] Revert "vf_bwdif: Add attributes to ask for vectorization" This reverts commit 281250290ba5c2dcd8676e9a261050e65c10bcb7. Will be replaced by hand coded asm as on upstream --- libavfilter/vf_bwdif.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 539fabbd46..34e8c5e234 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -74,10 +74,10 @@ typedef struct ThreadData { int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ - {/*\ + \ if (!diff) { \ dst[0] = d; \ - } else {*/ + } else { #define SPAT_CHECK() \ int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ @@ -89,16 +89,15 @@ typedef struct ThreadData { diff = FFMAX3(diff, min, -max); #define FILTER_LINE() \ - int i1, i2; \ SPAT_CHECK() \ - /*if (FFABS(c - e) > temporal_diff0)*/ { \ - i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ + if (FFABS(c - e) > temporal_diff0) { \ + interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \ - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ - } /*else*/ { \ - i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ - }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ + } else { \ + interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ + } #define FILTER_EDGE() \ if (spat) { \ @@ -112,7 +111,7 @@ typedef struct ThreadData { else if (interpol < d - diff) \ interpol = d - diff; \ \ - dst[0] = !diff ? 
d : av_clip(interpol, 0, clip_max); \ + dst[0] = av_clip(interpol, 0, clip_max); \ } \ \ dst++; \ @@ -123,7 +122,7 @@ typedef struct ThreadData { next2++; \ } -static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, +static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max) { uint8_t *dst = dst1; @@ -133,7 +132,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restr FILTER_INTRA() } -static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, +static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max) @@ -151,7 +150,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *rest FILTER2() } -static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, +static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int parity, int clip_max, int spat) { @@ -168,7 +167,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restri FILTER2() } -static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, +static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max) { uint16_t *dst = dst1; @@ -178,7 +177,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void FILTER_INTRA() } -static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, +static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max) @@ -196,7 +195,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void FILTER2() } -static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, +static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int parity, int clip_max, int spat) { -- 2.43.0 From 35dc67e7291cc2f811749bd072905e504b24c12f Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:39 +0000 Subject: [PATCH 137/157] tests/checkasm: Add test for vf_bwdif filter_intra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit 7caa8d6b91e738ad2c1ea61746b6c062c470f7d3) --- tests/checkasm/vf_bwdif.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c index 46224bb575..034bbabb4c 100644 --- a/tests/checkasm/vf_bwdif.c +++ b/tests/checkasm/vf_bwdif.c @@ -20,6 +20,7 @@ #include "checkasm.h" #include "libavcodec/internal.h" #include "libavfilter/bwdif.h" +#include 
"libavutil/mem_internal.h" #define WIDTH 256 @@ -81,4 +82,40 @@ void checkasm_check_vf_bwdif(void) BODY(uint16_t, 10); report("bwdif10"); } + + if (check_func(ctx_8.filter_intra, "bwdif8.intra")) { + LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]); + LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]); + const int stride = WIDTH; + const int mask = (1<<8)-1; + + declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + + randomize_buffers( cur0, cur1, mask, 11*WIDTH); + memset(dst0, 0xba, WIDTH * 3); + memset(dst1, 0xba, WIDTH * 3); + + call_ref(dst0 + stride, + cur0 + stride * 4, WIDTH, + stride, -stride, stride * 3, -stride * 3, + 0, mask); + call_new(dst1 + stride, + cur0 + stride * 4, WIDTH, + stride, -stride, stride * 3, -stride * 3, + 0, mask); + + if (memcmp(dst0, dst1, WIDTH*3) + || memcmp( cur0, cur1, WIDTH*11)) + fail(); + + bench_new(dst1 + stride, + cur0 + stride * 4, WIDTH, + stride, -stride, stride * 3, -stride * 3, + 0, mask); + + report("bwdif8.intra"); + } } -- 2.43.0 From de129d7f86edd5cabab648dc7fd6d6063d79e0f8 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:40 +0000 Subject: [PATCH 138/157] avfilter/vf_bwdif: Add neon for filter_intra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an outline for aarch neon functions Adds common macros and consts for aarch64 neon Exports C filter_intra needed for tail fixup of neon code Adds neon for filter_intra Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit 5075cfb4e6a21f6b4da9e62bdb0bad4cb32a4673) --- libavfilter/aarch64/Makefile | 2 + libavfilter/aarch64/vf_bwdif_init_aarch64.c | 56 ++++++++ libavfilter/aarch64/vf_bwdif_neon.S | 136 ++++++++++++++++++++ libavfilter/bwdif.h | 4 + libavfilter/vf_bwdif.c | 8 +- 5 files changed, 203 insertions(+), 3 deletions(-) create mode 100644 libavfilter/aarch64/vf_bwdif_init_aarch64.c create mode 100644 libavfilter/aarch64/vf_bwdif_neon.S diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile index b58daa3a3f..b68209bc94 100644 --- a/libavfilter/aarch64/Makefile +++ b/libavfilter/aarch64/Makefile @@ -1,3 +1,5 @@ +OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o +NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c new file mode 100644 index 0000000000..3ffaa07ab3 --- /dev/null +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -0,0 +1,56 @@ +/* + * bwdif aarch64 NEON optimisations + * + * Copyright (c) 2023 John Cox + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "libavfilter/bwdif.h" +#include "libavutil/aarch64/cpu.h" + +void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + + +static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) +{ + const int w0 = clip_max != 255 ? 0 : w & ~15; + + ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); + + if (w0 < w) + ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0, + w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); +} + +void +ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) +{ + const int cpu_flags = av_get_cpu_flags(); + + if (bit_depth != 8) + return; + + if (!have_neon(cpu_flags)) + return; + + s->filter_intra = filter_intra_helper; +} + diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S new file mode 100644 index 0000000000..e288efbe6c --- /dev/null +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -0,0 +1,136 @@ +/* + * bwdif aarch64 NEON optimisations + * + * Copyright (c) 2023 John Cox + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#include "libavutil/aarch64/asm.S" + +// Space taken on the stack by an int (32-bit) +#ifdef __APPLE__ +.set SP_INT, 4 +#else +.set SP_INT, 8 +#endif + +.macro SQSHRUNN b, s0, s1, s2, s3, n + sqshrun \s0\().4h, \s0\().4s, #\n - 8 + sqshrun2 \s0\().8h, \s1\().4s, #\n - 8 + sqshrun \s1\().4h, \s2\().4s, #\n - 8 + sqshrun2 \s1\().8h, \s3\().4s, #\n - 8 + uzp2 \b\().16b, \s0\().16b, \s1\().16b +.endm + +.macro SMULL4K a0, a1, a2, a3, s0, s1, k + smull \a0\().4s, \s0\().4h, \k + smull2 \a1\().4s, \s0\().8h, \k + smull \a2\().4s, \s1\().4h, \k + smull2 \a3\().4s, \s1\().8h, \k +.endm + +.macro UMULL4K a0, a1, a2, a3, s0, s1, k + umull \a0\().4s, \s0\().4h, \k + umull2 \a1\().4s, \s0\().8h, \k + umull \a2\().4s, \s1\().4h, \k + umull2 \a3\().4s, \s1\().8h, \k +.endm + +.macro UMLAL4K a0, a1, a2, a3, s0, s1, k + umlal \a0\().4s, \s0\().4h, \k + umlal2 \a1\().4s, \s0\().8h, \k + umlal \a2\().4s, \s1\().4h, \k + umlal2 \a3\().4s, \s1\().8h, \k +.endm + +.macro UMLSL4K a0, a1, a2, a3, s0, s1, k + umlsl \a0\().4s, \s0\().4h, \k + umlsl2 \a1\().4s, \s0\().8h, \k + umlsl \a2\().4s, \s1\().4h, \k + umlsl2 \a3\().4s, \s1\().8h, \k +.endm + +.macro LDR_COEFFS d, t0 + movrel \t0, coeffs, 0 + ld1 {\d\().8h}, [\t0] +.endm + +// static const uint16_t coef_lf[2] = { 4309, 213 }; +// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 }; +// static const uint16_t coef_sp[2] = { 5077, 981 }; + +const coeffs, align=4 // align 4 means align on 2^4 boundry + .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0] + .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] + .hword 5077, 981 // sp[0] = v0.h[6] +endconst + +// ============================================================================ +// +// void ff_bwdif_filter_intra_neon( +// void *dst1, // x0 +// void *cur1, // x1 +// int w, // w2 +// int prefs, // w3 +// int mrefs, // w4 +// int prefs3, // w5 +// int mrefs3, // w6 +// int parity, // w7 unused +// int clip_max) // [sp, #0] unused + +function ff_bwdif_filter_intra_neon, export=1 + cmp w2, #0 + ble 99f + + LDR_COEFFS v0, x17 + +// for (x = 0; x < w; x++) { +10: + +// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; + ldr q31, [x1, w4, sxtw] + ldr q30, [x1, w3, sxtw] + ldr q29, [x1, w6, sxtw] + ldr q28, [x1, w5, sxtw] + + uaddl v20.8h, v31.8b, v30.8b + uaddl2 v21.8h, v31.16b, v30.16b + + UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6] + + uaddl v20.8h, v29.8b, v28.8b + uaddl2 v21.8h, v29.16b, v28.16b + + UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7] + +// dst[0] = av_clip(interpol, 0, clip_max); + SQSHRUNN v2, v2, v3, v4, v5, 13 + str q2, [x0], #16 + +// dst++; +// cur++; +// } + + subs w2, w2, #16 + add x1, x1, #16 + bgt 10b + +99: + ret +endfunc diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h index 5749345f78..ae6f6ce223 100644 --- a/libavfilter/bwdif.h +++ b/libavfilter/bwdif.h @@ -39,5 +39,9 @@ typedef struct BWDIFContext { void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); +void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); + +void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c 
b/libavfilter/vf_bwdif.c index 34e8c5e234..6ec8bbab5d 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -122,8 +122,8 @@ typedef struct ThreadData { next2++; \ } -static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) +void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) { uint8_t *dst = dst1; uint8_t *cur = cur1; @@ -352,13 +352,15 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) s->filter_line = filter_line_c_16bit; s->filter_edge = filter_edge_16bit; } else { - s->filter_intra = filter_intra; + s->filter_intra = ff_bwdif_filter_intra_c; s->filter_line = filter_line_c; s->filter_edge = filter_edge; } #if ARCH_X86 ff_bwdif_init_x86(s, bit_depth); +#elif ARCH_AARCH64 + ff_bwdif_init_aarch64(s, bit_depth); #endif } -- 2.43.0 From fd247ddb50d7206ea7ad894e96733dd60257eaf4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:41 +0000 Subject: [PATCH 139/157] tests/checkasm: Add test for vf_bwdif filter_edge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit 7ed7c00f55a50ac88589f9e17c172d4a4fce0581) --- tests/checkasm/vf_bwdif.c | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c index 034bbabb4c..5fdba09fdc 100644 --- a/tests/checkasm/vf_bwdif.c +++ b/tests/checkasm/vf_bwdif.c @@ -83,6 +83,60 @@ void checkasm_check_vf_bwdif(void) report("bwdif10"); } + { + LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]); + LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]); + const int stride = WIDTH; + const int mask = (1<<8)-1; + int spat; + int parity; + + for (spat = 0; spat != 2; ++spat) { + for (parity = 0; parity != 2; ++parity) { + if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) { + + declare_func(void, void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); + + randomize_buffers(prev0, prev1, mask, 11*WIDTH); + randomize_buffers(next0, next1, mask, 11*WIDTH); + randomize_buffers( cur0, cur1, mask, 11*WIDTH); + memset(dst0, 0xba, WIDTH * 3); + memset(dst1, 0xba, WIDTH * 3); + + call_ref(dst0 + stride, + prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH, + stride, -stride, stride * 2, -stride * 2, + parity, mask, spat); + call_new(dst1 + stride, + prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH, + stride, -stride, stride * 2, -stride * 2, + parity, mask, spat); + + if (memcmp(dst0, dst1, WIDTH*3) + || memcmp(prev0, prev1, WIDTH*11) + || memcmp(next0, next1, WIDTH*11) + || memcmp( cur0, cur1, WIDTH*11)) + fail(); + + bench_new(dst1 + stride, + prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH, + stride, -stride, stride * 2, -stride * 2, + parity, mask, spat); + } + } + } + + report("bwdif8.edge"); + } + if (check_func(ctx_8.filter_intra, "bwdif8.intra")) { LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); -- 2.43.0 From 
be3020e4b5044ef87e49838b35ecfd69b88f304c Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:42 +0000 Subject: [PATCH 140/157] avfilter/vf_bwdif: Add neon for filter_edge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds clip and spatial macros for aarch64 neon Exports C filter_edge needed for tail fixup of neon code Adds neon for filter_edge Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit 8130df83e0fbd3264fe990fb4e084ecbd452d0b1) --- libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 +++ libavfilter/aarch64/vf_bwdif_neon.S | 177 ++++++++++++++++++++ libavfilter/bwdif.h | 4 + libavfilter/vf_bwdif.c | 8 +- 4 files changed, 205 insertions(+), 4 deletions(-) diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c index 3ffaa07ab3..e75cf2f204 100644 --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -24,10 +24,29 @@ #include "libavfilter/bwdif.h" #include "libavutil/aarch64/cpu.h" +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); + void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) +{ + const int w0 = clip_max != 255 ? 0 : w & ~15; + + ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, + parity, clip_max, spat); + + if (w0 < w) + ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, + w - w0, prefs, mrefs, prefs2, mrefs2, + parity, clip_max, spat); +} + static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max) { @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) return; s->filter_intra = filter_intra_helper; + s->filter_edge = filter_edge_helper; } diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S index e288efbe6c..389302b813 100644 --- a/libavfilter/aarch64/vf_bwdif_neon.S +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -66,6 +66,79 @@ umlsl2 \a3\().4s, \s1\().8h, \k .endm +// int b = m2s1 - m1; +// int f = p2s1 - p1; +// int dc = c0s1 - m1; +// int de = c0s1 - p1; +// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1); +// sp_max = FFMIN(sp_max, FFMAX(-b,-f)); +// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1); +// sp_min = FFMIN(sp_min, FFMAX(b,f)); +// diff = diff == 0 ? 
0 : FFMAX3(diff, sp_min, sp_max); +.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3 + uqsub \t0\().16b, \p1\().16b, \c0s1\().16b + uqsub \t2\().16b, \m1\().16b, \c0s1\().16b + umin \t2\().16b, \t0\().16b, \t2\().16b + + uqsub \t1\().16b, \m1\().16b, \m2s1\().16b + uqsub \t3\().16b, \p1\().16b, \p2s1\().16b + umax \t3\().16b, \t3\().16b, \t1\().16b + umin \t3\().16b, \t3\().16b, \t2\().16b + + uqsub \t0\().16b, \c0s1\().16b, \p1\().16b + uqsub \t2\().16b, \c0s1\().16b, \m1\().16b + umin \t2\().16b, \t0\().16b, \t2\().16b + + uqsub \t1\().16b, \m2s1\().16b, \m1\().16b + uqsub \t0\().16b, \p2s1\().16b, \p1\().16b + umax \t0\().16b, \t0\().16b, \t1\().16b + umin \t2\().16b, \t2\().16b, \t0\().16b + + cmeq \t1\().16b, \diff\().16b, #0 + umax \diff\().16b, \diff\().16b, \t3\().16b + umax \diff\().16b, \diff\().16b, \t2\().16b + bic \diff\().16b, \diff\().16b, \t1\().16b +.endm + +// i0 = s0; +// if (i0 > d0 + diff0) +// i0 = d0 + diff0; +// else if (i0 < d0 - diff0) +// i0 = d0 - diff0; +// +// i0 = s0 is safe +.macro DIFF_CLIP i0, s0, d0, diff, t0, t1 + uqadd \t0\().16b, \d0\().16b, \diff\().16b + uqsub \t1\().16b, \d0\().16b, \diff\().16b + umin \i0\().16b, \s0\().16b, \t0\().16b + umax \i0\().16b, \i0\().16b, \t1\().16b +.endm + +// i0 = FFABS(m1 - p1) > td0 ? i1 : i2; +// DIFF_CLIP +// +// i0 = i1 is safe +.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2 + uabd \t0\().16b, \m1\().16b, \p1\().16b + cmhi \t0\().16b, \t0\().16b, \td0\().16b + bsl \t0\().16b, \i1\().16b, \i2\().16b + DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2 +.endm + +.macro PUSH_VREGS + stp d8, d9, [sp, #-64]! + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] +.endm + +.macro POP_VREGS + ldp d14, d15, [sp, #48] + ldp d12, d13, [sp, #32] + ldp d10, d11, [sp, #16] + ldp d8, d9, [sp], #64 +.endm + .macro LDR_COEFFS d, t0 movrel \t0, coeffs, 0 ld1 {\d\().8h}, [\t0] @@ -81,6 +154,110 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry .hword 5077, 981 // sp[0] = v0.h[6] endconst +// ============================================================================ +// +// void ff_bwdif_filter_edge_neon( +// void *dst1, // x0 +// void *prev1, // x1 +// void *cur1, // x2 +// void *next1, // x3 +// int w, // w4 +// int prefs, // w5 +// int mrefs, // w6 +// int prefs2, // w7 +// int mrefs2, // [sp, #0] +// int parity, // [sp, #SP_INT] +// int clip_max, // [sp, #SP_INT*2] unused +// int spat); // [sp, #SP_INT*3] + +function ff_bwdif_filter_edge_neon, export=1 + // Sanity check w + cmp w4, #0 + ble 99f + +// #define prev2 cur +// const uint8_t * restrict next2 = parity ? 
prev : next; + + ldr w8, [sp, #0] // mrefs2 + + ldr w17, [sp, #SP_INT] // parity + ldr w16, [sp, #SP_INT*3] // spat + cmp w17, #0 + csel x17, x1, x3, ne + +// for (x = 0; x < w; x++) { + +10: +// int m1 = cur[mrefs]; +// int d = (prev2[0] + next2[0]) >> 1; +// int p1 = cur[prefs]; +// int temporal_diff0 = FFABS(prev2[0] - next2[0]); +// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; +// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; +// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); + ldr q31, [x2] + ldr q21, [x17] + uhadd v16.16b, v31.16b, v21.16b // d0 = v16 + uabd v17.16b, v31.16b, v21.16b // td0 = v17 + ldr q24, [x2, w6, sxtw] // m1 = v24 + ldr q22, [x2, w5, sxtw] // p1 = v22 + + ldr q0, [x1, w6, sxtw] // prev[mrefs] + ldr q2, [x1, w5, sxtw] // prev[prefs] + ldr q1, [x3, w6, sxtw] // next[mrefs] + ldr q3, [x3, w5, sxtw] // next[prefs] + + ushr v29.16b, v17.16b, #1 + + uabd v31.16b, v0.16b, v24.16b + uabd v30.16b, v2.16b, v22.16b + uhadd v0.16b, v31.16b, v30.16b // td1 = q0 + + uabd v31.16b, v1.16b, v24.16b + uabd v30.16b, v3.16b, v22.16b + uhadd v1.16b, v31.16b, v30.16b // td2 = q1 + + umax v0.16b, v0.16b, v29.16b + umax v0.16b, v0.16b, v1.16b // diff = v0 + +// if (spat) { +// SPAT_CHECK() +// } +// i0 = (m1 + p1) >> 1; + cbz w16, 1f + + ldr q31, [x2, w8, sxtw] + ldr q18, [x17, w8, sxtw] + ldr q30, [x2, w7, sxtw] + ldr q19, [x17, w7, sxtw] + uhadd v18.16b, v18.16b, v31.16b + uhadd v19.16b, v19.16b, v30.16b + + SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28 + +1: + uhadd v2.16b, v22.16b, v24.16b + + // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30 + DIFF_CLIP v2, v2, v16, v0, v31, v30 + +// dst[0] = av_clip(interpol, 0, clip_max); + str q2, [x0], #16 + +// dst++; +// cur++; +// } + subs w4, w4, #16 + add x1, x1, #16 + add x2, x2, #16 + add x3, x3, #16 + add x17, x17, #16 + bgt 10b + +99: + ret +endfunc + // ============================================================================ // // void ff_bwdif_filter_intra_neon( diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h index ae6f6ce223..ae1616d366 100644 --- a/libavfilter/bwdif.h +++ b/libavfilter/bwdif.h @@ -41,6 +41,10 @@ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); +void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); + void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 6ec8bbab5d..688c2d2572 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -150,9 +150,9 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } -static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) +void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) { uint8_t *dst = dst1; uint8_t *prev = prev1; @@ -354,7 +354,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) } else { s->filter_intra = ff_bwdif_filter_intra_c; s->filter_line = 
filter_line_c; - s->filter_edge = filter_edge; + s->filter_edge = ff_bwdif_filter_edge_c; } #if ARCH_X86 -- 2.43.0 From d0c6dedf5c5db2cdd0635f34b98716cc9d944937 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:43 +0000 Subject: [PATCH 141/157] avfilter/vf_bwdif: Add neon for filter_line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exports C filter_line needed for tail fixup of neon code Adds neon for filter_line Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit 94cb94a2c0910d364a7181fc5cc0e9556b777d0a) --- libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ libavfilter/aarch64/vf_bwdif_neon.S | 203 ++++++++++++++++++++ libavfilter/bwdif.h | 5 + libavfilter/vf_bwdif.c | 10 +- 4 files changed, 234 insertions(+), 5 deletions(-) diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c index e75cf2f204..21e67884ab 100644 --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + + +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +{ + const int w0 = clip_max != 255 ? 0 : w & ~15; + + ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, + w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); + + if (w0 < w) + ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, + w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); +} static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) return; s->filter_intra = filter_intra_helper; + s->filter_line = filter_line_helper; s->filter_edge = filter_edge_helper; } diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S index 389302b813..f185e94e3c 100644 --- a/libavfilter/aarch64/vf_bwdif_neon.S +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -154,6 +154,209 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry .hword 5077, 981 // sp[0] = v0.h[6] endconst +// =========================================================================== +// +// void filter_line( +// void *dst1, // x0 +// void *prev1, // x1 +// void *cur1, // x2 +// void *next1, // x3 +// int w, // w4 +// int prefs, // w5 +// int mrefs, // w6 +// int prefs2, // w7 +// int mrefs2, // [sp, #0] +// int prefs3, // [sp, #SP_INT] +// int mrefs3, // [sp, #SP_INT*2] +// int prefs4, // [sp, #SP_INT*3] +// int mrefs4, // [sp, #SP_INT*4] +// int parity, // [sp, #SP_INT*5] +// int clip_max) // [sp, #SP_INT*6] + +function ff_bwdif_filter_line_neon, export=1 + // Sanity check w + cmp w4, #0 + ble 99f + + // Rearrange regs to be the same as line3 for ease of debug! 
+ mov w10, w4 // w10 = loop count + mov w9, w6 // w9 = mref + mov w12, w7 // w12 = pref2 + mov w11, w5 // w11 = pref + ldr w8, [sp, #0] // w8 = mref2 + ldr w7, [sp, #SP_INT*2] // w7 = mref3 + ldr w6, [sp, #SP_INT*4] // w6 = mref4 + ldr w13, [sp, #SP_INT] // w13 = pref3 + ldr w14, [sp, #SP_INT*3] // w14 = pref4 + + mov x4, x3 + mov x3, x2 + mov x2, x1 + + LDR_COEFFS v0, x17 + +// #define prev2 cur +// const uint8_t * restrict next2 = parity ? prev : next; + ldr w17, [sp, #SP_INT*5] // parity + cmp w17, #0 + csel x17, x2, x4, ne + + PUSH_VREGS + +// for (x = 0; x < w; x++) { +// int diff0, diff2; +// int d0, d2; +// int temporal_diff0, temporal_diff2; +// +// int i1, i2; +// int j1, j2; +// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; + +10: +// c0 = prev2[0] + next2[0]; // c0 = v20, v21 +// d0 = c0 >> 1; // d0 = v10 +// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 + ldr q31, [x3] + ldr q21, [x17] + uhadd v10.16b, v31.16b, v21.16b + uabd v11.16b, v31.16b, v21.16b + uaddl v20.8h, v21.8b, v31.8b + uaddl2 v21.8h, v21.16b, v31.16b + + ldr q31, [x3, w6, sxtw] + ldr q23, [x17, w6, sxtw] + +// i1 = coef_hf[0] * c0; // i1 = v2-v5 + UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] + + ldr q30, [x3, w14, sxtw] + ldr q25, [x17, w14, sxtw] + +// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 + uaddl v22.8h, v23.8b, v31.8b + uaddl2 v23.8h, v23.16b, v31.16b + +// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 + uhadd v12.16b, v25.16b, v30.16b + uaddl v24.8h, v25.8b, v30.8b + uaddl2 v25.8h, v25.16b, v30.16b + +// m3 = cur[mrefs3]; // m3 = v20 + ldr q20, [x3, w7, sxtw] + +// p3 = cur[prefs3]; // p3 = v21 + ldr q21, [x3, w13, sxtw] + +// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] + + ldr q29, [x3, w8, sxtw] + ldr q23, [x17, w8, sxtw] + +// i1 -= coef_lf[1] * 4 * (m3 + p3); // - + uaddl v30.8h, v20.8b, v21.8b + uaddl2 v31.8h, v20.16b, v21.16b + + UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] + + ldr q31, [x3, w12, sxtw] + ldr q27, [x17, w12, sxtw] + +// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 + uhadd v13.16b, v23.16b, v29.16b + uaddl v22.8h, v23.8b, v29.8b + uaddl2 v23.8h, v23.16b, v29.16b + +// m1 = cur[mrefs]; // m1 = v24 + ldr q24, [x3, w9, sxtw] + +// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 +// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 +// d2 = p2 >> 1; // d2 = v15 + uabd v14.16b, v31.16b, v27.16b + uhadd v15.16b, v31.16b, v27.16b + uaddl v26.8h, v27.8b, v31.8b + uaddl2 v27.8h, v27.16b, v31.16b + +// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] + +// p1 = cur[prefs]; // p1 = v22 + ldr q22, [x3, w11, sxtw] + +// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 + uaddl v18.8h, v22.8b, v24.8b + uaddl2 v19.8h, v22.16b, v24.16b + UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] + + uaddl v18.8h, v20.8b, v21.8b + uaddl2 v19.8h, v20.16b, v21.16b + UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] + + SQSHRUNN v17, v28, v29, v30, v31, 13 + +// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 + uaddl v26.8h, v24.8b, v22.8b + uaddl2 v27.8h, v24.16b, v22.16b + UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] + + ldr q31, [x2, w9, sxtw] + ldr q29, [x4, w9, sxtw] + + ldr q30, [x2, w11, sxtw] + ldr q28, [x4, w11, sxtw] + +// i1 >>= 15; // i1 = v2, -v3, -v4*, 
-v5* + SQSHRUNN v2, v2, v3, v4, v5, 15 + +// { +// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; +// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; + uabd v30.16b, v22.16b, v30.16b + uabd v31.16b, v24.16b, v31.16b + uabd v28.16b, v22.16b, v28.16b + uabd v29.16b, v24.16b, v29.16b + uhadd v31.16b, v31.16b, v30.16b + uhadd v29.16b, v29.16b, v28.16b + +// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 + ushr v18.16b, v11.16b, #1 + umax v18.16b, v18.16b, v31.16b + umax v18.16b, v18.16b, v29.16b + + // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 + SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 + + // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 + INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 + +// dst[0] = av_clip_uint8(interpol); + str q2, [x0], #16 +// } +// +// dst++; +// cur++; +// prev++; +// prev2++; +// next++; +// } + + subs w10, w10, #16 + add x2, x2, #16 + add x3, x3, #16 + add x4, x4, #16 + add x17, x17, #16 + bgt 10b + + POP_VREGS +99: + ret +endfunc + // ============================================================================ // // void ff_bwdif_filter_edge_neon( diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h index ae1616d366..cce99953f3 100644 --- a/libavfilter/bwdif.h +++ b/libavfilter/bwdif.h @@ -48,4 +48,9 @@ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); +void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 688c2d2572..2dc47f9614 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -132,10 +132,10 @@ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs FILTER_INTRA() } -static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) +void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) { uint8_t *dst = dst1; uint8_t *prev = prev1; @@ -353,7 +353,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) s->filter_edge = filter_edge_16bit; } else { s->filter_intra = ff_bwdif_filter_intra_c; - s->filter_line = filter_line_c; + s->filter_line = ff_bwdif_filter_line_c; s->filter_edge = ff_bwdif_filter_edge_c; } -- 2.43.0 From 8c103438c8790c2b181f10d0e252d513a5439047 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:44 +0000 Subject: [PATCH 142/157] avfilter/vf_bwdif: Add a filter_line3 method for optimisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an optional filter_line3 to the available optimisations. filter_line3 is equivalent to filter_line, memcpy, filter_line filter_line shares quite a number of loads and some calculations in common with its next iteration and testing shows that using aarch64 neon filter_line3s performance is 30% better than two filter_lines and a memcpy. 
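As a reading aid, the equivalence described above can be written as a small reference routine. The following is only an illustrative sketch, not code from this patch: filter_line_fn, filter_line3_ref and the reduced argument list are invented here for clarity, whereas the real callbacks carry the full prefs/mrefs offset set shown in the diff below.

    /* Sketch: what a filter_line3 callback must produce, expressed via a
     * per-line filter.  filter_line_fn is a simplified stand-in for the
     * existing filter_line callback. */
    #include <stdint.h>
    #include <string.h>

    typedef void (*filter_line_fn)(uint8_t *dst,
                                   const uint8_t *prev, const uint8_t *cur,
                                   const uint8_t *next,
                                   int w, int stride, int parity);

    static void filter_line3_ref(filter_line_fn filter_one_line,
                                 uint8_t *dst, int d_stride,
                                 const uint8_t *prev, const uint8_t *cur,
                                 const uint8_t *next, int s_stride,
                                 int w, int parity)
    {
        /* Row 0: interpolated output line. */
        filter_one_line(dst, prev, cur, next, w, s_stride, parity);

        dst  += d_stride;
        prev += s_stride;
        cur  += s_stride;
        next += s_stride;
        /* Row 1 belongs to the field that is already present, so it is a
         * plain copy of the current frame line. */
        memcpy(dst, cur, w);

        dst  += d_stride;
        prev += s_stride;
        cur  += s_stride;
        next += s_stride;
        /* Row 2: interpolated again, one source stride further down. */
        filter_one_line(dst, prev, cur, next, w, s_stride, parity);
    }

The NEON version fuses these three steps so that the source loads shared by the two filtered rows are issued only once, which is where the measured gain comes from.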
Adds a test for vf_bwdif filter_line3 to checkasm Rounds job start lines down to a multiple of 4. This means that if filter_line3 exists then filter_line will not sometimes be called once at the end of a slice depending on thread count. The final slice may do up to 3 extra lines but filter_edge is faster than filter_line so it is unlikely to create any noticable thread load variation. Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit 697533e76dbea8cc7fd6a0642bc60050cc05ead8) --- libavfilter/bwdif.h | 7 ++++ libavfilter/vf_bwdif.c | 44 +++++++++++++++++++-- tests/checkasm/vf_bwdif.c | 81 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 3 deletions(-) diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h index cce99953f3..496cec72ef 100644 --- a/libavfilter/bwdif.h +++ b/libavfilter/bwdif.h @@ -35,6 +35,9 @@ typedef struct BWDIFContext { void (*filter_edge)(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int prefs2, int mrefs2, int parity, int clip_max, int spat); + void (*filter_line3)(void *dst, int dstride, + const void *prev, const void *cur, const void *next, int prefs, + int w, int parity, int clip_max); } BWDIFContext; void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); @@ -53,4 +56,8 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line3_c(void * dst1, int d_stride, + const void * prev1, const void * cur1, const void * next1, int s_stride, + int w, int parity, int clip_max); + #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 2dc47f9614..9847d38b6a 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -150,6 +150,31 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } +#define NEXT_LINE()\ + dst += d_stride; \ + prev += prefs; \ + cur += prefs; \ + next += prefs; + +void ff_bwdif_filter_line3_c(void * dst1, int d_stride, + const void * prev1, const void * cur1, const void * next1, int s_stride, + int w, int parity, int clip_max) +{ + const int prefs = s_stride; + uint8_t * dst = dst1; + const uint8_t * prev = prev1; + const uint8_t * cur = cur1; + const uint8_t * next = next1; + + ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, + prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); + NEXT_LINE(); + memcpy(dst, cur, w); + NEXT_LINE(); + ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, + prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); +} + void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, int parity, int clip_max, int spat) @@ -212,6 +237,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } +// Round job start line down to multiple of 4 so that if filter_line3 exists +// and the frame is a multiple of 4 high then filter_line will never be called +static inline int job_start(const int jobnr, const int nb_jobs, const int h) +{ + return jobnr >= nb_jobs ? 
h : ((h * jobnr) / nb_jobs) & ~3; +} + static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { BWDIFContext *s = ctx->priv; @@ -221,8 +253,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1; int df = (yadif->csp->comp[td->plane].depth + 7) / 8; int refs = linesize / df; - int slice_start = (td->h * jobnr ) / nb_jobs; - int slice_end = (td->h * (jobnr+1)) / nb_jobs; + int slice_start = job_start(jobnr, nb_jobs, td->h); + int slice_end = job_start(jobnr + 1, nb_jobs, td->h); int y; for (y = slice_start; y < slice_end; y++) { @@ -244,6 +276,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) refs << 1, -(refs << 1), td->parity ^ td->tff, clip_max, (y < 2) || ((y + 3) > td->h) ? 0 : 1); + } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) { + s->filter_line3(dst, td->frame->linesize[td->plane], + prev, cur, next, linesize, td->w, + td->parity ^ td->tff, clip_max); + y += 2; } else { s->filter_line(dst, prev, cur, next, td->w, refs, -refs, refs << 1, -(refs << 1), @@ -280,7 +317,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, td.plane = i; ff_filter_execute(ctx, filter_slice, &td, NULL, - FFMIN(h, ff_filter_get_nb_threads(ctx))); + FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx))); } if (yadif->current_field == YADIF_FIELD_END) { yadif->current_field = YADIF_FIELD_NORMAL; @@ -347,6 +384,7 @@ static int config_props(AVFilterLink *link) av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) { + s->filter_line3 = 0; if (bit_depth > 8) { s->filter_intra = filter_intra_16bit; s->filter_line = filter_line_c_16bit; diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c index 5fdba09fdc..3399cacdf7 100644 --- a/tests/checkasm/vf_bwdif.c +++ b/tests/checkasm/vf_bwdif.c @@ -28,6 +28,10 @@ for (size_t i = 0; i < count; i++) \ buf0[i] = buf1[i] = rnd() & mask +#define randomize_overflow_check(buf0, buf1, mask, count) \ + for (size_t i = 0; i < count; i++) \ + buf0[i] = buf1[i] = (rnd() & 1) != 0 ? 
mask : 0; + #define BODY(type, depth) \ do { \ type prev0[9*WIDTH], prev1[9*WIDTH]; \ @@ -83,6 +87,83 @@ void checkasm_check_vf_bwdif(void) report("bwdif10"); } + if (!ctx_8.filter_line3) + ctx_8.filter_line3 = ff_bwdif_filter_line3_c; + + { + LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]); + LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]); + const int stride = WIDTH; + const int mask = (1<<8)-1; + int parity; + + for (parity = 0; parity != 2; ++parity) { + if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) { + + declare_func(void, void * dst1, int d_stride, + const void * prev1, const void * cur1, const void * next1, int prefs, + int w, int parity, int clip_max); + + randomize_buffers(prev0, prev1, mask, 11*WIDTH); + randomize_buffers(next0, next1, mask, 11*WIDTH); + randomize_buffers( cur0, cur1, mask, 11*WIDTH); + + call_ref(dst0, stride, + prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride, + WIDTH, parity, mask); + call_new(dst1, stride, + prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride, + WIDTH, parity, mask); + + if (memcmp(dst0, dst1, WIDTH*3) + || memcmp(prev0, prev1, WIDTH*11) + || memcmp(next0, next1, WIDTH*11) + || memcmp( cur0, cur1, WIDTH*11)) + fail(); + + bench_new(dst1, stride, + prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride, + WIDTH, parity, mask); + } + } + + // Use just 0s and ~0s to try to provoke bad cropping or overflow + // Parity makes no difference to this test so just test 0 + if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) { + + declare_func(void, void * dst1, int d_stride, + const void * prev1, const void * cur1, const void * next1, int prefs, + int w, int parity, int clip_max); + + randomize_overflow_check(prev0, prev1, mask, 11*WIDTH); + randomize_overflow_check(next0, next1, mask, 11*WIDTH); + randomize_overflow_check( cur0, cur1, mask, 11*WIDTH); + + call_ref(dst0, stride, + prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride, + WIDTH, 0, mask); + call_new(dst1, stride, + prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride, + WIDTH, 0, mask); + + if (memcmp(dst0, dst1, WIDTH*3) + || memcmp(prev0, prev1, WIDTH*11) + || memcmp(next0, next1, WIDTH*11) + || memcmp( cur0, cur1, WIDTH*11)) + fail(); + + // No point to benching + } + + report("bwdif8.line3"); + } + { LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]); LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]); -- 2.43.0 From d589bee58a2aa9cb5b41b8eb1a70c4d4c23b94c3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jul 2023 14:04:45 +0000 Subject: [PATCH 143/157] avfilter/vf_bwdif: Add neon for filter_line3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: John Cox Signed-off-by: Martin Storsjö (cherry picked from commit f00222e81f7d6a59d977fbb280d67989818e0ad2) --- libavfilter/aarch64/vf_bwdif_init_aarch64.c | 28 ++ libavfilter/aarch64/vf_bwdif_neon.S | 272 ++++++++++++++++++++ 2 files changed, 300 insertions(+) diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c index 21e67884ab..f52bc4b9b4 100644 --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -36,6 +36,33 @@ void 
ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line3_neon(void * dst1, int d_stride, + const void * prev1, const void * cur1, const void * next1, int s_stride, + int w, int parity, int clip_max); + + +static void filter_line3_helper(void * dst1, int d_stride, + const void * prev1, const void * cur1, const void * next1, int s_stride, + int w, int parity, int clip_max) +{ + // Asm works on 16 byte chunks + // If w is a multiple of 16 then all is good - if not then if width rounded + // up to nearest 16 will fit in both src & dst strides then allow the asm + // to write over the padding bytes as that is almost certainly faster than + // having to invoke the C version to clean up the tail. + const int w1 = FFALIGN(w, 16); + const int w0 = clip_max != 255 ? 0 : + d_stride <= w1 && s_stride <= w1 ? w : w & ~15; + + ff_bwdif_filter_line3_neon(dst1, d_stride, + prev1, cur1, next1, s_stride, + w0, parity, clip_max); + + if (w0 < w) + ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride, + (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride, + w - w0, parity, clip_max); +} static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, @@ -93,5 +120,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) s->filter_intra = filter_intra_helper; s->filter_line = filter_line_helper; s->filter_edge = filter_edge_helper; + s->filter_line3 = filter_line3_helper; } diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S index f185e94e3c..ae9aab20cd 100644 --- a/libavfilter/aarch64/vf_bwdif_neon.S +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -154,6 +154,278 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry .hword 5077, 981 // sp[0] = v0.h[6] endconst +// =========================================================================== +// +// void ff_bwdif_filter_line3_neon( +// void * dst1, // x0 +// int d_stride, // w1 +// const void * prev1, // x2 +// const void * cur1, // x3 +// const void * next1, // x4 +// int s_stride, // w5 +// int w, // w6 +// int parity, // w7 +// int clip_max); // [sp, #0] (Ignored) + +function ff_bwdif_filter_line3_neon, export=1 + // Sanity check w + cmp w6, #0 + ble 99f + + LDR_COEFFS v0, x17 + +// #define prev2 cur +// const uint8_t * restrict next2 = parity ? 
prev : next; + cmp w7, #0 + csel x17, x2, x4, ne + + // We want all the V registers - save all the ones we must + PUSH_VREGS + + // Some rearrangement of initial values for nice layout of refs in regs + mov w10, w6 // w10 = loop count + neg w9, w5 // w9 = mref + lsl w8, w9, #1 // w8 = mref2 + add w7, w9, w9, LSL #1 // w7 = mref3 + lsl w6, w9, #2 // w6 = mref4 + mov w11, w5 // w11 = pref + lsl w12, w5, #1 // w12 = pref2 + add w13, w5, w5, LSL #1 // w13 = pref3 + lsl w14, w5, #2 // w14 = pref4 + add w15, w5, w5, LSL #2 // w15 = pref5 + add w16, w14, w12 // w16 = pref6 + + lsl w5, w1, #1 // w5 = d_stride * 2 + +// for (x = 0; x < w; x++) { +// int diff0, diff2; +// int d0, d2; +// int temporal_diff0, temporal_diff2; +// +// int i1, i2; +// int j1, j2; +// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; + +10: +// c0 = prev2[0] + next2[0]; // c0 = v20, v21 +// d0 = c0 >> 1; // d0 = v10 +// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 + ldr q31, [x3] + ldr q21, [x17] + uhadd v10.16b, v31.16b, v21.16b + uabd v11.16b, v31.16b, v21.16b + uaddl v20.8h, v21.8b, v31.8b + uaddl2 v21.8h, v21.16b, v31.16b + + ldr q31, [x3, w6, sxtw] + ldr q23, [x17, w6, sxtw] + +// i1 = coef_hf[0] * c0; // i1 = v2-v5 + UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] + + ldr q30, [x3, w14, sxtw] + ldr q25, [x17, w14, sxtw] + +// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 + uaddl v22.8h, v23.8b, v31.8b + uaddl2 v23.8h, v23.16b, v31.16b + +// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 + uhadd v12.16b, v25.16b, v30.16b + uaddl v24.8h, v25.8b, v30.8b + uaddl2 v25.8h, v25.16b, v30.16b + +// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21) + add v20.8h, v20.8h, v24.8h + add v21.8h, v21.8h, v25.8h + SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5] + +// m3 = cur[mrefs3]; // m3 = v20 + ldr q20, [x3, w7, sxtw] + +// p3 = cur[prefs3]; // p3 = v21 + ldr q21, [x3, w13, sxtw] + +// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] + + ldr q29, [x3, w8, sxtw] + ldr q23, [x17, w8, sxtw] + +// i1 -= coef_lf[1] * 4 * (m3 + p3); // - + uaddl v30.8h, v20.8b, v21.8b + uaddl2 v31.8h, v20.16b, v21.16b + + ldr q28, [x3, w16, sxtw] + ldr q25, [x17, w16, sxtw] + + UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] + +// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 + uhadd v13.16b, v23.16b, v29.16b + uaddl v22.8h, v23.8b, v29.8b + uaddl2 v23.8h, v23.16b, v29.16b + + ldr q31, [x3, w12, sxtw] + ldr q27, [x17, w12, sxtw] + +// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25 + uaddl v24.8h, v25.8b, v28.8b + uaddl2 v25.8h, v25.16b, v28.16b + +// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) + add v24.8h, v24.8h, v22.8h + add v25.8h, v25.8h, v23.8h + UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] + +// m1 = cur[mrefs]; // m1 = v24 + ldr q24, [x3, w9, sxtw] + +// p5 = cur[prefs5]; // p5 = v25 + ldr q25, [x3, w15, sxtw] + +// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 +// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 +// d2 = p2 >> 1; // d2 = v15 + uabd v14.16b, v31.16b, v27.16b + uhadd v15.16b, v31.16b, v27.16b + uaddl v26.8h, v27.8b, v31.8b + uaddl2 v27.8h, v27.16b, v31.16b + +// j1 += coef_hf[0] * p2; // - + UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2] + +// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] + +// p1 = cur[prefs]; // p1 = 
v22 + ldr q22, [x3, w11, sxtw] + +// j1 -= coef_lf[1] * 4 * (m1 + p5); // - + uaddl v26.8h, v24.8b, v25.8b + uaddl2 v27.8h, v24.16b, v25.16b + UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1] + +// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16 + uaddl v18.8h, v22.8b, v21.8b + uaddl2 v19.8h, v22.16b, v21.16b + UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] + + uaddl v18.8h, v24.8b, v25.8b + uaddl2 v19.8h, v24.16b, v25.16b + UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] + + SQSHRUNN v16, v28, v29, v30, v31, 13 + +// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 + uaddl v18.8h, v22.8b, v24.8b + uaddl2 v19.8h, v22.16b, v24.16b + UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] + + uaddl v18.8h, v20.8b, v21.8b + uaddl2 v19.8h, v20.16b, v21.16b + UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] + + SQSHRUNN v17, v28, v29, v30, v31, 13 + +// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 + uaddl v26.8h, v24.8b, v22.8b + uaddl2 v27.8h, v24.16b, v22.16b + UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] + + ldr q31, [x2, w9, sxtw] + ldr q29, [x4, w9, sxtw] + +// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21 + uaddl v26.8h, v21.8b, v22.8b + uaddl2 v27.8h, v21.16b, v22.16b + UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0] + + ldr q30, [x2, w11, sxtw] + ldr q28, [x4, w11, sxtw] + +// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* + SQSHRUNN v2, v2, v3, v4, v5, 15 + +// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9* + SQSHRUNN v3, v6, v7, v8, v9, 15 + +// { +// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; +// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; + uabd v30.16b, v22.16b, v30.16b + uabd v31.16b, v24.16b, v31.16b + uabd v28.16b, v22.16b, v28.16b + uabd v29.16b, v24.16b, v29.16b + uhadd v31.16b, v31.16b, v30.16b + uhadd v29.16b, v29.16b, v28.16b + + ldr q27, [x2, w13, sxtw] + ldr q26, [x4, w13, sxtw] + +// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 + ushr v18.16b, v11.16b, #1 + umax v18.16b, v18.16b, v31.16b + umax v18.16b, v18.16b, v29.16b +// } // v28, v30 preserved for next block +// { // tdiff2 = v14 +// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1; +// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1; + uabd v31.16b, v21.16b, v27.16b + uabd v29.16b, v21.16b, v26.16b + uhadd v31.16b, v31.16b, v30.16b + uhadd v29.16b, v29.16b, v28.16b + +// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19 + ushr v19.16b, v14.16b, #1 + umax v19.16b, v19.16b, v31.16b + umax v19.16b, v19.16b, v29.16b +// } + + // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 + SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 + + // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12 + SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28 + + // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19 + INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29 + +// dst[d_stride * 2] = av_clip_uint8(interpol); + str q3, [x0, w5, sxtw] + +// dst[d_stride] = p1; + str q22, [x0, w1, sxtw] + + // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 + INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 + +// dst[0] = av_clip_uint8(interpol); + str q2, [x0], #16 +// } +// +// dst++; +// cur++; +// prev++; +// prev2++; +// next++; +// } + subs w10, w10, #16 + add x2, x2, #16 + add x3, x3, #16 + add x4, x4, #16 + add x17, x17, #16 + bgt 10b + + POP_VREGS +99: + ret +endfunc + // 
=========================================================================== // // void filter_line( -- 2.43.0 From c8b3bd4b16c7645058cd8c31ef01ecb96782423e Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 24 Jul 2023 16:39:06 +0100 Subject: [PATCH 144/157] weak_link: Fix ref count init (cherry picked from commit d6de45b15a0c96bfdc96bbc441963a60945e5eba) --- libavcodec/weak_link.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c index f234a985b9..5a79e89ed7 100644 --- a/libavcodec/weak_link.c +++ b/libavcodec/weak_link.c @@ -19,6 +19,7 @@ struct ff_weak_link_master * ff_weak_link_new(void * p) struct ff_weak_link_master * w = malloc(sizeof(*w)); if (!w) return NULL; + atomic_init(&w->ref_count, 0); w->ptr = p; if (pthread_rwlock_init(&w->lock, NULL)) { free(w); -- 2.43.0 From 3360c5dce7281b02c138e103046ccc9aeab87434 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 24 Jul 2023 17:28:06 +0100 Subject: [PATCH 145/157] v4l2_m2m: Check fd before attempting to close (fix valgrind warn) (cherry picked from commit befa42878d054d1fba53d5da14406faaae224daf) --- libavcodec/v4l2_m2m.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index 28d9ed4988..238ceea235 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -340,8 +340,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) ff_v4l2_context_release(&s->output); dmabufs_ctl_unref(&s->db_ctl); - close(s->fd); - s->fd = -1; + + if (s->fd != -1) { + close(s->fd); + s->fd = -1; + } s->self_ref = NULL; // This is only called on avctx close so after this point we don't have that -- 2.43.0 From ec4a50f849f9069c66f9efad81074d999258ef57 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 26 Jul 2023 16:29:39 +0000 Subject: [PATCH 146/157] v4l2_req_devscan: Fix udev leak (cherry picked from commit 53b17ffd8a8890ef483163f3c9b0f96b437303f1) --- libavcodec/v4l2_req_devscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c index cfa94d55c4..ee8527ba1f 100644 --- a/libavcodec/v4l2_req_devscan.c +++ b/libavcodec/v4l2_req_devscan.c @@ -437,12 +437,14 @@ int devscan_build(void * const dc, struct devscan **pscan) } udev_enumerate_unref(enumerate); + udev_unref(udev); *pscan = scan; return 0; fail: - udev_unref(udev); + if (udev) + udev_unref(udev); devscan_delete(&scan); return ret; } -- 2.43.0 From da7f874b0982a76f9b0f2c3805cfaec242bcf3de Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 26 Jul 2023 16:42:27 +0000 Subject: [PATCH 147/157] v4l2_m2m: Fix device_ref leak (cherry picked from commit bfea15c07b4301cd1208981c8f221e5e3a598b34) --- libavcodec/v4l2_m2m.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index 238ceea235..add64b8e63 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -338,6 +338,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) } ff_v4l2_context_release(&s->output); + av_buffer_unref(&s->device_ref); dmabufs_ctl_unref(&s->db_ctl); -- 2.43.0 From b97c2f9f412cc8c9f20e666e093386639d60249f Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 28 Jul 2023 16:10:01 +0000 Subject: [PATCH 148/157] v4l2_m2m_dec: Avoid structure init warnings when struct changes (cherry picked from commit 8a836af420ed8c8dba90e2fd88691bcaa0668f8a) --- libavcodec/v4l2_m2m_dec.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libavcodec/v4l2_m2m_dec.c 
b/libavcodec/v4l2_m2m_dec.c index 11c83b2d66..584e0b8825 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -1004,11 +1004,13 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) #if CONFIG_H264_DECODER case AV_CODEC_ID_H264: { - H264ParamSets ps = {{NULL}}; + H264ParamSets ps; int is_avc = 0; int nal_length_size = 0; int ret; + memset(&ps, 0, sizeof(ps)); + ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size, &ps, &is_avc, &nal_length_size, avctx->err_recognition, avctx); @@ -1034,12 +1036,15 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) #if CONFIG_HEVC_DECODER case AV_CODEC_ID_HEVC: { - HEVCParamSets ps = {{NULL}}; - HEVCSEI sei = {{{{0}}}}; + HEVCParamSets ps; + HEVCSEI sei; int is_nalff = 0; int nal_length_size = 0; int ret; + memset(&ps, 0, sizeof(ps)); + memset(&sei, 0, sizeof(sei)); + ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, &ps, &sei, &is_nalff, &nal_length_size, avctx->err_recognition, 0, avctx); -- 2.43.0 From 6f5778610980f02f0d5705e33e86a3dde45cd236 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sat, 22 Jul 2023 12:33:50 +0000 Subject: [PATCH 149/157] v4l2_m2m_dec: Avoid calling get_format if no V4L2 decoder device Move the get_format callback to after the decoder device has been found. This means that get_format will never be called if there is no chance that init will succeed which helps programs (such as VLC) that do significant processing in that callback to avoid it. It also means that the list of formats availible can actually represent reality. (cherry picked from commit 3b27cb41d7df73c054452fa49269988d4df32409) --- libavcodec/v4l2_context.c | 41 +++++++++++++ libavcodec/v4l2_context.h | 13 ++++ libavcodec/v4l2_m2m_dec.c | 122 ++++++++++++++++++++++++++++---------- 3 files changed, 145 insertions(+), 31 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index 79a31cf930..978a487ca9 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -1064,6 +1064,47 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout) return 0; } +// Return 0 terminated list of drm fourcc video formats for this context +// NULL if none found or error +// Returned list is malloced so must be freed +uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN) +{ + unsigned int i; + unsigned int n = 0; + unsigned int size = 0; + uint32_t * e = NULL; + *pN = 0; + + for (i = 0; i < 1024; ++i) { + struct v4l2_fmtdesc fdesc = { + .index = i, + .type = ctx->type + }; + + if (ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc)) + return e; + + if (n + 1 >= size) { + unsigned int newsize = (size == 0) ? 
16 : size * 2; + uint32_t * t = av_realloc(e, newsize * sizeof(*t)); + if (!t) + return e; + e = t; + size = newsize; + } + + e[n] = fdesc.pixelformat; + e[++n] = 0; + if (pN) + *pN = n; + } + + // If we've looped 1024 times we are clearly confused + *pN = 0; + av_free(e); + return NULL; +} + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) { struct v4l2_format_update fmt = { 0 }; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 5afed3e6ec..f4240f7ddd 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -151,6 +151,19 @@ int ff_v4l2_context_set_format(V4L2Context* ctx); */ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe); +/** + * Get the list of drm fourcc pixel formats for this context + * + * @param[in] ctx A pointer to a V4L2Context. See V4L2Context + * description for required variables. + * @param[in] pN A pointer to receive the number of formats + * found. May be NULL if not wanted. + * @return Pointer to malloced list of zero terminated formats, + * NULL if none or error. As list is malloced it must be + * freed. + */ +uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN); + /** * Releases a V4L2Context. * diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index 584e0b8825..c4f38cc24e 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -873,10 +873,9 @@ check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s) }; static int -check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) +check_size(AVCodecContext * const avctx, V4L2m2mContext * const s, const uint32_t fcc) { unsigned int i; - const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); const uint32_t w = avctx->coded_width; const uint32_t h = avctx->coded_height; @@ -1073,12 +1072,91 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) } } +static int +choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ + const V4L2m2mPriv * const priv = avctx->priv_data; + unsigned int fmts_n; + uint32_t *fmts = ff_v4l2_context_enum_drm_formats(&s->capture, &fmts_n); + enum AVPixelFormat *fmts2 = NULL; + enum AVPixelFormat t; + enum AVPixelFormat gf_pix_fmt; + unsigned int i; + unsigned int n = 0; + unsigned int pref_n = 1; + int rv = AVERROR(ENOENT); + + if (!fmts) + return AVERROR(ENOENT); + + if ((fmts2 = av_malloc(sizeof(*fmts2) * (fmts_n + 2))) == NULL) { + rv = AVERROR(ENOMEM); + goto error; + } + + // Filter for formats that are supported by ffmpeg and + // can accomodate the stream size + fmts2[n++] = AV_PIX_FMT_DRM_PRIME; + for (i = 0; i != fmts_n; ++i) { + const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO); + if (f == AV_PIX_FMT_NONE) + continue; + + if (check_size(avctx, s, fmts[i]) != 0) + continue; + + if (f == priv->pix_fmt) + pref_n = n; + fmts2[n++] = f; + } + fmts2[n] = AV_PIX_FMT_NONE; + + if (n < 2) { + av_log(avctx, AV_LOG_DEBUG, "%s: No usable formats found\n", __func__); + goto error; + } + + // Put preferred s/w format at the end - ff_get_format will put it in sw_pix_fmt + t = fmts2[n - 1]; + fmts2[n - 1] = fmts2[pref_n]; + fmts2[pref_n] = t; + + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); + av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", + avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), + avctx->coded_width, avctx->coded_height, + gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + + if (gf_pix_fmt == 
AV_PIX_FMT_NONE) + goto error; + + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + s->capture.av_pix_fmt = avctx->sw_pix_fmt; + s->output_drm = 1; + } + else { + avctx->pix_fmt = gf_pix_fmt; + s->capture.av_pix_fmt = gf_pix_fmt; + s->output_drm = 0; + } + + // Get format converts capture.av_pix_fmt back into a V4L2 format in the context + if ((rv = ff_v4l2_context_get_format(&s->capture, 0)) != 0) + goto error; + rv = ff_v4l2_context_set_format(&s->capture); + +error: + av_free(fmts2); + av_free(fmts); + return rv; +} + static av_cold int v4l2_decode_init(AVCodecContext *avctx) { V4L2Context *capture, *output; V4L2m2mContext *s; V4L2m2mPriv *priv = avctx->priv_data; - int gf_pix_fmt; int ret; av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); @@ -1122,28 +1200,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) capture->av_pix_fmt = avctx->pix_fmt; capture->min_buf_size = 0; - /* the client requests the codec to generate DRM frames: - * - data[0] will therefore point to the returned AVDRMFrameDescriptor - * check the ff_v4l2_buffer_to_avframe conversion function. - * - the DRM frame format is passed in the DRM frame descriptor layer. - * check the v4l2_get_drm_frame function. - */ - - avctx->sw_pix_fmt = avctx->pix_fmt; - gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); - av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", - avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), - avctx->coded_width, avctx->coded_height, - gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); - - if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - s->output_drm = 1; - } - else { - capture->av_pix_fmt = gf_pix_fmt; - s->output_drm = 0; - } + capture->av_pix_fmt = AV_PIX_FMT_NONE; + s->output_drm = 0; s->db_ctl = NULL; if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { @@ -1185,19 +1243,21 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } - if ((ret = v4l2_prepare_decoder(s)) < 0) - return ret; - if ((ret = get_quirks(avctx, s)) != 0) return ret; - if ((ret = check_size(avctx, s)) != 0) - return ret; - if ((ret = check_profile(avctx, s)) != 0) { av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile); return ret; } + + // Size check done as part of format filtering + if ((ret = choose_capture_format(avctx, s)) != 0) + return ret; + + if ((ret = v4l2_prepare_decoder(s)) < 0) + return ret; + return 0; } -- 2.43.0 From 6d7a3b2bd0d517bfb0b46d8641e9aafc26e273b1 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Aug 2023 06:26:35 +0000 Subject: [PATCH 150/157] v4l2_req_dmabufs: Fix crash on free if dmabuf imported Thanks to Ratchanan Srirattanamet for finding this --- libavcodec/v4l2_req_dmabufs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c index acc0366e76..017c3892a5 100644 --- a/libavcodec/v4l2_req_dmabufs.c +++ b/libavcodec/v4l2_req_dmabufs.c @@ -232,7 +232,8 @@ void dmabuf_free(struct dmabuf_h * dh) request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); #endif - dh->fns->buf_free(dh); + if (dh->fns != NULL && dh->fns->buf_free) + dh->fns->buf_free(dh); if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL) munmap(dh->mapptr, dh->size); -- 2.43.0 From 423fe12423d41de6af2897dbb7aeacd3b8fe2fdc Mon Sep 17 
00:00:00 2001 From: John Cox Date: Thu, 10 Aug 2023 06:34:47 +0000 Subject: [PATCH 151/157] aarch64/rgb2rgb_neon: Fix bgr24->yuv matrix read to flip correct way --- libswscale/aarch64/rgb2rgb_neon.S | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 476ca723a0..077d1dd593 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -92,15 +92,12 @@ endfunc function ff_rgb24toyv12_aarch64, export=1 ldr x15, [sp, #8] - ld1 {v3.s}[2], [x15], #4 - ld1 {v3.s}[1], [x15], #4 - ld1 {v3.s}[0], [x15], #4 - ld1 {v4.s}[2], [x15], #4 - ld1 {v4.s}[1], [x15], #4 - ld1 {v4.s}[0], [x15], #4 - ld1 {v5.s}[2], [x15], #4 - ld1 {v5.s}[1], [x15], #4 - ld1 {v5.s}[0], [x15] + ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[2], [x15] + mov v6.16b, v3.16b + mov v3.16b, v5.16b + mov v5.16b, v6.16b b 99f endfunc -- 2.43.0 From 50345e5817644354b14da7ba14d7a498d9c4b179 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Aug 2023 06:36:51 +0000 Subject: [PATCH 152/157] aarch64/rgb2rgb_neon: Add macros to make common code explicit --- libswscale/aarch64/rgb2rgb_neon.S | 276 ++++++++++-------------------- 1 file changed, 95 insertions(+), 181 deletions(-) diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 077d1dd593..0956800b41 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -78,6 +78,67 @@ function ff_interleave_bytes_neon, export=1 ret endfunc +// Expand rgb2 into r0+r1/g0+g1/b0+b1 +.macro XRGB3Y r0, g0, b0, r1, g1, b1, r2, g2, b2 + uxtl \r0\().8h, \r2\().8b + uxtl \g0\().8h, \g2\().8b + uxtl \b0\().8h, \b2\().8b + + uxtl2 \r1\().8h, \r2\().16b + uxtl2 \g1\().8h, \g2\().16b + uxtl2 \b1\().8h, \b2\().16b +.endm + +// Expand rgb2 into r0+r1/g0+g1/b0+b1 +// and pick every other el to put back into rgb2 for chroma +.macro XRGB3YC r0, g0, b0, r1, g1, b1, r2, g2, b2 + XRGB3Y \r0, \g0, \b0, \r1, \g1, \b1, \r2, \g2, \b2 + + bic \r2\().8h, #0xff, LSL #8 + bic \g2\().8h, #0xff, LSL #8 + bic \b2\().8h, #0xff, LSL #8 +.endm + +.macro SMLAL3 d0, d1, s0, s1, s2, c0, c1, c2 + smull \d0\().4s, \s0\().4h, \c0 + smlal \d0\().4s, \s1\().4h, \c1 + smlal \d0\().4s, \s2\().4h, \c2 + smull2 \d1\().4s, \s0\().8h, \c0 + smlal2 \d1\().4s, \s1\().8h, \c1 + smlal2 \d1\().4s, \s2\().8h, \c2 +.endm + +// d0 may be s0 +// s0, s2 corrupted +.macro SHRN_Y d0, s0, s1, s2, s3, k128h + shrn \s0\().4h, \s0\().4s, #12 + shrn2 \s0\().8h, \s1\().4s, #12 + add \s0\().8h, \s0\().8h, \k128h\().8h // +128 (>> 3 = 16) + sqrshrun \d0\().8b, \s0\().8h, #3 + shrn \s2\().4h, \s2\().4s, #12 + shrn2 \s2\().8h, \s3\().4s, #12 + add \s2\().8h, \s2\().8h, \k128h\().8h + sqrshrun2 \d0\().16b, v28.8h, #3 +.endm + +.macro SHRN_C d0, s0, s1, k128b + shrn \s0\().4h, \s0\().4s, #14 + shrn2 \s0\().8h, \s1\().4s, #14 + sqrshrn \s0\().8b, \s0\().8h, #1 + add \d0\().8b, \s0\().8b, \k128b\().8b // +128 +.endm + +.macro STB2V s0, n, a + st1 {\s0\().b}[(\n+0)], [\a], #1 + st1 {\s0\().b}[(\n+1)], [\a], #1 +.endm + +.macro STB4V s0, n, a + STB2V \s0, (\n+0), \a + STB2V \s0, (\n+2), \a +.endm + + // void ff_rgb24toyv12_aarch64( // const uint8_t *src, // x0 // uint8_t *ydst, // x1 @@ -111,7 +172,7 @@ endfunc // int lumStride, // w6 // int chromStride, // w7 // int srcStr, // [sp, #0] -// int32_t *rgb2yuv); // [sp, #8] +// int32_t *rgb2yuv); // [sp, #8] (including Mac) // regs // v0-2 Src bytes - reused as chroma src @@ 
-130,13 +191,12 @@ endfunc // v30 V out // v31 V tmp -// Assumes Little Endian in tail stores & conversion matrix - function ff_bgr24toyv12_aarch64, export=1 ldr x15, [sp, #8] ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 ld3 {v3.s, v4.s, v5.s}[2], [x15] + 99: ldr w14, [sp, #0] movi v7.8b, #128 @@ -167,73 +227,29 @@ function ff_bgr24toyv12_aarch64, export=1 b.le 13f 10: - uxtl v16.8h, v0.8b - uxtl v17.8h, v1.8b - uxtl v18.8h, v2.8b - - uxtl2 v20.8h, v0.16b - uxtl2 v21.8h, v1.16b - uxtl2 v22.8h, v2.16b - - bic v0.8h, #0xff, LSL #8 - bic v1.8h, #0xff, LSL #8 - bic v2.8h, #0xff, LSL #8 + XRGB3YC v16, v17, v18, v20, v21, v22, v0, v1, v2 // Testing shows it is faster to stack the smull/smlal ops together // rather than interleave them between channels and indeed even the // shift/add sections seem happier not interleaved // Y0 - smull v26.4s, v16.4h, v3.h[0] - smlal v26.4s, v17.4h, v4.h[0] - smlal v26.4s, v18.4h, v5.h[0] - smull2 v27.4s, v16.8h, v3.h[0] - smlal2 v27.4s, v17.8h, v4.h[0] - smlal2 v27.4s, v18.8h, v5.h[0] + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] // Y1 - smull v28.4s, v20.4h, v3.h[0] - smlal v28.4s, v21.4h, v4.h[0] - smlal v28.4s, v22.4h, v5.h[0] - smull2 v29.4s, v20.8h, v3.h[0] - smlal2 v29.4s, v21.8h, v4.h[0] - smlal2 v29.4s, v22.8h, v5.h[0] - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h - sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] + SHRN_Y v26, v26, v27, v28, v29, v6 // U // Vector subscript *2 as we loaded into S but are only using H - smull v24.4s, v0.4h, v3.h[2] - smlal v24.4s, v1.4h, v4.h[2] - smlal v24.4s, v2.4h, v5.h[2] - smull2 v25.4s, v0.8h, v3.h[2] - smlal2 v25.4s, v1.8h, v4.h[2] - smlal2 v25.4s, v2.8h, v5.h[2] + SMLAL3 v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2] // V - smull v30.4s, v0.4h, v3.h[4] - smlal v30.4s, v1.4h, v4.h[4] - smlal v30.4s, v2.4h, v5.h[4] - smull2 v31.4s, v0.8h, v3.h[4] - smlal2 v31.4s, v1.8h, v4.h[4] - smlal2 v31.4s, v2.8h, v5.h[4] + SMLAL3 v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4] ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 - shrn v24.4h, v24.4s, #14 - shrn2 v24.8h, v25.4s, #14 - sqrshrn v24.8b, v24.8h, #1 - add v24.8b, v24.8b, v7.8b // +128 - shrn v30.4h, v30.4s, #14 - shrn2 v30.8h, v31.4s, #14 - sqrshrn v30.8b, v30.8h, #1 - add v30.8b, v30.8b, v7.8b // +128 + SHRN_C v24, v24, v25, v7 + SHRN_C v30, v30, v31, v7 subs w9, w9, #16 @@ -250,69 +266,21 @@ function ff_bgr24toyv12_aarch64, export=1 13: // Body is simple copy of main loop body minus preload - uxtl v16.8h, v0.8b - uxtl v17.8h, v1.8b - uxtl v18.8h, v2.8b - - uxtl2 v20.8h, v0.16b - uxtl2 v21.8h, v1.16b - uxtl2 v22.8h, v2.16b - - bic v0.8h, #0xff, LSL #8 - bic v1.8h, #0xff, LSL #8 - bic v2.8h, #0xff, LSL #8 - + XRGB3YC v16, v17, v18, v20, v21, v22, v0, v1, v2 // Y0 - smull v26.4s, v16.4h, v3.h[0] - smlal v26.4s, v17.4h, v4.h[0] - smlal v26.4s, v18.4h, v5.h[0] - smull2 v27.4s, v16.8h, v3.h[0] - smlal2 v27.4s, v17.8h, v4.h[0] - smlal2 v27.4s, v18.8h, v5.h[0] + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] // Y1 - smull v28.4s, v20.4h, v3.h[0] - smlal v28.4s, v21.4h, v4.h[0] - smlal v28.4s, v22.4h, v5.h[0] - smull2 v29.4s, v20.8h, v3.h[0] - smlal2 v29.4s, v21.8h, v4.h[0] - smlal2 v29.4s, v22.8h, v5.h[0] - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - 
sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h - sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 - + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] + SHRN_Y v26, v26, v27, v28, v29, v6 // U - // Vector subscript *2 as we loaded into S but are only using H - smull v24.4s, v0.4h, v3.h[2] - smlal v24.4s, v1.4h, v4.h[2] - smlal v24.4s, v2.4h, v5.h[2] - smull2 v25.4s, v0.8h, v3.h[2] - smlal2 v25.4s, v1.8h, v4.h[2] - smlal2 v25.4s, v2.8h, v5.h[2] - + SMLAL3 v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2] // V - smull v30.4s, v0.4h, v3.h[4] - smlal v30.4s, v1.4h, v4.h[4] - smlal v30.4s, v2.4h, v5.h[4] - smull2 v31.4s, v0.8h, v3.h[4] - smlal2 v31.4s, v1.8h, v4.h[4] - smlal2 v31.4s, v2.8h, v5.h[4] + SMLAL3 v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4] cmp w9, #-16 - shrn v24.4h, v24.4s, #14 - shrn2 v24.8h, v25.4s, #14 - sqrshrn v24.8b, v24.8h, #1 - add v24.8b, v24.8b, v7.8b // +128 - shrn v30.4h, v30.4s, #14 - shrn2 v30.8h, v31.4s, #14 - sqrshrn v30.8b, v30.8h, #1 - add v30.8b, v30.8b, v7.8b // +128 + SHRN_C v24, v24, v25, v7 + SHRN_C v30, v30, v31, v7 // Here: // w9 == 0 width % 16 == 0, tail done @@ -347,14 +315,14 @@ function ff_bgr24toyv12_aarch64, export=1 2: tbz w9, #3, 1f st1 {v26.8b}, [x11], #8 - st1 {v24.s}[0], [x12], #4 - st1 {v30.s}[0], [x13], #4 + STB4V v24, 0, x12 + STB4V v30, 0, x13 1: tbz w9, #2, 1f - st1 {v26.s}[2], [x11], #4 - st1 {v24.h}[2], [x12], #2 - st1 {v30.h}[2], [x13], #2 + STB4V v26 8, x11 + STB2V v24, 4, x12 + STB2V v30, 4, x13 1: tbz w9, #1, 1f - st1 {v26.h}[6], [x11], #2 + STB2V v26, 12, x11 st1 {v24.b}[6], [x12], #1 st1 {v30.b}[6], [x13], #1 1: tbz w9, #0, 1f @@ -381,44 +349,15 @@ function ff_bgr24toyv12_aarch64, export=1 b.le 13f 10: - uxtl v16.8h, v0.8b - uxtl v17.8h, v1.8b - uxtl v18.8h, v2.8b - - uxtl2 v20.8h, v0.16b - uxtl2 v21.8h, v1.16b - uxtl2 v22.8h, v2.16b - - // Testing shows it is faster to stack the smull/smlal ops together - // rather than interleave them between channels and indeed even the - // shift/add sections seem happier not interleaved - + XRGB3Y v16, v17, v18, v20, v21, v22, v0, v1, v2 // Y0 - smull v26.4s, v16.4h, v3.h[0] - smlal v26.4s, v17.4h, v4.h[0] - smlal v26.4s, v18.4h, v5.h[0] - smull2 v27.4s, v16.8h, v3.h[0] - smlal2 v27.4s, v17.8h, v4.h[0] - smlal2 v27.4s, v18.8h, v5.h[0] + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] // Y1 - smull v28.4s, v20.4h, v3.h[0] - smlal v28.4s, v21.4h, v4.h[0] - smlal v28.4s, v22.4h, v5.h[0] - smull2 v29.4s, v20.8h, v3.h[0] - smlal2 v29.4s, v21.8h, v4.h[0] - smlal2 v29.4s, v22.8h, v5.h[0] + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h - sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 + SHRN_Y v26, v26, v27, v28, v29, v6 subs w9, w9, #16 @@ -433,40 +372,15 @@ function ff_bgr24toyv12_aarch64, export=1 13: // Body is simple copy of main loop body minus preload - uxtl v16.8h, v0.8b - uxtl v17.8h, v1.8b - uxtl v18.8h, v2.8b - - uxtl2 v20.8h, v0.16b - uxtl2 v21.8h, v1.16b - uxtl2 v22.8h, v2.16b - + XRGB3Y v16, v17, v18, v20, v21, v22, v0, v1, v2 // Y0 - smull v26.4s, v16.4h, v3.h[0] - smlal v26.4s, v17.4h, v4.h[0] - smlal v26.4s, v18.4h, v5.h[0] - smull2 v27.4s, v16.8h, v3.h[0] - smlal2 v27.4s, v17.8h, v4.h[0] - smlal2 v27.4s, v18.8h, v5.h[0] + SMLAL3 v26, v27, v16, v17, v18, 
v3.h[0], v4.h[0], v5.h[0] // Y1 - smull v28.4s, v20.4h, v3.h[0] - smlal v28.4s, v21.4h, v4.h[0] - smlal v28.4s, v22.4h, v5.h[0] - smull2 v29.4s, v20.8h, v3.h[0] - smlal2 v29.4s, v21.8h, v4.h[0] - smlal2 v29.4s, v22.8h, v5.h[0] + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] cmp w9, #-16 - shrn v26.4h, v26.4s, #12 - shrn2 v26.8h, v27.4s, #12 - add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) - sqrshrun v26.8b, v26.8h, #3 - shrn v28.4h, v28.4s, #12 - shrn2 v28.8h, v29.4s, #12 - add v28.8h, v28.8h, v6.8h - sqrshrun2 v26.16b, v28.8h, #3 - // Y0/Y1 + SHRN_Y v26, v26, v27, v28, v29, v6 // Here: // w9 == 0 width % 16 == 0, tail done @@ -500,9 +414,9 @@ function ff_bgr24toyv12_aarch64, export=1 tbz w9, #3, 1f st1 {v26.8b}, [x11], #8 1: tbz w9, #2, 1f - st1 {v26.s}[2], [x11], #4 + STB4V v26, 8, x11 1: tbz w9, #1, 1f - st1 {v26.h}[6], [x11], #2 + STB2V v26, 12, x11 1: tbz w9, #0, 1f st1 {v26.b}[14], [x11] 1: -- 2.43.0 From 0c910b7c4a71b9150da23aa7dd144f1406fd9b11 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Aug 2023 08:11:21 +0000 Subject: [PATCH 153/157] v4l2_req_media: Fix dmabuf fd leak in MMAP mode --- libavcodec/v4l2_req_media.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c index 1a9944774a..0394bb2b23 100644 --- a/libavcodec/v4l2_req_media.c +++ b/libavcodec/v4l2_req_media.c @@ -1205,8 +1205,10 @@ qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, .plane = i, .flags = O_RDWR, // *** Arguably O_RDONLY would be fine }; - if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) + if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) { be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length); + close(xbuf.fd); // dmabuf_import dups the fd so close this one + } } else { be->dh[i] = dmabuf_import_mmap( -- 2.43.0 From c0189f293ccd0449063768e37de227ddc720b523 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 6 Sep 2023 14:36:41 +0100 Subject: [PATCH 154/157] v4l2m2m_dec: Having calculated available pixfmt actually pass them to user --- libavcodec/v4l2_m2m_dec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index c4f38cc24e..f67dd23ba1 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -1099,6 +1099,7 @@ choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s) fmts2[n++] = AV_PIX_FMT_DRM_PRIME; for (i = 0; i != fmts_n; ++i) { const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO); + av_log(avctx, AV_LOG_TRACE, "VLC pix %s -> %s\n", av_fourcc2str(fmts[i]), av_get_pix_fmt_name(f)); if (f == AV_PIX_FMT_NONE) continue; @@ -1121,7 +1122,7 @@ choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s) fmts2[n - 1] = fmts2[pref_n]; fmts2[pref_n] = t; - gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); + gf_pix_fmt = ff_get_format(avctx, fmts2); av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), avctx->coded_width, avctx->coded_height, -- 2.43.0 From 708cf9dbfc0941dce0ec2aa39552570d47cdbb60 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 6 Sep 2023 14:45:16 +0100 Subject: [PATCH 155/157] v4l2m2m: Simplify reinit - also fixes fmt selection --- libavcodec/v4l2_context.c | 41 +++++++++++++++------------------------ 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c 
index 978a487ca9..ed126f8f2b 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -28,6 +28,7 @@ #include #include #include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" #include "libavcodec/avcodec.h" #include "decode.h" #include "v4l2_buffers.h" @@ -357,13 +358,23 @@ static int do_source_change(V4L2m2mContext * const s) s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); - av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", + av_log(avctx, AV_LOG_DEBUG, "Source change: Fmt: %s, SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", + av_fourcc2str(ff_v4l2_get_format_pixelformat(&cap_fmt)), s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, s->capture.width, s->capture.height, s->capture.selection.width, s->capture.selection.height, s->capture.selection.left, s->capture.selection.top, reinit); - if (reinit) { + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); + s->draining = 0; + + if (!reinit) { + /* Buffers are OK so just stream off to ack */ + av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); + } + else { if (avctx) ret = ff_set_dimensions(s->avctx, s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, @@ -371,11 +382,7 @@ static int do_source_change(V4L2m2mContext * const s) if (ret < 0) av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); - ret = ff_v4l2_m2m_codec_reinit(s); - if (ret) { - av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); - return AVERROR(EINVAL); - } + ff_v4l2_context_release(&s->capture); if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { @@ -388,26 +395,10 @@ static int do_source_change(V4L2m2mContext * const s) // Update pixel format - should only actually do something on initial change s->capture.av_pix_fmt = ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); - if (s->output_drm) { - avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; - avctx->sw_pix_fmt = s->capture.av_pix_fmt; - } - else - avctx->pix_fmt = s->capture.av_pix_fmt; - - goto reinit_run; + avctx->pix_fmt = s->output_drm ? 
AV_PIX_FMT_DRM_PRIME : s->capture.av_pix_fmt; + avctx->sw_pix_fmt = s->capture.av_pix_fmt; } - /* Buffers are OK so just stream off to ack */ - av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); - - ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); - if (ret) - av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); - s->draining = 0; - - /* reinit executed */ -reinit_run: ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); return 1; } -- 2.43.0 From 0b67bbd460b6a7fc701fbcc2ab9358fba497df65 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 8 Sep 2023 12:13:38 +0000 Subject: [PATCH 156/157] v4l2: Add (more) RGB formats to DRM & V4L2 --- libavcodec/v4l2_buffers.c | 33 +++++++++++++++++++++++++++++++++ libavcodec/v4l2_fmt.c | 8 ++++++++ 2 files changed, 41 insertions(+) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c index 8d80d19788..e7b5732216 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -390,6 +390,39 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) } switch (avbuf->context->av_pix_fmt) { + case AV_PIX_FMT_0BGR: + layer->format = DRM_FORMAT_RGBX8888; + break; + case AV_PIX_FMT_RGB0: + layer->format = DRM_FORMAT_XBGR8888; + break; + case AV_PIX_FMT_0RGB: + layer->format = DRM_FORMAT_BGRX8888; + break; + case AV_PIX_FMT_BGR0: + layer->format = DRM_FORMAT_XRGB8888; + break; + + case AV_PIX_FMT_ABGR: + layer->format = DRM_FORMAT_RGBA8888; + break; + case AV_PIX_FMT_RGBA: + layer->format = DRM_FORMAT_ABGR8888; + break; + case AV_PIX_FMT_ARGB: + layer->format = DRM_FORMAT_BGRA8888; + break; + case AV_PIX_FMT_BGRA: + layer->format = DRM_FORMAT_ARGB8888; + break; + + case AV_PIX_FMT_BGR24: + layer->format = DRM_FORMAT_BGR888; + break; + case AV_PIX_FMT_RGB24: + layer->format = DRM_FORMAT_RGB888; + break; + case AV_PIX_FMT_YUYV422: layer->format = DRM_FORMAT_YUYV; diff --git a/libavcodec/v4l2_fmt.c b/libavcodec/v4l2_fmt.c index 6df47e3f5a..c820a1d522 100644 --- a/libavcodec/v4l2_fmt.c +++ b/libavcodec/v4l2_fmt.c @@ -42,6 +42,14 @@ static const struct fmt_conversion { { AV_FMT(RGB24), AV_CODEC(RAWVIDEO), V4L2_FMT(RGB24) }, { AV_FMT(BGR0), AV_CODEC(RAWVIDEO), V4L2_FMT(BGR32) }, { AV_FMT(0RGB), AV_CODEC(RAWVIDEO), V4L2_FMT(RGB32) }, + { AV_FMT(BGR0), AV_CODEC(RAWVIDEO), V4L2_FMT(BGRX32) }, + { AV_FMT(RGB0), AV_CODEC(RAWVIDEO), V4L2_FMT(RGBX32) }, + { AV_FMT(0BGR), AV_CODEC(RAWVIDEO), V4L2_FMT(XBGR32) }, + { AV_FMT(0RGB), AV_CODEC(RAWVIDEO), V4L2_FMT(XRGB32) }, + { AV_FMT(BGRA), AV_CODEC(RAWVIDEO), V4L2_FMT(BGRA32) }, + { AV_FMT(RGBA), AV_CODEC(RAWVIDEO), V4L2_FMT(RGBA32) }, + { AV_FMT(ABGR), AV_CODEC(RAWVIDEO), V4L2_FMT(ABGR32) }, + { AV_FMT(ARGB), AV_CODEC(RAWVIDEO), V4L2_FMT(ARGB32) }, { AV_FMT(GRAY8), AV_CODEC(RAWVIDEO), V4L2_FMT(GREY) }, { AV_FMT(YUV420P), AV_CODEC(RAWVIDEO), V4L2_FMT(YUV420) }, { AV_FMT(YUYV422), AV_CODEC(RAWVIDEO), V4L2_FMT(YUYV) }, -- 2.43.0 From 6a0c34cdb44c698e84a428eadc246e281c71e39b Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 24 Oct 2023 12:54:02 +0100 Subject: [PATCH 157/157] dmabuf: Use vidbuf_cached for dmabuf allocation Gates usage to kernel 6.1.57 and later as that is when the rpivid iommu patch was merged. 
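To make the gate concrete, the version check reduces to a single integer comparison. The snippet below is only an illustrative sketch: KVER() and use_cached_vidbuf_heap() are names invented for this example; the patch itself compares mediabufs_ctl_driver_version() against MEDIABUFS_DRIVER_VERSION(6,1,57) as shown in the diff.

    /* Sketch: gating the cached vidbuf dma_heap on the driver's kernel
     * version, with major.minor.patch packed into one comparable integer. */
    #include <stdbool.h>
    #include <stdint.h>

    #define KVER(maj, min, pat)  (((uint32_t)(maj) << 16) | ((min) << 8) | (pat))

    static bool use_cached_vidbuf_heap(uint32_t driver_version)
    {
        /* rpivid IOMMU support was merged in 6.1.57; older kernels must
         * stay on the plain CMA heaps. */
        return driver_version >= KVER(6, 1, 57);
    }

    /* e.g. use_cached_vidbuf_heap(KVER(6, 1, 63)) -> true
     *      use_cached_vidbuf_heap(KVER(6, 1, 21)) -> false */
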
(cherry picked from commit 9a898f4ea127b30f1ca81eb98dfba3dd101db179) --- libavcodec/v4l2_req_dmabufs.c | 73 ++++++++++++++++++++++++++-------- libavcodec/v4l2_req_dmabufs.h | 1 + libavcodec/v4l2_request_hevc.c | 49 ++++++++++++----------- 3 files changed, 83 insertions(+), 40 deletions(-) diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c index 017c3892a5..9a4b69d3fa 100644 --- a/libavcodec/v4l2_req_dmabufs.c +++ b/libavcodec/v4l2_req_dmabufs.c @@ -15,11 +15,12 @@ #include "v4l2_req_dmabufs.h" #include "v4l2_req_utils.h" -#define DMABUF_NAME1 "/dev/dma_heap/linux,cma" -#define DMABUF_NAME2 "/dev/dma_heap/reserved" - #define TRACE_ALLOC 0 +#ifndef __O_CLOEXEC +#define __O_CLOEXEC 0 +#endif + struct dmabufs_ctl; struct dmabuf_h; @@ -297,23 +298,33 @@ struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc) // // Alloc dmabuf via CMA -static int ctl_cma_new(struct dmabufs_ctl * dbsc) +static int ctl_cma_new2(struct dmabufs_ctl * dbsc, const char * const * names) { - while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && - errno == EINTR) - /* Loop */; - - if (dbsc->fd == -1) { - while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && + for (; *names != NULL; ++names) + { + while ((dbsc->fd = open(*names, O_RDWR | __O_CLOEXEC)) == -1 && errno == EINTR) /* Loop */; - if (dbsc->fd == -1) { - request_log("Unable to open either %s or %s\n", - DMABUF_NAME1, DMABUF_NAME2); - return -1; + if (dbsc->fd != -1) + { + request_debug(NULL, "%s: Using dma_heap device %s\n", __func__, *names); + return 0; } + request_debug(NULL, "%s: Not using dma_heap device %s: %s\n", __func__, *names, strerror(errno)); } - return 0; + request_log("Unable to open any dma_heap device\n"); + return -1; +} + +static int ctl_cma_new(struct dmabufs_ctl * dbsc) +{ + static const char * const names[] = { + "/dev/dma_heap/linux,cma", + "/dev/dma_heap/reserved", + NULL + }; + + return ctl_cma_new2(dbsc, names); } static void ctl_cma_free(struct dmabufs_ctl * dbsc) @@ -321,7 +332,6 @@ static void ctl_cma_free(struct dmabufs_ctl * dbsc) if (dbsc->fd != -1) while (close(dbsc->fd) == -1 && errno == EINTR) /* loop */; - } static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size) @@ -347,6 +357,10 @@ static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, dh->fd = data.fd; dh->size = (size_t)data.len; + +// fprintf(stderr, "%s: size=%#zx, ftell=%#zx\n", __func__, +// dh->size, (size_t)lseek(dh->fd, 0, SEEK_END)); + return 0; } @@ -364,7 +378,32 @@ static const struct dmabuf_fns dmabuf_cma_fns = { struct dmabufs_ctl * dmabufs_ctl_new(void) { - request_debug(NULL, "Dmabufs using CMA\n");; + request_debug(NULL, "Dmabufs using CMA\n"); return dmabufs_ctl_new2(&dmabuf_cma_fns); } +static int ctl_cma_new_vidbuf_cached(struct dmabufs_ctl * dbsc) +{ + static const char * const names[] = { + "/dev/dma_heap/vidbuf_cached", + "/dev/dma_heap/linux,cma", + "/dev/dma_heap/reserved", + NULL + }; + + return ctl_cma_new2(dbsc, names); +} + +static const struct dmabuf_fns dmabuf_vidbuf_cached_fns = { + .buf_alloc = buf_cma_alloc, + .buf_free = buf_cma_free, + .ctl_new = ctl_cma_new_vidbuf_cached, + .ctl_free = ctl_cma_free, +}; + +struct dmabufs_ctl * dmabufs_ctl_new_vidbuf_cached(void) +{ + request_debug(NULL, "Dmabufs using Vidbuf\n"); + return dmabufs_ctl_new2(&dmabuf_vidbuf_cached_fns); +} + diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h index 381ba2708d..8c1ab0b5df 100644 --- a/libavcodec/v4l2_req_dmabufs.h +++ 
@@ -7,6 +7,7 @@ struct dmabufs_ctl;
 struct dmabuf_h;
 
 struct dmabufs_ctl * dmabufs_ctl_new(void);
+struct dmabufs_ctl * dmabufs_ctl_new_vidbuf_cached(void);
 void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc);
 struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc);
 
diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
index db7ed13b6d..5b37319d6a 100644
--- a/libavcodec/v4l2_request_hevc.c
+++ b/libavcodec/v4l2_request_hevc.c
@@ -176,17 +176,6 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
     av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
            decdev_media_path(decdev), decdev_video_path(decdev));
 
-    if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
-        av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
-        src_memtype = MEDIABUFS_MEMORY_MMAP;
-        dst_memtype = MEDIABUFS_MEMORY_MMAP;
-    }
-    else {
-        av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
-        src_memtype = MEDIABUFS_MEMORY_DMABUF;
-        dst_memtype = MEDIABUFS_MEMORY_DMABUF;
-    }
-
     if ((ctx->pq = pollqueue_new()) == NULL) {
         av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
         goto fail1;
@@ -202,6 +191,25 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
         goto fail3;
     }
 
+    // Version test for functional Pi5 HEVC iommu.
+    // rpivid kernel patch was merged in 6.1.57
+    // *** Remove when it is unlikely that there are any broken kernels left
+    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(6,1,57))
+        ctx->dbufs = dmabufs_ctl_new_vidbuf_cached();
+    else
+        ctx->dbufs = dmabufs_ctl_new();
+
+    if (ctx->dbufs == NULL) {
+        av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
+        src_memtype = MEDIABUFS_MEMORY_MMAP;
+        dst_memtype = MEDIABUFS_MEMORY_MMAP;
+    }
+    else {
+        av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
+        src_memtype = MEDIABUFS_MEMORY_DMABUF;
+        dst_memtype = MEDIABUFS_MEMORY_DMABUF;
+    }
+
     // Ask for an initial bitbuf size of max size / 4
     // We will realloc if we need more
     // Must use sps->h/w as avctx contains cropped size
@@ -229,23 +237,15 @@ retry_src_memtype:
         goto fail4;
     }
 
-    if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
-        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
+    if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0)
         ctx->fns = &V2(ff_v4l2_req_hevc, 4);
-    }
 #if CONFIG_V4L2_REQ_HEVC_VX
-    else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
-        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
+    else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0)
         ctx->fns = &V2(ff_v4l2_req_hevc, 3);
-    }
-    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
-        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
+    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0)
         ctx->fns = &V2(ff_v4l2_req_hevc, 2);
-    }
-    else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
-        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
+    else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0)
         ctx->fns = &V2(ff_v4l2_req_hevc, 1);
-    }
 #endif
     else {
         av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
@@ -253,6 +253,9 @@ retry_src_memtype:
         goto fail4;
     }
 
+    av_log(avctx, AV_LOG_DEBUG, "%s probed successfully: driver v %#x\n",
+           ctx->fns->name, mediabufs_ctl_driver_version(ctx->mbufs));
+
     if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
         char tbuf1[5];
         av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
"Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); -- 2.43.0