diff --git a/config/path b/config/path
index 0bfa8e49da1..f9832e29a34 100644
--- a/config/path
+++ b/config/path
@@ -163,7 +163,7 @@ SED="sed -i"
 . config/optimize
 
 if [ -z "$CCACHE_DIR" ]; then
-    export CCACHE_DIR=$HOME/.ccache-openelec
+    export CCACHE_DIR=$HOME/.ccache-openelec-$OS_VERSION
 fi
 export MAKEFLAGS=-j$CONCURRENCY_MAKE_LEVEL
 export PKG_CONFIG=$ROOT/$TOOLCHAIN/bin/pkg-config
diff --git a/config/version b/config/version
index ca30912d59f..2c718ded003 100644
--- a/config/version
+++ b/config/version
@@ -5,5 +5,5 @@
   OS_VERSION="8.0"
 
 # ADDON_VERSION: Addon version
-  ADDON_VERSION="8.1"
+  ADDON_VERSION="8.2"
 
diff --git a/distributions/OpenELEC/options b/distributions/OpenELEC/options
index daf5322c55f..45cb1086bd2 100644
--- a/distributions/OpenELEC/options
+++ b/distributions/OpenELEC/options
@@ -75,7 +75,7 @@
 # for a list of additinoal drivers see packages/linux-drivers
 # Space separated list is supported,
 # e.g. ADDITIONAL_DRIVERS="DRIVER1 DRIVER2"
-  ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU mt7610u" # todo: dvbhdhomerun
+  ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU mt7610u libhdhomerun"
 
 # build and install bluetooth support (yes / no)
   BLUETOOTH_SUPPORT="yes"
@@ -145,11 +145,6 @@
 # Windowmanager to use (ratpoison / fluxbox / none)
   WINDOWMANAGER="fluxbox"
 
-# Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
-# Space separated list is supported,
-# e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
-  GRAPHIC_DRIVERS="r200 r300 r600 radeonsi i915 i965 nvidia nvidia-legacy"
-
 # build and install remote support (yes / no)
   REMOTE_SUPPORT="yes"
 
diff --git a/packages/addons/depends/graphics/glfw/package.mk b/packages/addons/depends/graphics/glfw/package.mk
index f8c2d0d7890..8da18cf9f6b 100644
--- a/packages/addons/depends/graphics/glfw/package.mk
+++ b/packages/addons/depends/graphics/glfw/package.mk
@@ -37,9 +37,10 @@ if [ ! "$OPENGL" = "mesa" ] ; then
 fi
 
 make_target() {
-  make x11 PREFIX=$SYSROOT_PREFIX/usr
+  make lib/x11/Makefile.x11  # top-level target; presumably generates lib/x11/Makefile.x11 - confirm against the glfw tree
+  make -C lib/x11 -f Makefile.x11 PREFIX=$SYSROOT_PREFIX/usr libglfw.a  # build only the libglfw.a target inside lib/x11
 }
 
 makeinstall_target() {
-  make x11-install PREFIX=$SYSROOT_PREFIX/usr
+  make -C lib/x11 -f Makefile.x11 PREFIX=$SYSROOT_PREFIX/usr install  # run the install target of lib/x11/Makefile.x11 with the sysroot as PREFIX
 }
diff --git a/packages/addons/depends/multimedia/vdr/patches/vdr-40_caid-buffer-v3.patch b/packages/addons/depends/multimedia/vdr/patches/vdr-40_caid-buffer-v3.patch
new file mode 100644
index 00000000000..e695ae0fe8c
--- /dev/null
+++ b/packages/addons/depends/multimedia/vdr/patches/vdr-40_caid-buffer-v3.patch
@@ -0,0 +1,153 @@
+Description: dynamically resize buffer for caids
+Forwarded: yes
+Author: Lars Hanisch <dvb@flensrocker.de>
+
+diff --git a/ci.c b/ci.c
+index ffc7ff7..8bfae23 100644
+--- a/ci.c
++++ b/ci.c
+@@ -25,6 +25,8 @@
+ #include "skins.h"
+ #include "tools.h"
+ 
++#define CAID_BUFSIZE 1024
++
+ // Set these to 'true' for debug output:
+ static bool DumpTPDUDataTransfer = false;
+ static bool DebugProtocol = false;
+@@ -763,9 +765,12 @@ private:
+   int transponder;
+   int programNumber;
+   int caSystemIds[MAXCASYSTEMIDS + 1]; // list is zero terminated!
++  uint8_t *caDescriptors;
++  int caBufSize;
+   void AddCaDescriptors(int Length, const uint8_t *Data);
+ public:
+   cCiCaPmt(uint8_t CmdId, int Source, int Transponder, int ProgramNumber, const int *CaSystemIds);
++  ~cCiCaPmt(void);
+   uint8_t CmdId(void) { return cmdId; }
+   void SetListManagement(uint8_t ListManagement);
+   uint8_t ListManagement(void) { return capmt[0]; }
+@@ -784,8 +789,15 @@ cCiCaPmt::cCiCaPmt(uint8_t CmdId, int Source, int Transponder, int ProgramNumber
+          caSystemIds[i] = CaSystemIds[i];
+      }
+   caSystemIds[i] = 0;
+-  uint8_t caDescriptors[512];
+-  int caDescriptorsLength = GetCaDescriptors(source, transponder, programNumber, caSystemIds, sizeof(caDescriptors), caDescriptors, 0);
++  caBufSize = CAID_BUFSIZE;
++  caDescriptors = new uint8_t[caBufSize];
++  int caDescriptorsLength = GetCaDescriptors(source, transponder, programNumber, caSystemIds, caBufSize, caDescriptors, 0);
++  if (caDescriptorsLength < 0) {
++     delete [] caDescriptors;
++     caBufSize = -caDescriptorsLength + 8;
++     caDescriptors = new uint8_t[caBufSize];
++     caDescriptorsLength = GetCaDescriptors(source, transponder, programNumber, caSystemIds, caBufSize, caDescriptors, 0);
++     }
+   length = 0;
+   capmt[length++] = CPLM_ONLY;
+   capmt[length++] = (ProgramNumber >> 8) & 0xFF;
+@@ -797,6 +809,11 @@ cCiCaPmt::cCiCaPmt(uint8_t CmdId, int Source, int Transponder, int ProgramNumber
+   AddCaDescriptors(caDescriptorsLength, caDescriptors);
+ }
+ 
++cCiCaPmt::~cCiCaPmt(void)
++{
++  delete [] caDescriptors; // releases the new[] buffer from the constructor (AddPid() may have replaced it with a larger one)
++}
++
+ void cCiCaPmt::SetListManagement(uint8_t ListManagement)
+ {
+   capmt[0] = ListManagement;
+@@ -805,21 +822,34 @@ void cCiCaPmt::SetListManagement(uint8_t ListManagement)
+ void cCiCaPmt::AddPid(int Pid, uint8_t StreamType)
+ {
+   if (Pid) {
+-     uint8_t caDescriptors[512];
+-     int caDescriptorsLength = GetCaDescriptors(source, transponder, programNumber, caSystemIds, sizeof(caDescriptors), caDescriptors, Pid);
+-     //XXX buffer overflow check???
+-     capmt[length++] = StreamType;
+-     capmt[length++] = (Pid >> 8) & 0xFF;
+-     capmt[length++] =  Pid       & 0xFF;
+-     esInfoLengthPos = length;
+-     capmt[length++] = 0x00; // ES_info_length H (at ES level)
+-     capmt[length++] = 0x00; // ES_info_length L
+-     AddCaDescriptors(caDescriptorsLength, caDescriptors);
++     int caDescriptorsLength = GetCaDescriptors(source, transponder, programNumber, caSystemIds, caBufSize, caDescriptors, Pid);
++     if (caDescriptorsLength < 0) { // negative result: per pat.h, -(needed size) when the buffer was too small
++       delete [] caDescriptors;
++       caBufSize = -caDescriptorsLength + 8; // reported size plus 8 spare bytes
++       caDescriptors = new uint8_t[caBufSize]; // member buffer is replaced; the destructor frees it
++       caDescriptorsLength = GetCaDescriptors(source, transponder, programNumber, caSystemIds, caBufSize, caDescriptors, Pid); // retry with the larger buffer
++       }
++     if (length + 5 < int(sizeof(capmt))) { // the five bytes written below must fit into capmt
++        capmt[length++] = StreamType;
++        capmt[length++] = (Pid >> 8) & 0xFF;
++        capmt[length++] =  Pid       & 0xFF;
++        esInfoLengthPos = length;
++        capmt[length++] = 0x00; // ES_info_length H (at ES level)
++        capmt[length++] = 0x00; // ES_info_length L
++        AddCaDescriptors(caDescriptorsLength, caDescriptors); // a still-negative length is rejected inside AddCaDescriptors()
++        }
++     else
++        esyslog("ERROR: buffer overflow in CA descriptor");
+      }
+ }
+ 
+ void cCiCaPmt::AddCaDescriptors(int Length, const uint8_t *Data)
+ {
++  if (Length < 0) {
++     dsyslog("DEBUG: calling AddCaDescriptors with Length %d", Length);
++     return;
++     }
++
+   if (esInfoLengthPos) {
+      if (length + Length < int(sizeof(capmt))) {
+         if (Length || cmdId == CPCI_QUERY) {
+diff --git a/pat.c b/pat.c
+index 98d306e..9dfbc62 100644
+--- a/pat.c
++++ b/pat.c
+@@ -165,21 +165,25 @@ int cCaDescriptors::GetCaDescriptors(const int *CaSystemIds, int BufSize, uchar
+      return 0;
+   if (BufSize > 0 && Data) {
+      int length = 0;
++     bool tooSmall = false;
+      for (cCaDescriptor *d = caDescriptors.First(); d; d = caDescriptors.Next(d)) {
+          if (EsPid < 0 || d->EsPid() == EsPid) {
+             const int *caids = CaSystemIds;
+             do {
+                if (*caids == 0xFFFF || d->CaSystem() == *caids) {
+-                  if (length + d->Length() <= BufSize) {
++                  if (length + d->Length() <= BufSize)
+                      memcpy(Data + length, d->Data(), d->Length());
+-                     length += d->Length();
+-                     }
+                   else
+-                     return -1;
++                     tooSmall = true;
++                  length += d->Length();
+                   }
+                } while (*++caids);
+             }
+          }
++     if (tooSmall) {
++        dsyslog("DEBUG: buffer for ca-descriptors too small (%d, needed %d)", BufSize, length);
++        return -length;
++        }
+      return length;
+      }
+   return -1;
+diff --git a/pat.h b/pat.h
+index 19e60dc..8bf0738 100644
+--- a/pat.h
++++ b/pat.h
+@@ -45,7 +45,7 @@ int GetCaDescriptors(int Source, int Transponder, int ServiceId, const int *CaSy
+          ///< are copied that match one of the given CA system IDs (or all of them, if CaSystemIds
+          ///< is 0xFFFF).
+          ///< Returns the number of bytes copied into Data (0 if no CA descriptors are
+-         ///< available), or -1 if BufSize was too small to hold all CA descriptors.
++         ///< available), or -(NeededBufSize) if BufSize was too small to hold all CA descriptors.
+ 
+ int GetCaPids(int Source, int Transponder, int ServiceId, const int *CaSystemIds, int BufSize, int *Pids);
+          ///< Gets all CA pids for a given channel.
+
diff --git a/packages/addons/official/debug/htop/changelog.txt b/packages/addons/official/debug/htop/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/debug/htop/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/debug/htop/icon/icon.png b/packages/addons/official/debug/htop/icon/icon.png
new file mode 100644
index 00000000000..07a2f447960
Binary files /dev/null and b/packages/addons/official/debug/htop/icon/icon.png differ
diff --git a/packages/addons/official/debug/htop/package.mk b/packages/addons/official/debug/htop/package.mk
new file mode 100644
index 00000000000..9f84979469c
--- /dev/null
+++ b/packages/addons/official/debug/htop/package.mk
@@ -0,0 +1,61 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="htop"
+PKG_VERSION="2.0.2"
+PKG_REV="0"
+PKG_ARCH="any"
+PKG_LICENSE="GPL"
+PKG_SITE="http://htop.sourceforge.net/"
+PKG_URL="http://hisham.hm/htop/releases/$PKG_VERSION/$PKG_NAME-$PKG_VERSION.tar.gz"
+PKG_DEPENDS_TARGET="toolchain netbsd-curses"
+PKG_PRIORITY="optional"
+PKG_SECTION="debug/tools"
+PKG_SHORTDESC="htop: Htop is an ncurses based interactive process viewer for Linux."
+PKG_LONGDESC="Htop is an ncurses based interactive process viewer for Linux."
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script"
+PKG_ADDON_PROVIDES=""
+
+PKG_AUTORECONF="yes"
+
+PKG_CONFIGURE_OPTS_TARGET="ac_cv_lib_curses_refresh=yes \
+                           --enable-cgroup \
+                           --disable-vserver \
+                           --disable-unicode \
+                           --enable-proc \
+                           --disable-hwloc \
+                           --with-gnu-ld"
+
+pre_configure_target() {
+# htop fails to build in subdirs: configure in the source tree and drop the unused target build dir
+  cd $ROOT/$PKG_BUILD
+    rm -rf .$TARGET_NAME
+
+  export LIBS="-lncurses -lterminfo"  # extra link libs for configure; presumably provided by netbsd-curses (PKG_DEPENDS_TARGET) - confirm
+}
+
+makeinstall_target() {
+  : # intentionally a no-op: nothing is installed here, addon() copies the htop binary into the addon
+}
+
+addon() {
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin  # create the addon's bin directory
+    cp -P $PKG_BUILD/htop $ADDON_BUILD/$PKG_ADDON_ID/bin  # htop binary from the top of the build tree; -P copies symlinks as links
+}
diff --git a/packages/addons/official/debug/htop/patches/htop-curses.patch b/packages/addons/official/debug/htop/patches/htop-curses.patch
new file mode 100644
index 00000000000..b3b2584df2a
--- /dev/null
+++ b/packages/addons/official/debug/htop/patches/htop-curses.patch
@@ -0,0 +1,26 @@
+diff -Naur htop-2.0.2/configure.ac htop-2.0.2.patch/configure.ac
+--- htop-2.0.2/configure.ac	2016-07-21 21:54:31.000000000 +0200
++++ htop-2.0.2.patch/configure.ac	2017-01-02 13:49:04.712274123 +0100
+@@ -191,9 +191,10 @@
+       HTOP_CHECK_LIB([ncursesw6], [addnwstr], [HAVE_LIBNCURSESW],
+        HTOP_CHECK_LIB([ncursesw], [addnwstr], [HAVE_LIBNCURSESW],
+         HTOP_CHECK_LIB([ncurses], [addnwstr], [HAVE_LIBNCURSESW],
++         HTOP_CHECK_LIB([curses], [addnwstr], [HAVE_LIBNCURSESW],
+       missing_libraries="$missing_libraries libncursesw"
+       AC_MSG_ERROR([You may want to use --disable-unicode or install libncursesw.])
+-   ))))))
++   )))))))
+ 
+    AC_CHECK_HEADERS([ncursesw/curses.h],[:],
+       [AC_CHECK_HEADERS([ncurses/ncurses.h],[:],
+@@ -204,8 +205,9 @@
+     HTOP_CHECK_SCRIPT([ncurses], [refresh], [HAVE_LIBNCURSES], "ncurses5-config",
+      HTOP_CHECK_LIB([ncurses6],  [refresh], [HAVE_LIBNCURSES],
+       HTOP_CHECK_LIB([ncurses],  [refresh], [HAVE_LIBNCURSES],
++       HTOP_CHECK_LIB([curses],  [refresh], [HAVE_LIBNCURSES],
+       missing_libraries="$missing_libraries libncurses"
+-   ))))
++   )))))
+    
+    AC_CHECK_HEADERS([curses.h],[:],
+       [AC_CHECK_HEADERS([ncurses/curses.h],[:],
diff --git a/packages/addons/official/debug/htop/source/default.py b/packages/addons/official/debug/htop/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/debug/htop/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui
+
+dialog = xbmcgui.Dialog()
+dialog.ok('', 'This is a console-only addon')  # OK dialog with an empty heading; the script does nothing else
diff --git a/packages/addons/official/debug/strace/changelog.txt b/packages/addons/official/debug/strace/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/debug/strace/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/debug/strace/icon/icon.png b/packages/addons/official/debug/strace/icon/icon.png
new file mode 100644
index 00000000000..a712c98d0f2
Binary files /dev/null and b/packages/addons/official/debug/strace/icon/icon.png differ
diff --git a/packages/addons/official/debug/strace/package.mk b/packages/addons/official/debug/strace/package.mk
new file mode 100644
index 00000000000..9e740f2caa0
--- /dev/null
+++ b/packages/addons/official/debug/strace/package.mk
@@ -0,0 +1,45 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="strace"
+PKG_VERSION="4.16"
+PKG_REV="0"
+PKG_ARCH="any"
+PKG_LICENSE="BSD"
+PKG_SITE="http://sourceforge.net/projects/strace/"
+PKG_URL="$SOURCEFORGE_SRC/strace/strace/$PKG_VERSION/$PKG_NAME-$PKG_VERSION.tar.xz"
+PKG_DEPENDS_TARGET="toolchain"
+PKG_PRIORITY="optional"
+PKG_SECTION="debug/tools"
+PKG_SHORTDESC="strace: Trace system calls and signals"
+PKG_LONGDESC="In the simplest case strace runs the specified command until it exits. It intercepts and records the system calls which are called by a process and the signals which are received by a process. The name of each system call, its arguments and its return value are printed on standard error or to the file specified with the -o option."
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script"
+PKG_ADDON_PROVIDES=""
+
+PKG_AUTORECONF="no"
+
+makeinstall_target() {
+  : # nothing to install: addon() copies the strace binary out of the build directory
+}
+
+addon() {
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin  # create the addon's bin directory
+    cp -P $PKG_BUILD/.$TARGET_NAME/strace $ADDON_BUILD/$PKG_ADDON_ID/bin  # binary from the .$TARGET_NAME build subdirectory; -P copies symlinks as links
+}
diff --git a/packages/addons/official/debug/strace/source/default.py b/packages/addons/official/debug/strace/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/debug/strace/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui
+
+dialog = xbmcgui.Dialog()
+dialog.ok('', 'This is a console-only addon')  # OK dialog with an empty heading; the script does nothing else
diff --git a/packages/addons/official/driver/hdhomerun/changelog.txt b/packages/addons/official/driver/hdhomerun/changelog.txt
index 3aaf851ab81..937b186b22a 100644
--- a/packages/addons/official/driver/hdhomerun/changelog.txt
+++ b/packages/addons/official/driver/hdhomerun/changelog.txt
@@ -1,60 +1 @@
-8.1.0
-  rebuild for OpenELEC-8.0
-8.0.0
-  rebuild for OpenELEC-8.0
-7.0.0
-  rebuild for OpenELEC-7.0
-6.0.0
-  rebuild for OpenELEC-6.0
-4.3.2
-  allow running scripts from addon settings
-4.3.1
-  rebuild for addon api bump
-4.3.0
-  rebuild for addon api bump
-4.1.2
-  clean up
-4.1.0
-  rebuild for addon api bump
-4.0.1
-  rebuild
-4.0.0
-  rebuild for OpenELEC-4.0
-3.1.6
-  rebuild
-3.1.5
-  get tuner id with command 'hdhomerun_config discover' and
-  set number of tuners manually
-3.1.4
-  get tuner id from /var/log/messages
-3.1.3
-  fixed parsing dvbhdhomerun.log file with timestamp
-3.1.2
-  new addon settings option to enable suspend/resume driver actions
-  new addon settings option to enable logging
-3.1.1
-  rebuild for OpenELEC-3.2
-3.0.5
-  rebuild
-3.0.4
-  fixed "bad substitution" error with busybox/ash
-3.0.3
-  improved python script for modifying tuners
-3.0.2
-  added addon settings for
-    modifying tuner type (DVB-C, DVB-T, ATSC)
-    setting delays
-3.0.1
-  bump addon version
-  binary files are stored with OpenELEC image
-  addon is used to start userspace program
-2.1.3
-  bump addon version for new kernel
-2.1.2
-  dvbhdhomerun upgraded to 0.0.10
-2.1.1
-  rebuild for addon version 2.1
-2.0.1
-  starting userspace DVB drivers from Tvheadend/VDR
-2.0.0
-  initial version of HDHomeRun driver
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/driver/hdhomerun/package.mk b/packages/addons/official/driver/hdhomerun/package.mk
index bc65ae14ade..fbcc87707f6 100644
--- a/packages/addons/official/driver/hdhomerun/package.mk
+++ b/packages/addons/official/driver/hdhomerun/package.mk
@@ -28,11 +28,12 @@ PKG_PRIORITY="optional"
 PKG_SECTION="driver/dvb"
 PKG_SHORTDESC="A linux DVB driver for the HDHomeRun (http://www.silicondust.com)."
 PKG_LONGDESC="A linux DVB driver for the HDHomeRun (http://www.silicondust.com)."
+
 PKG_AUTORECONF="no"
+
 PKG_IS_ADDON="yes"
 PKG_ADDON_TYPE="xbmc.python.script"
 PKG_ADDON_PROVIDES=""
-PKG_ADDON_REPOVERSION="8.1"
 
 make_target() {
   : # nothing to do here
diff --git a/packages/addons/official/driver/imon-mce/changelog.txt b/packages/addons/official/driver/imon-mce/changelog.txt
index 8c26edc565d..937b186b22a 100644
--- a/packages/addons/official/driver/imon-mce/changelog.txt
+++ b/packages/addons/official/driver/imon-mce/changelog.txt
@@ -1,44 +1 @@
-8.1.0
-  rebuild for OpenELEC-8.0
-
-8.0.0
-  rebuild for OpenELEC-8.0
-
-7.0.0
-  rebuild for OpenELEC-7.0
-
-6.0.0
-  rebuild for OpenELEC-6.0
-
-4.3.1
- fix keymaps path
-
-4.3.0
- rebuild for addon api bump
-
-4.1.0
- rebuild for addon api bump
-
-4.0.1
- rebuild
-
-4.0.0
-  rebuild for OpenELEC-4.0
-
-3.1.1
-  rebuild for OpenELEC-3.2
-
-3.0.1
-- bump addon version
-
-2.1.1
-- update for addon version 2.1
-
-2.0.0
-- prepare for OpenELEC-2.0 release
-
-1.90.1
-- depends on xbmc.python API 2.0
-
-1.90.0
-- initial version imon-mce
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/driver/imon-mce/package.mk b/packages/addons/official/driver/imon-mce/package.mk
index 9ac2ccac9c1..e3a09d7735a 100644
--- a/packages/addons/official/driver/imon-mce/package.mk
+++ b/packages/addons/official/driver/imon-mce/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="imon-mce"
-PKG_VERSION="8.1"
+PKG_VERSION="8.2"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="OSS"
@@ -32,7 +32,6 @@ PKG_LONGDESC="imon-mce is a driver to add support for MCE remotes to the imon dr
 PKG_IS_ADDON="yes"
 PKG_ADDON_TYPE="xbmc.service"
 PKG_ADDON_PROVIDES=""
-PKG_ADDON_REPOVERSION="8.1"
 
 PKG_AUTORECONF="no"
 
diff --git a/packages/addons/official/driver/sundtek-mediatv/changelog.txt b/packages/addons/official/driver/sundtek-mediatv/changelog.txt
index 575b224b733..937b186b22a 100644
--- a/packages/addons/official/driver/sundtek-mediatv/changelog.txt
+++ b/packages/addons/official/driver/sundtek-mediatv/changelog.txt
@@ -1,74 +1 @@
-8.1.0
-  rebuild for OpenELEC-8.0
-8.0.0
-  rebuild for OpenELEC-8.0
-7.0.0
-  rebuild for OpenELEC-7.0
-6.0.1
-  update driver for new devices support
-6.0.0
-  rebuild for OpenELEC-6.0
-4.3.4
-  fixed xbmc/kodi rebranding error
-  some fixes
-4.3.3
-  addon transformed to service addon
-  new addon settings option to enable to check for new driver at boot
-4.3.2
-  new addon settings option to enable to update driver from web with latest version
-4.3.1
-  update driver
-4.3.0
-  rebuild for addon api bump
-4.1.5
-  dont touch /storage/.profile
-4.1.4
-  enable hw pid filter by default
-4.1.3
-  include driver bin/libs
-4.1.2
-  clean up. sorry. no analog tv anymore.
-4.1.1
-  clean up
-4.1.0
-  rebuild for addon api bump
-4.0.1
-  rebuild
-4.0.0
-  rebuild for OpenELEC-4.0
-3.1.3
-  rebuild
-3.1.2
-  new addon settings option to enable suspend/resume driver actions
-3.1.1
-  rebuild for OpenELEC-3.2
-3.0.6
-  rebuild
-3.0.5
-  fixed "bad substitution" error with busybox/ash
-3.0.4
-  improved python script for modifying tuners
-3.0.3
-  added addon settings for modifying tuner type (DVB-C, DVB-T)
-3.0.2
-  added addon settings
-    enable HW PID filter (enabled for RPi by default)
-    enable IR receiver
-    allow sharing local tuner over network
-    use network tuners
-  sundtek's binaries are downloaded on first run
-  automatically download new version of the binaries if available
-  added preload library to .profile (to run mediaclient, w_scan, ... from console)
-2.1.4
-  support for Raspberry Pi
-2.1.3
-  support for network tuner
-2.1.2
-  support for multiple Sundtek tuners
-  support for working with other tuners (kernel drivers)
-2.1.1
-  update to addon version 2.1
-2.0.5
-  starting userspace DVB drivers from Tvheadend/VDR
-1.90.0
-  initial version of Sundtek MediaTV DVB driver
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/driver/sundtek-mediatv/package.mk b/packages/addons/official/driver/sundtek-mediatv/package.mk
index 6e4a1cc4d36..c8bd7397977 100644
--- a/packages/addons/official/driver/sundtek-mediatv/package.mk
+++ b/packages/addons/official/driver/sundtek-mediatv/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="sundtek-mediatv"
-PKG_VERSION="8.1"
+PKG_VERSION="8.2"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="nonfree"
@@ -28,11 +28,12 @@ PKG_PRIORITY="optional"
 PKG_SECTION="driver/dvb"
 PKG_SHORTDESC="Sundtek USB Stick DVB userspace driver"
 PKG_LONGDESC="Driver for Sundtek MediaTV Pro (DVB-C, DVB-T, AnalogTV, Composite, S-Video, FM-Radio USB Stick) and Sundtek SkyTV Ultimate (DVB-S/S2 USB)."
+
 PKG_IS_ADDON="yes"
 PKG_ADDON_TYPE="xbmc.service"
 PKG_ADDON_PROVIDES=""
+
 PKG_AUTORECONF="no"
-PKG_ADDON_REPOVERSION="8.1"
 
 make_target() {
   mkdir -p $ROOT/$PKG_BUILD
diff --git a/packages/addons/official/kodi-addons/peripheral.joystick/package.mk b/packages/addons/official/kodi-addons/peripheral.joystick/package.mk
index 3f429a3ce63..f6d56bbbca8 100644
--- a/packages/addons/official/kodi-addons/peripheral.joystick/package.mk
+++ b/packages/addons/official/kodi-addons/peripheral.joystick/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="peripheral.joystick"
-PKG_VERSION="0c47f0e"
+PKG_VERSION="b464260"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/addons/official/kodi-addons/pvr.dvblink/package.mk b/packages/addons/official/kodi-addons/pvr.dvblink/package.mk
index 2a85f0166a3..3ccbdafc2aa 100644
--- a/packages/addons/official/kodi-addons/pvr.dvblink/package.mk
+++ b/packages/addons/official/kodi-addons/pvr.dvblink/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.dvblink"
-PKG_VERSION="63cf195"
+PKG_VERSION="2029eec"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/addons/official/kodi-addons/pvr.hts/package.mk b/packages/addons/official/kodi-addons/pvr.hts/package.mk
index be7565e20b5..4406e4388d5 100644
--- a/packages/addons/official/kodi-addons/pvr.hts/package.mk
+++ b/packages/addons/official/kodi-addons/pvr.hts/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.hts"
-PKG_VERSION="5bf84c3"
+PKG_VERSION="cab4353"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/addons/official/kodi-addons/pvr.vdr.vnsi/package.mk b/packages/addons/official/kodi-addons/pvr.vdr.vnsi/package.mk
index 5e003d252a5..9bd7b8a2bf6 100644
--- a/packages/addons/official/kodi-addons/pvr.vdr.vnsi/package.mk
+++ b/packages/addons/official/kodi-addons/pvr.vdr.vnsi/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.vdr.vnsi"
-PKG_VERSION="960f2d3"
+PKG_VERSION="7a17f89"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/addons/official/kodi-addons/screensavers.rsxs/package.mk b/packages/addons/official/kodi-addons/screensavers.rsxs/package.mk
index a8ad4389587..d39b556cb06 100644
--- a/packages/addons/official/kodi-addons/screensavers.rsxs/package.mk
+++ b/packages/addons/official/kodi-addons/screensavers.rsxs/package.mk
@@ -47,7 +47,7 @@ addon() {
 
     mkdir -p $ADDON_BUILD/$_ADDON_ID/
     cp -PR $PKG_BUILD/.install_pkg/usr/share/kodi/addons/$_ADDON_ID/* $ADDON_BUILD/$_ADDON_ID/
-    cp -PL $PKG_BUILD/.install_pkg/usr/lib/kodi/addons/$_ADDON_ID/*.so $ADDON_BUILD/$_ADDON_ID/
+    cp -PL $PKG_BUILD/.install_pkg/usr/lib/kodi/addons/$_ADDON_ID/*.so* $ADDON_BUILD/$_ADDON_ID/
 
     MULTI_ADDONS="$MULTI_ADDONS $_ADDON_ID"
   done
diff --git a/packages/addons/official/repository/repository.unofficial.addon.pro/changelog.txt b/packages/addons/official/repository/repository.unofficial.addon.pro/changelog.txt
index 6273baabd02..937b186b22a 100644
--- a/packages/addons/official/repository/repository.unofficial.addon.pro/changelog.txt
+++ b/packages/addons/official/repository/repository.unofficial.addon.pro/changelog.txt
@@ -1,42 +1 @@
-8.1.0
-- rebuild for OpenELEC-8.0
-
-8.0.0
-- rebuild for OpenELEC-8.0
-
-7.0.1
-- change minimum kodi version to allow using with beta/rc release
-
-7.0.0
-- rebuild for OpenELEC-7.0
-- change warning text
-
-6.0.1
-- update repo minversion for kodi 15
-
-6.0.0
-- rebuild for OpenELEC-6.0
-
-4.3.3
-- rebuild
-
-4.3.2
-- rebuild for addon api bump
-
-4.1.2
-- fix typo in repository name
-
-4.1.1
-- bump
-
-4.1.0
-- rebuild for addon api bump
-
-4.0.0
-- rebuild for OpenELEC-4.0
-
-1.0.1
-- use generic/i386 addons for atv
-
-1.0.0
-- initial release
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/repository/repository.unofficial.addon.pro/package.mk b/packages/addons/official/repository/repository.unofficial.addon.pro/package.mk
index 03333ae4a9a..aad38330340 100644
--- a/packages/addons/official/repository/repository.unofficial.addon.pro/package.mk
+++ b/packages/addons/official/repository/repository.unofficial.addon.pro/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="repository.unofficial.addon.pro"
-PKG_VERSION="8.1"
+PKG_VERSION="8.2"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/addons/official/script/script.config.vdr/package.mk b/packages/addons/official/script/script.config.vdr/package.mk
index 02a2f66ba72..261e123184c 100644
--- a/packages/addons/official/script/script.config.vdr/package.mk
+++ b/packages/addons/official/script/script.config.vdr/package.mk
@@ -32,7 +32,6 @@ PKG_LONGDESC="script.config.vdr"
 PKG_IS_ADDON="yes"
 PKG_ADDON_TYPE="dummy"
 PKG_ADDON_PROVIDES=""
-PKG_ADDON_REPOVERSION="8.0"
 
 PKG_AUTORECONF="no"
 
diff --git a/packages/addons/official/service/multimedia/boblightd/changelog.txt b/packages/addons/official/service/multimedia/boblightd/changelog.txt
index b38a9cd4dd3..937b186b22a 100644
--- a/packages/addons/official/service/multimedia/boblightd/changelog.txt
+++ b/packages/addons/official/service/multimedia/boblightd/changelog.txt
@@ -1,83 +1 @@
-8.1.0
-- rebuild for OpenELEC-8.0
-
-8.0.0
-- rebuild for OpenELEC-8.0
-
-7.0.0
-- rebuild for OpenELEC-7.0
-
-6.0.0
-- rebuild for OpenELEC-6.0
-
-4.3.5
-- added boblight-aml
-
-4.3.4
-- update to kodi
-
-4.3.0
-- rebuild for addon api bump
-
-4.1.2
-- rebuild for xlib changes
-
-4.1.1
-- update to boblightd-478
-
-4.1.0
-- convert to systemd service
-- rebuild for addon api bump
-
-4.0.1
-- rebuild
-
-4.0.0
-- rebuild for OpenELEC-4.0
-
-3.1.3
-- update to boblightd-474
-
-3.1.2
-- rebuild
-
-3.1.1
-- rebuild for OpenELEC-3.2
-
-3.0.4
-- update to boblightd-467
-
-3.0.3
-- update to boblightd-465
-
-3.0.2
-- depends on libGLU
-
-3.0.1
-- bump addon version
-- make OpenGL and X11 support optional
-
-2.1.3
-- update to boblightd-449
-
-2.1.2
-- rebuild
-
-2.1.1
-- update to addon version 2.1
-
-2.0.5
-- add possibilty to start / stop addon from addon manager with enable / disable
-- restart boblightd on sleep / resume
-
-2.0.4
-- Fixed so that boblightd will not start if it's already running
-
-2.0.3
-- Removed LD_LIBRARY_PATH dependencies
-
-2.0.2
-- Renamed to boblightd to avoid conflicts
-
-2.0.1
-- initial version boblight
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/service/multimedia/boblightd/package.mk b/packages/addons/official/service/multimedia/boblightd/package.mk
index bb77b889304..76e02b54363 100644
--- a/packages/addons/official/service/multimedia/boblightd/package.mk
+++ b/packages/addons/official/service/multimedia/boblightd/package.mk
@@ -32,7 +32,6 @@ PKG_LONGDESC="Boblight's main purpose is to create light effects from an externa
 PKG_IS_ADDON="yes"
 PKG_ADDON_TYPE="xbmc.service"
 PKG_ADDON_PROVIDES=""
-PKG_ADDON_REPOVERSION="8.1"
 
 PKG_AUTORECONF="yes"
 
diff --git a/packages/addons/official/service/multimedia/vdr-addon/changelog.txt b/packages/addons/official/service/multimedia/vdr-addon/changelog.txt
index 9a13af6a079..937b186b22a 100644
--- a/packages/addons/official/service/multimedia/vdr-addon/changelog.txt
+++ b/packages/addons/official/service/multimedia/vdr-addon/changelog.txt
@@ -1,323 +1 @@
-8.1.0
-- rebuild for OpenELEC-8.0
-
-8.0.1
-- vdr: add upstream patches
-- update to vdr-plugin-dvbapi-7a42b22
-- update to vdr-plugin-iptv-2226be2
-- update to vdr-plugin-restfulapi-0.2.6.5
-- update to vdr-plugin-satip-581ac49
-- update to vdr-plugin-vnsiserver-2c1f90b
-
-8.0.0
-- rebuild for OpenELEC-8.0
-- remove vdr-plugin-xmltv2vdr support
-
-7.0.3
-- update to vdr-plugin-vnsiserver-c63d8e8
-- udate to vdr-plugin-satip-18c9b79
-- update to vdr-plugin-dvbapi-c76c24e
-
-7.0.2
-- update to vdr-plugin-restfulapi-0.2.6.0
-- update to vdr-plugin-vnsiserver-00b5779
-- update to vdr-satip-b755dbf
-
-7.0.1
-- update to vdr-plugin-xmltv2vdr-b48e0bec
-- update to vdr-plugin-streamdev-fc52e92
-- update to vdr-plugin-restfulapi-0.2.5.6
-- update to vdr-epgsearch-0b09f90
-- update to vdr-plugin-dvbapi-31f409d
-- update to vdr-plugin-vnsiserver-d96f211
-- update to vdr-satip-9d5f7cc
-
-7.0.0
-- rebuild for OpenELEC-7.0
-
-6.0.1
-- update to vdr-plugin-dvbapi-0489e01
-- update to vdr-plugin-vnsiserver-a7b0670
-- update to vdr-satip-2.2.2
-- update to vdr-iptv-2.2.1
-
-6.0.0
-- rebuild for OpenELEC-6.0
-- remove plugin 'control'
-- remove plugin 'xvdr'
-- update to vdr-plugin-vnsiserver-36e2b61
-
-4.3.11
-- update to vdr-plugin-restfulapi-0.2.1.4
-
-4.3.10
-- disable dvbapi network mode
-
-4.3.9
-- update to vdr-2.2.0
-- update to vdr-iptv-2.2.0
-- update to vdr-satip-2.2.0
-- update to vdr-plugin-dvbapi-c0c7fa2
-- update to vdr-plugin-eepg-9cd9a75
-- update to vdr-plugin-vnsiserver-9529e6d
-- update to vdr-plugin-streamdev-84c6f6b
-
-4.3.8
-- update to vdr-2.1.10
-- update to vdr-plugin-restfulapi-0.2.1.1
-- update to vdr-plugin-vnsiserver-f2175ba
-
-4.3.7
-- update to vdr-2.1.8
-- update to vdr-plugin-vnsiserver-b887bc8
-- update to vdr-wirbelscan-0.0.9
-
-4.3.6
-- update to vdr-satip-1.0.2
-- update to vdr-2.1.7
-
-4.3.5
-- add plugin 'vdr-plugin-restfulapi'
-- update to vdr-plugin-xvdr-b300fc3
-
-4.3.4
-- update to vdr-plugin-vnsiserver-a48edf1
-- update to vdr-plugin-dvbapi-36a6b1d
-
-4.3.3
-- add plugin 'vdr-plugin-epgfixer'
-- update to vdr-satip-1.0.1
-
-4.3.2
-- update to kodi
-
-4.3.1
-- update to vdr-plugin-vnsiserver-a7cb405
-- update to vdr-plugin-dvbapi-2617a7f
-
-4.3.0
-- rebuild for addon api bump
-
-4.1.7
-- add option to override epg charset
-- update to vdr-plugin-vnsiserver-7d4aa81
-- update to vdr-satip-0.3.3
-- update to vdr-plugin-eepg-d7dc614
-- update to vdr-plugin-dvbapi-bdcad3f
-- update to vdr-iptv-2.1.3
-
-4.1.6
-- update to vdr-2.1.6
-- add plugin 'vdr-dummydevice'
-- add plugin 'vdr-satip'
-- remove plugin: 'sc'
-- update to vdr-iptv-2.1.2
-- update to vdr-plugin-xvdr-7f49bfa
-- update to vdr-plugin-vnsiserver-a3f7ac5
-- update to vdr-plugin-dvbapi-bf11f9e
-
-4.1.5
-- fix issues with "wait for frontend init". thanks @sraue
-
-4.1.4
-- add plugin 'vdr-plugin-eepg'
-- update to vdr-plugin-xvdr-4a9d95e
-- update to vdr-plugin-dvbapi-a9b738e
-
-4.1.3
-- update to vdr-2.1.5
-- update to vdr-iptv-2.1.0
-- update to vdr-plugin-vnsiserver-e5f02b6
-- update to vdr-plugin-dvbapi-a3b4a5a
-- update to vdr-plugin-xvdr-7d6ebb7
-
-4.1.2
-- update to vdr-2.1.3
-- update to vdr-iptv-2.0.1
-- update to vdr-plugin-dvbapi-4d9de95
-- update to vdr-plugin-streamdev-40704cd
-- update to vdr-plugin-vnsiserver-9021115
-- update to vdr-epgsearch-29c174a
-
-4.1.1
-- rebuild to fix curl/gnutls/ssl
-
-4.1.0
-- convert to systemd service
-- rebuild for addon api bump
-
-4.0.3
-- update to vdr-plugin-vnsiserver-e2e6804 (vnsi5)
-
-4.0.2
-- update to vdr-2.1.2
-- update to vdr-plugin-streamdev-a9c2adb
-- add option to run external reccmd (noad)
-
-4.0.1
-- update to vdr-2.1.1
-- remove rotorng. dish positioner is now implemented in vdr
-
-4.0.0
-- update to vdr-2.0.3
-- update to vdr-plugin-vnsiserver-cd5023b
-- update to vdr-plugin-xvdr-6249892
-- remove dvbsddevice / dvbhddevice
-
-3.1.4
-- vdr-plugin-dvbapi: switch to libdvbcsa
-
-3.1.3
-- rebuild
-
-3.1.2
-- fix unloading multiple modules on suspend
-
-3.1.1
-- rebuild for OpenELEC-3.2
-
-3.0.11
-- fix epg/live charset issue
-
-3.0.10
-- improve suspend/resume
-
-3.0.9
-- rebuild
-
-3.0.8
-- vdr-plugin-xmltv2vdr: use VDR_CACHE_DIR for epg.db
-- use own /var/run/vdr tmpfs mount
-
-3.0.7
-- update to vdr-plugin-xmltv2vdr-30903cc0
-- cleanup epgsources handling
-- * support for multiple epgsources
-- * support user defined epgsources
-- fixed epgsearch plugin not working without streamdev-server
-- vdr-addon: update to vdr-plugin-xvdr-33afe59
-
-3.0.6
-- fix startup error in vnsiserver / 32bit
-
-3.0.5
-- updated to vdr-2.0.1
-- remove dynamite plugin
-- update to vdr-epgsearch-e2de927
-- update to vdr-iptv-2.0.0
-- update to vdr-plugin-streamdev-329129d
-- update to vdr-live-0.3.0
-- update to vdr-plugin-dvbapi-555272d
-- update to vdr-plugin-xmltv2vdr-90c023f6
-- update to vdr-plugin-xvdr-935a294
-- update to rotorng-0.3.1
-- added dvbsddevice plugin
-- added dvbhddevice plugin
-
-3.0.4
-- update to vdr-1.1.33
-- update to vdr-plugin-dvbapi-cd93752
-- update to vdr-plugin-streamdev-f58086a
-- update to vdr-epgsearch-0fc4817
-- remove text2skin plugin
-- remove xinelib plugin
-
-3.0.3
-- update to vdr-sc-620
-- update to vdr-plugin-xvdr-c43033c
-- update to vdr-plugin-vnsiserver-e3cd383
-- update to vdr-plugin-streamdev-9135cde
-- update to vdr-plugin-dvbapi-e87e15f
-- update to vdr-dynamite-914af24
-
-3.0.2
-- update to vdr-plugin-xmltv2vdr-8be374e
-- update to vdr-plugin-xvdr-c2fa08a
-- added vdr-plugin-vnsiserver
-
-3.0.1
-- bump addon version
-
-2.1.8
-- rebuild due to some static libs in latest git
-
-2.1.7
-- update to vdr-plugin-streamdev-6a47e20
-- update to vdr-sc-613
-
-2.1.6
-- update to vdr-plugin-dvbapi-e3200c8
-- added vdr-xmltv2vdr plugin
-- added xineliboutput plugin
-- added text2skin plugin
-- preparation for xine based VDR frontend as separate addon
-
-2.1.5
-- fixed rotorng
-- added wirbelscancontrol plugin
-
-2.1.4
-- added live plugin
-- added rotorng plugin
-- added control plugin
-- added epgsearch plugin
-- update to vdr-iptv-0.5.2
-- update to vdr-plugin-streamdev-8719007
-- update to vdr-plugin-xvdr-b62ccbd
-
-2.1.3
-- adjust for fontconfig 2.9.0
-- update patchset 
-- suspend/resume fixes
-
-2.1.2
-- update to vdr-1.7.27
-- update to vdr-dynamite-10d78a8
-- update to vdr-sc-605
-
-2.1.1
-- rebuild for addon version 2.1
-- update to vdr-plugin-dvbapi-9bef03f
-
-2.0.9
-- modules to remove on sleep now configurable via settings ui
-- enabled plugins now configurable via settings ui
-
-2.0.8
-- update to vdr-1.7.25
-- update to vdr-plugin-xvdr-c98852f
-- add streamdev-client / server plugin
-- update to vdr-iptv-0.5.0
-- move w_scan in a own addon
-
-2.0.7
-- add possibilty to start / stop addon from addon manager with enable / disable
-
-2.0.6
-- enable unloading/loading dvb modules on suspend/resume
-
-2.0.5
-- update to vdr-1.7.24
-- sc now works without the need to disable dynamite plugin
-
-2.0.4
-- start userspace DVB drivers before VDR
-
-2.0.3
-- update to vdr-plugin-xvdr-0ac808a
-- rebuild for libiconv changes
-- update to vdr-plugin-dvbapi-b0194c8
-- update to w_scan-20120112
-
-2.0.2
-- update to vdr-plugin-xvdr-ec9b759
-- update to vdr-plugin-dvbapi-dad660a
-- update to vdr-dynamite-27d7bed
-- add plugin 'vdr-iptv'
-
-2.0.1
-- rename vdr-dvbapi-plugin to vdr-plugin-dvbapi
-- update to vdr-plugin-dvbapi-68e043a
-
-2.0.0
-- initial version vdr-1.7.22
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/service/multimedia/vdr-addon/package.mk b/packages/addons/official/service/multimedia/vdr-addon/package.mk
index 8bb253ba1e1..ab04d1b0998 100644
--- a/packages/addons/official/service/multimedia/vdr-addon/package.mk
+++ b/packages/addons/official/service/multimedia/vdr-addon/package.mk
@@ -18,7 +18,7 @@
 ################################################################################
 
 PKG_NAME="vdr-addon"
-PKG_VERSION="8.1"
+PKG_VERSION="8.2"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
@@ -29,13 +29,14 @@ PKG_PRIORITY="optional"
 PKG_SECTION="service.multimedia"
 PKG_SHORTDESC="vdr: A powerful DVB TV application"
 PKG_LONGDESC="This project describes how to build your own digital satellite receiver and video disk recorder. It is based mainly on the DVB-S digital satellite receiver card, which used to be available from Fujitsu Siemens and the driver software developed by the LinuxTV project."
+
 PKG_AUTORECONF="no"
+
 PKG_IS_ADDON="yes"
 PKG_ADDON_TYPE="xbmc.service"
 PKG_ADDON_PROVIDES=""
 PKG_ADDON_REQUIRES="pvr.vdr.vnsi:0.0.0 script.config.vdr:0.0.0"
 PKG_ADDON_NAME="VDR PVR Backend"
-PKG_ADDON_REPOVERSION="8.1"
 
 make_target() {
   : # nothing to do here
diff --git a/packages/addons/official/shell/console/screen/changelog.txt b/packages/addons/official/shell/console/screen/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/shell/console/screen/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/shell/console/screen/icon/icon.png b/packages/addons/official/shell/console/screen/icon/icon.png
new file mode 100644
index 00000000000..bfad6784bce
Binary files /dev/null and b/packages/addons/official/shell/console/screen/icon/icon.png differ
diff --git a/packages/addons/official/shell/console/screen/package.mk b/packages/addons/official/shell/console/screen/package.mk
new file mode 100644
index 00000000000..d2102ac7dbe
--- /dev/null
+++ b/packages/addons/official/shell/console/screen/package.mk
@@ -0,0 +1,59 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="screen"
+PKG_VERSION="4.5.1"
+PKG_REV="0"
+PKG_ARCH="any"
+PKG_LICENSE="GPL"
+PKG_SITE="http://www.gnu.org/software/screen/"
+PKG_URL="http://ftp.gnu.org/gnu/screen/screen-${PKG_VERSION}.tar.gz"
+PKG_DEPENDS_TARGET="toolchain netbsd-curses"
+PKG_PRIORITY="optional"
+PKG_SECTION="shell/console"
+PKG_SHORTDESC="terminal multiplexor with VT100/ANSI terminal emulation"
+PKG_LONGDESC="screen is a terminal multiplexor that runs several separate 'screens' on a single physical character-based terminal. Each virtual terminal emulates a DEC VT100 plus several ANSI X3.64 and ISO 2022 functions. Screen sessions can be detached and resumed later on a different terminal." # single quotes on purpose: a nested double quote would close the string and be dropped from the value
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script" # the script shipped with this addon (source/default.py) only shows a notice dialog
+PKG_ADDON_PROVIDES=""
+
+PKG_AUTORECONF="no"
+
+PKG_CONFIGURE_OPTS_TARGET="ac_cv_header_utempter_h=no \
+                           --disable-pam \
+                           --disable-use-locale \
+                           --disable-telnet \
+                           --disable-socket-dir" # ac_cv_header_utempter_h=no pre-answers configure's utempter.h header check with "not found"
+
+pre_configure_target() {
+  export LDFLAGS=$(echo $LDFLAGS | sed -e "s|-Wl,--as-needed||")
+
+# screen does not build in a separate subdirectory: build in the source tree instead
+  cd $ROOT/$PKG_BUILD
+    rm -rf .$TARGET_NAME
+}
+
+makeinstall_target() {
+  : # no-op on purpose; addon() takes the binary straight from $PKG_BUILD
+}
+
+addon() { # assemble the addon payload under $ADDON_BUILD/$PKG_ADDON_ID
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin
+    cp -P $PKG_BUILD/screen $ADDON_BUILD/$PKG_ADDON_ID/bin/screen # binary sits in the source root (in-tree build); -P copies symlinks as links
+}
diff --git a/packages/addons/official/shell/console/screen/patches/screen-cross-compile.patch b/packages/addons/official/shell/console/screen/patches/screen-cross-compile.patch
new file mode 100644
index 00000000000..b4bc30797c9
--- /dev/null
+++ b/packages/addons/official/shell/console/screen/patches/screen-cross-compile.patch
@@ -0,0 +1,64 @@
+--- a/configure
++++ b/configure
+@@ -414,7 +414,7 @@ as_fn_error ()
+     $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+   fi
+   $as_echo "$as_me: error: $2" >&2
+-  as_fn_exit $as_status
++  # as_fn_exit $as_status
+ } # as_fn_error
+ 
+ if expr a : '\(a\)' >/dev/null 2>&1 &&
+@@ -5941,8 +5941,6 @@ else
+ fi
+ rm -f core conftest.err conftest.$ac_objext \
+     conftest$ac_exeext conftest.$ac_ext
+-test -f /lib/libsec.a || test -f /usr/lib/libsec.a && LIBS="$LIBS -lsec"
+-test -f /lib/libshadow.a || test -f /usr/lib/libshadow.a && LIBS="$LIBS -lshadow"
+ oldlibs="$LIBS"
+ LIBS="$LIBS -lsun"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking IRIX sun library..." >&5
+@@ -7023,7 +7021,7 @@ as_fn_error ()
+     $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+   fi
+   $as_echo "$as_me: error: $2" >&2
+-  as_fn_exit $as_status
++  # as_fn_exit $as_status
+ } # as_fn_error
+ 
+ 
+--- a/pty.c
++++ b/pty.c
+@@ -39,9 +39,9 @@
+ #endif
+ 
+ /* for solaris 2.1, Unixware (SVR4.2) and possibly others */
+-#ifdef HAVE_STROPTS_H
+-# include <sys/stropts.h>
+-#endif
++//#ifdef HAVE_STROPTS_H
++//# include <sys/stropts.h>
++//#endif
+ 
+ #if defined(sun) && defined(LOCKPTY) && !defined(TIOCEXCL)
+ # include <sys/ttold.h>
+--- a/sched.h
++++ b/sched.h
+@@ -27,6 +27,11 @@
+  * $Id$ GNU
+  */
+ 
++#ifndef __SCHED_H
++#define __SCHED_H
++
++#include <sys/time.h>
++
+ struct event
+ {
+   struct event *next;
+@@ -46,3 +51,5 @@ struct event
+ #define EV_READ		1
+ #define EV_WRITE	2
+ #define EV_ALWAYS	3
++
++#endif
diff --git a/packages/addons/official/shell/console/screen/patches/screen-dont_link_against_libelf.patch b/packages/addons/official/shell/console/screen/patches/screen-dont_link_against_libelf.patch
new file mode 100644
index 00000000000..496246a3ec4
--- /dev/null
+++ b/packages/addons/official/shell/console/screen/patches/screen-dont_link_against_libelf.patch
@@ -0,0 +1,22 @@
+diff -Naur screen-4.3.1/configure screen-4.3.1.patch/configure
+--- screen-4.3.1/configure	2015-06-28 23:42:40.000000000 +0200
++++ screen-4.3.1.patch/configure	2016-03-26 17:52:12.807054501 +0100
+@@ -4156,7 +4156,6 @@
+ 
+ 
+ oldlibs="$LIBS"
+-LIBS="$LIBS -lelf"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking SVR4..." >&5
+ $as_echo "$as_me: checking SVR4..." >&6;}
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+diff -Naur screen-4.3.1/configure.ac screen-4.3.1.patch/configure.ac
+--- screen-4.3.1/configure.ac	2015-06-28 23:22:55.000000000 +0200
++++ screen-4.3.1.patch/configure.ac	2016-03-26 17:52:03.538036990 +0100
+@@ -203,7 +203,6 @@
+ ], LIBS="$LIBS -lsocket -linet";seqptx=1)
+ 
+ oldlibs="$LIBS"
+-LIBS="$LIBS -lelf"
+ AC_CHECKING(SVR4)
+ AC_TRY_LINK([#include <utmpx.h>
+ ],,
diff --git a/packages/addons/official/shell/console/screen/source/default.py b/packages/addons/official/shell/console/screen/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/shell/console/screen/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui
+
+# Nothing to run from the GUI: just show a notice dialog.
+xbmcgui.Dialog().ok('', 'This is a console-only addon')
diff --git a/packages/addons/official/tools/file/changelog.txt b/packages/addons/official/tools/file/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/tools/file/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/tools/file/icon/icon.png b/packages/addons/official/tools/file/icon/icon.png
new file mode 100644
index 00000000000..0493e3ef29a
Binary files /dev/null and b/packages/addons/official/tools/file/icon/icon.png differ
diff --git a/packages/addons/official/tools/file/package.mk b/packages/addons/official/tools/file/package.mk
new file mode 100644
index 00000000000..c86cfe84130
--- /dev/null
+++ b/packages/addons/official/tools/file/package.mk
@@ -0,0 +1,52 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="file"
+PKG_VERSION="5.30"
+PKG_REV="0"
+PKG_ARCH="any"
+PKG_LICENSE="BSD"
+PKG_SITE="http://www.darwinsys.com/file/"
+PKG_URL="ftp://ftp.astron.com/pub/file/$PKG_NAME-$PKG_VERSION.tar.gz"
+PKG_DEPENDS_HOST="ccache:host"
+PKG_DEPENDS_TARGET="toolchain zlib file:host" # file:host: presumably the host-built file is needed to compile magic.mgc when cross-compiling - TODO confirm
+PKG_PRIORITY="optional"
+PKG_SECTION="tools"
+PKG_SHORTDESC="file: File type identification utility"
+PKG_LONGDESC="These are the sources to Darwin's file(1) utility and master magic(4) file, now maintained by Christos Zoulas. The file(1) utility is used to determine the types of various files."
+
+PKG_AUTORECONF="yes"
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script" # the script shipped with this addon (source/default.py) only shows a notice dialog
+PKG_ADDON_PROVIDES=""
+PKG_DISCLAIMER="this is an unofficial addon. please don't ask for support in openelec forum / irc channel" # NOTE(review): this package lives under packages/addons/official - confirm the "unofficial" wording is still intended
+
+PKG_CONFIGURE_OPTS_HOST="--enable-fsect-man5 --enable-static --disable-shared"
+PKG_CONFIGURE_OPTS_TARGET="--enable-fsect-man5 --enable-static --disable-shared"
+
+makeinstall_target() {
+  : # no-op on purpose; addon() collects the files from the target build dir
+}
+
+addon() { # assemble the addon payload under $ADDON_BUILD/$PKG_ADDON_ID
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin
+    cp -PR $PKG_BUILD/.$TARGET_NAME/src/file $ADDON_BUILD/$PKG_ADDON_ID/bin # the file binary from the target build dir
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/data
+    cp -PR $PKG_BUILD/.$TARGET_NAME/magic/magic.mgc $ADDON_BUILD/$PKG_ADDON_ID/data # compiled magic database; file-move-magic.mgc-to-addon-data.patch makes file look for it in $HOME/.kodi/addons/tools.file/data
+}
diff --git a/packages/addons/official/tools/file/patches/file-fix-segfault-in-magic_close.patch b/packages/addons/official/tools/file/patches/file-fix-segfault-in-magic_close.patch
new file mode 100644
index 00000000000..25b0f1bb217
--- /dev/null
+++ b/packages/addons/official/tools/file/patches/file-fix-segfault-in-magic_close.patch
@@ -0,0 +1,17 @@
+diff --git a/src/apprentice.c b/src/apprentice.c
+index 961e83d..b7d500c 100644
+--- a/src/apprentice.c
++++ b/src/apprentice.c
+@@ -348,11 +348,9 @@ apprentice_1(struct magic_set *ms, const char *fn, int action)
+ protected void
+ file_ms_free(struct magic_set *ms)
+ {
+-	size_t i;
+ 	if (ms == NULL)
+ 		return;
+-	for (i = 0; i < MAGIC_SETS; i++)
+-		mlist_free(ms->mlist[i]);
++	mlist_free(ms->mlist[0]);
+ 	free(ms->o.pbuf);
+ 	free(ms->o.buf);
+ 	free(ms->c.li);
diff --git a/packages/addons/official/tools/file/patches/file-move-magic.mgc-to-addon-data.patch b/packages/addons/official/tools/file/patches/file-move-magic.mgc-to-addon-data.patch
new file mode 100644
index 00000000000..cbe0fb5db37
--- /dev/null
+++ b/packages/addons/official/tools/file/patches/file-move-magic.mgc-to-addon-data.patch
@@ -0,0 +1,25 @@
+From 1191268f518d298c8617a4a74644ee1bb627f614 Mon Sep 17 00:00:00 2001
+From: Stefan Saraev <stefan@saraev.ca>
+Date: Mon, 25 Aug 2014 15:35:14 +0300
+Subject: [PATCH] move magic.mgc to addon/data
+
+---
+ src/magic.c |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/src/magic.c b/src/magic.c
+index e4bd12b..59c1456 100644
+--- a/src/magic.c
++++ b/src/magic.c
+@@ -100,7 +100,7 @@ get_default_magic(void)
+ 	if ((home = getenv("HOME")) == NULL)
+ 		return MAGIC;
+ 
+-	if (asprintf(&hmagicpath, "%s/.magic.mgc", home) < 0)
++	if (asprintf(&hmagicpath, "%s/.kodi/addons/tools.file/data/magic.mgc", home) < 0)
+ 		return MAGIC;
+ 	if (stat(hmagicpath, &st) == -1) {
+ 		free(hmagicpath);
+-- 
+1.7.2.5
+
diff --git a/packages/addons/official/tools/file/patches/file-zip.patch b/packages/addons/official/tools/file/patches/file-zip.patch
new file mode 100644
index 00000000000..f16e3126e96
--- /dev/null
+++ b/packages/addons/official/tools/file/patches/file-zip.patch
@@ -0,0 +1,26 @@
+From e990dc53f658cb67ec7b33c32e0bc24e65801792 Mon Sep 17 00:00:00 2001
+From: Christos Zoulas <christos@zoulas.com>
+Date: Sat, 14 May 2011 15:04:15 +0000
+Subject: [PATCH] small archive
+
+---
+ magic/Magdir/archive |    5 ++++-
+ 1 files changed, 4 insertions(+), 1 deletions(-)
+
+diff --git a/magic/Magdir/archive b/magic/Magdir/archive
+index e84c8e3..5ad1ce3 100644
+--- a/magic/Magdir/archive
++++ b/magic/Magdir/archive
+@@ -654,6 +654,9 @@
+ >>>>78	string	-template		Template
+ !:mime	application/vnd.oasis.opendocument.image-template
+ 
++>26	byte	x			Zip archive data
++
++
+ # StarView Metafile
+ # From Pierre Ducroquet <pinaraf@pinaraf.info>
+ 0	string	VCLMTF	StarView MetaFile
+-- 
+1.7.4.1
+
diff --git a/packages/addons/official/tools/file/source/default.py b/packages/addons/official/tools/file/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/tools/file/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui
+
+# Nothing to run from the GUI: just show a notice dialog.
+xbmcgui.Dialog().ok('', 'This is a console-only addon')
diff --git a/packages/addons/official/tools/mc/changelog.txt b/packages/addons/official/tools/mc/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/tools/mc/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/tools/mc/icon/icon.png b/packages/addons/official/tools/mc/icon/icon.png
new file mode 100644
index 00000000000..4bee53d244c
Binary files /dev/null and b/packages/addons/official/tools/mc/icon/icon.png differ
diff --git a/packages/addons/official/tools/mc/package.mk b/packages/addons/official/tools/mc/package.mk
new file mode 100644
index 00000000000..69c90201c8c
--- /dev/null
+++ b/packages/addons/official/tools/mc/package.mk
@@ -0,0 +1,72 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="mc"
+PKG_VERSION="4.8.19"
+PKG_REV="0"
+PKG_ARCH="any"
+PKG_LICENSE="GPL"
+PKG_SITE="http://www.midnight-commander.org"
+PKG_URL="http://ftp.midnight-commander.org/${PKG_NAME}-${PKG_VERSION}.tar.xz"
+PKG_DEPENDS_TARGET="toolchain libtool:host gettext:host glib pcre netbsd-curses"
+PKG_PRIORITY="optional"
+PKG_SECTION="tools"
+PKG_SHORTDESC="mc: visual file manager"
+PKG_LONGDESC="GNU Midnight Commander is a visual file manager, licensed under GNU General Public License and therefore qualifies as Free Software. It's a feature rich full-screen text mode application that allows you to copy, move and delete files and whole directory trees, search for files and run commands in the subshell. Internal viewer and editor are included"
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script" # the script shipped with this addon (source/default.py) only shows a notice dialog
+PKG_ADDON_PROVIDES=""
+
+PKG_AUTORECONF="yes"
+
+PKG_CONFIGURE_OPTS_TARGET="ac_cv_search_addwstr=yes \
+            --sysconfdir=/storage/.kodi/addons/tools.mc/etc \
+            --datadir=/storage/.kodi/addons/tools.mc/data \
+            --libdir=/storage/.kodi/addons/tools.mc/mclib \
+            --disable-mclib \
+            --disable-aspell \
+            --disable-vfs \
+            --disable-doxygen-doc \
+            --disable-doxygen-dot \
+            --disable-doxygen-html \
+            --with-sysroot=$SYSROOT_PREFIX \
+            --with-screen=ncurses \
+            --without-x \
+            --with-gnu-ld \
+            --without-libiconv-prefix \
+            --without-libintl-prefix \
+            --with-internal-edit \
+            --without-diff-viewer \
+            --with-subshell" # sysconfdir/datadir/libdir all sit under /storage/.kodi/addons/tools.mc, the same tree addon() copies out of .install_pkg
+
+pre_configure_target() {
+  export LDFLAGS=$(echo $LDFLAGS | sed -e "s|-Wl,--as-needed||") # strip the first -Wl,--as-needed from the linker flags
+  export LIBS="-lcurses -lterminfo" # link against libcurses and libterminfo
+}
+
+post_makeinstall_target() { # trim the installed data tree before addon() packages it
+  rm -rf $INSTALL/storage/.kodi/addons/tools.mc/data/locale # drop the locale directory
+  rm -rf $INSTALL/storage/.kodi/addons/tools.mc/data/mc/help/mc.hlp.* # drop help files named mc.hlp.<suffix>; a plain mc.hlp does not match this glob
+}
+
+addon() { # assemble the addon payload under $ADDON_BUILD/$PKG_ADDON_ID
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin
+    cp -Pa $PKG_BUILD/.install_pkg/usr/bin/* $ADDON_BUILD/$PKG_ADDON_ID/bin/ # everything installed to usr/bin goes into the addon's bin/
+    cp -Pa $PKG_BUILD/.install_pkg/storage/.kodi/addons/tools.mc/* $ADDON_BUILD/$PKG_ADDON_ID # whatever was installed under the configured sysconfdir/datadir/libdir prefix goes into the addon root
+}
diff --git a/packages/addons/official/tools/mc/source/default.py b/packages/addons/official/tools/mc/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/tools/mc/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui
+
+# Nothing to run from the GUI: just show a notice dialog.
+xbmcgui.Dialog().ok('', 'This is a console-only addon')
diff --git a/packages/addons/official/tools/mesa-demos/changelog.txt b/packages/addons/official/tools/mesa-demos/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/tools/mesa-demos/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/tools/mesa-demos/icon/icon.png b/packages/addons/official/tools/mesa-demos/icon/icon.png
new file mode 100644
index 00000000000..f5b4f1f5ec7
Binary files /dev/null and b/packages/addons/official/tools/mesa-demos/icon/icon.png differ
diff --git a/packages/sysutils/irqbalance/package.mk b/packages/addons/official/tools/mesa-demos/package.mk
similarity index 51%
rename from packages/sysutils/irqbalance/package.mk
rename to packages/addons/official/tools/mesa-demos/package.mk
index 7c5ac8accb2..4b3cb97f9ea 100644
--- a/packages/sysutils/irqbalance/package.mk
+++ b/packages/addons/official/tools/mesa-demos/package.mk
@@ -16,33 +16,34 @@
 #  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
 ################################################################################
 
-PKG_NAME="irqbalance"
-PKG_VERSION="aa04f78"
-PKG_REV="1"
-PKG_ARCH="i386 x86_64 arm"
-PKG_LICENSE="GPLv2"
-PKG_SITE="https://github.com/Irqbalance/irqbalance"
-PKG_GIT_URL="https://github.com/Irqbalance/irqbalance.git"
-PKG_GIT_BRANCH="master"
-PKG_DEPENDS_TARGET="toolchain glib systemd"
+PKG_NAME="mesa-demos"
+PKG_VERSION="8.3.0"
+PKG_REV="0"
+PKG_ARCH="i386 x86_64"
+PKG_LICENSE="OSS"
+PKG_SITE="http://www.mesa3d.org/"
+PKG_URL="ftp://ftp.freedesktop.org/pub/mesa/demos/$PKG_VERSION/$PKG_NAME-$PKG_VERSION.tar.bz2"
+PKG_DEPENDS_TARGET="toolchain libX11 mesa glu glew"
 PKG_PRIORITY="optional"
-PKG_SECTION="system"
-PKG_SHORTDESC="irqbalance: a daemon to help balance the cpu load generated by interrupts across all of a systems cpus"
-PKG_LONGDESC="irqbalance is a daemon to help balance the cpu load generated by interrupts across all of a systems cpus."
+PKG_SECTION="tools"
+PKG_SHORTDESC="mesa-demos: Mesa 3D demos"
+PKG_LONGDESC="Mesa 3D demos - installed are the well known glxinfo and glxgears."
 
-PKG_IS_ADDON="no"
-PKG_AUTORECONF="yes"
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script"
+PKG_ADDON_PROVIDES=""
 
-PKG_CONFIGURE_OPTS_TARGET="--with-systemd --with-glib2 --without-libcap-ng"
+PKG_AUTORECONF="yes"
 
-post_makeinstall_target() {
-  mkdir -p $INSTALL/etc
-    cp $ROOT/$PKG_BUILD/misc/irqbalance.env $INSTALL/etc/irqbalance
+PKG_CONFIGURE_OPTS_TARGET="--without-glut"
 
-  mkdir -p $INSTALL/usr/lib/systemd/system
-    cp $ROOT/$PKG_BUILD/misc/irqbalance.service $INSTALL/usr/lib/systemd/system
+makeinstall_target() {
+  : # nop - deliberately installs nothing; addon() below copies the built demos
 }
 
-post_install() {
-  enable_service irqbalance.service
+addon() {  # collect the built X demo binaries into the addon's bin directory
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin  # create the destination (and any missing parents)
+    cp -P $PKG_BUILD/.$TARGET_NAME/src/xdemos/glxdemo $ADDON_BUILD/$PKG_ADDON_ID/bin/  # NOTE(review): shipped, but PKG_LONGDESC only names glxinfo and glxgears
+    cp -P $PKG_BUILD/.$TARGET_NAME/src/xdemos/glxgears $ADDON_BUILD/$PKG_ADDON_ID/bin/  # -P: copy symlinks as symlinks instead of following them
+    cp -P $PKG_BUILD/.$TARGET_NAME/src/xdemos/glxinfo $ADDON_BUILD/$PKG_ADDON_ID/bin/
 }
diff --git a/packages/addons/official/tools/mesa-demos/source/default.py b/packages/addons/official/tools/mesa-demos/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/tools/mesa-demos/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui  # Kodi GUI module; provides the Dialog class used below
+
+dialog = xbmcgui.Dialog()
+dialog.ok('', 'This is a console-only addon')  # OK dialog: first argument left empty, second is the notice shown when the addon is launched
diff --git a/packages/addons/official/tools/w_scan/changelog.txt b/packages/addons/official/tools/w_scan/changelog.txt
new file mode 100644
index 00000000000..937b186b22a
--- /dev/null
+++ b/packages/addons/official/tools/w_scan/changelog.txt
@@ -0,0 +1 @@
+see https://github.com/OpenELEC/OpenELEC.tv/tree/master/packages/addons/official
\ No newline at end of file
diff --git a/packages/addons/official/tools/w_scan/icon/icon.png b/packages/addons/official/tools/w_scan/icon/icon.png
new file mode 100644
index 00000000000..ba5c9edd144
Binary files /dev/null and b/packages/addons/official/tools/w_scan/icon/icon.png differ
diff --git a/packages/addons/official/tools/w_scan/package.mk b/packages/addons/official/tools/w_scan/package.mk
new file mode 100644
index 00000000000..507c75c90a1
--- /dev/null
+++ b/packages/addons/official/tools/w_scan/package.mk
@@ -0,0 +1,45 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="w_scan"  # package metadata for the w_scan addon
+PKG_VERSION="20170107"
+PKG_REV="0"
+PKG_ARCH="any"
+PKG_LICENSE="GPL"
+PKG_SITE="http://wirbel.htpc-forum.de/w_scan/index2.html"
+PKG_URL="http://wirbel.htpc-forum.de/w_scan/$PKG_NAME-$PKG_VERSION.tar.bz2"  # source tarball URL, derived from name and version
+PKG_DEPENDS_TARGET="toolchain"
+PKG_PRIORITY="optional"
+PKG_SECTION="tools"
+PKG_SHORTDESC="w_scan: eine kleine Anwendung zum Scannen von ATSC/DVB-C/S/T Transpondern/Bouquets nach Sendern und Erstellen einer VDR channels.conf."  # NOTE(review): German text; other packages (e.g. mesa-demos) use English descriptions
+PKG_LONGDESC="w_scan ist eine kleine Anwendung zum Scannen von ATSC/DVB-C/S/T Transpondern/Bouquets nach Sendern und Erstellen einer VDR channels.conf."  # NOTE(review): German text, same remark as PKG_SHORTDESC
+
+PKG_AUTORECONF="yes"
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="xbmc.python.script"  # matches the Python stub shipped as source/default.py
+PKG_ADDON_PROVIDES=""
+
+makeinstall_target() {
+  : # nop - deliberately installs nothing; addon() below copies the built binary
+}
+
+addon() {  # collect the built w_scan binary into the addon's bin directory
+  mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/bin  # create the destination (and any missing parents)
+    cp $PKG_BUILD/.$TARGET_NAME/w_scan $ADDON_BUILD/$PKG_ADDON_ID/bin  # binary is taken from the per-target build directory
+}
diff --git a/packages/addons/official/tools/w_scan/source/default.py b/packages/addons/official/tools/w_scan/source/default.py
new file mode 100644
index 00000000000..3b4290d2f95
--- /dev/null
+++ b/packages/addons/official/tools/w_scan/source/default.py
@@ -0,0 +1,21 @@
+################################################################################
+#      This file is part of OpenELEC - http://www.openelec.tv
+#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+#
+#  OpenELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  OpenELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+import xbmcgui  # Kodi GUI module; provides the Dialog class used below
+
+dialog = xbmcgui.Dialog()
+dialog.ok('', 'This is a console-only addon')  # OK dialog: first argument left empty, second is the notice shown when the addon is launched
diff --git a/packages/audio/alsa-lib/package.mk b/packages/audio/alsa-lib/package.mk
index dffcc1f7b17..9fede3d61e5 100644
--- a/packages/audio/alsa-lib/package.mk
+++ b/packages/audio/alsa-lib/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="alsa-lib"
-PKG_VERSION="1.1.3"
+PKG_VERSION="1.1.4.1"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/audio/alsa-utils/package.mk b/packages/audio/alsa-utils/package.mk
index e50cc4f9ddd..04d3bc79ef2 100644
--- a/packages/audio/alsa-utils/package.mk
+++ b/packages/audio/alsa-utils/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="alsa-utils"
-PKG_VERSION="1.1.3"
+PKG_VERSION="1.1.4"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/audio/libsndfile/package.mk b/packages/audio/libsndfile/package.mk
index 081586c2b22..364442af522 100644
--- a/packages/audio/libsndfile/package.mk
+++ b/packages/audio/libsndfile/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libsndfile"
-PKG_VERSION="1.0.27"
+PKG_VERSION="1.0.28"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="LGPL"
@@ -31,6 +31,7 @@ PKG_LONGDESC="libsndfile is a C library for reading and writing sound files such
 
 PKG_IS_ADDON="no"
 PKG_AUTORECONF="yes"
+PKG_USE_CMAKE="no"
 
 # package specific configure options
 PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared \
@@ -43,6 +44,10 @@ PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared \
                            --enable-largefile \
                            --with-gnu-ld"
 
+pre_configure_target() {  # build-system hook; by its name, runs before the target configure step
+  CFLAGS="$CFLAGS -fPIC"  # emit position-independent code; the library is built static-only (--enable-static --disable-shared) - presumably it gets linked into shared objects, TODO confirm
+}
+
 post_makeinstall_target() {
   rm -rf $INSTALL/usr/bin
 }
diff --git a/packages/audio/sbc/package.mk b/packages/audio/sbc/package.mk
index 8c0ba3e152a..e1a356ab22c 100644
--- a/packages/audio/sbc/package.mk
+++ b/packages/audio/sbc/package.mk
@@ -34,3 +34,7 @@ PKG_AUTORECONF="yes"
 
 PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared \
          --disable-tools --disable-tester"
+
+pre_configure_target() {  # build-system hook; by its name, runs before the target configure step
+  CFLAGS="$CFLAGS -fPIC"  # emit position-independent code; the library is built static-only (--enable-static --disable-shared) - presumably it gets linked into shared objects, TODO confirm
+}
diff --git a/packages/audio/soxr/package.mk b/packages/audio/soxr/package.mk
index a0bbb207c0f..3b6a37b88eb 100644
--- a/packages/audio/soxr/package.mk
+++ b/packages/audio/soxr/package.mk
@@ -37,3 +37,7 @@ PKG_CMAKE_OPTS_TARGET="-DHAVE_WORDS_BIGENDIAN_EXITCODE=1 \
                        -DBUILD_TESTS=0 \
                        -DBUILD_EXAMPLES=1 \
                        -DBUILD_SHARED_LIBS=OFF"
+
+pre_configure_target() {  # build-system hook; by its name, runs before the target configure step
+  CFLAGS="$CFLAGS -fPIC"  # emit position-independent code; the library is built static (-DBUILD_SHARED_LIBS=OFF) - presumably it gets linked into shared objects, TODO confirm
+}
diff --git a/packages/audio/speex/package.mk b/packages/audio/speex/package.mk
index 82d20a98377..fa10d1f5733 100644
--- a/packages/audio/speex/package.mk
+++ b/packages/audio/speex/package.mk
@@ -33,3 +33,7 @@ PKG_IS_ADDON="no"
 PKG_AUTORECONF="yes"
 
 PKG_CONFIGURE_OPTS_TARGET="--disable-shared --enable-static"
+
+pre_configure_target() {  # build-system hook; by its name, runs before the target configure step
+  CFLAGS="$CFLAGS -fPIC"  # emit position-independent code; the library is built static-only (--disable-shared --enable-static) - presumably it gets linked into shared objects, TODO confirm
+}
diff --git a/packages/databases/mariadb/package.mk b/packages/databases/mariadb/package.mk
index 973affe58fa..ff5ee52cb6d 100644
--- a/packages/databases/mariadb/package.mk
+++ b/packages/databases/mariadb/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="mariadb"
-PKG_VERSION="10.1.21"
+PKG_VERSION="10.1.23"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPLv2"
diff --git a/packages/databases/mariadb/patches/mariadb-X509_check.patch b/packages/databases/mariadb/patches/mariadb-X509_check.patch
new file mode 100644
index 00000000000..8b692d9d1a5
--- /dev/null
+++ b/packages/databases/mariadb/patches/mariadb-X509_check.patch
@@ -0,0 +1,18 @@
+Description: Gate X509_check_host() usage on a feature macro
+ Replace the OPENSSL_VERSION_NUMBER test with a check for
+ X509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT. "#if defined(...)" is used because
+ "#ifdef" takes a single identifier, so a trailing "&& ..." is not evaluated.
+ NOTE(review): OpenSSL 1.0.2 declares this macro in openssl/x509v3.h, which
+ this hunk includes only after the test - confirm an earlier include exposes it.
+diff -Naur mariadb-10.1.23/sql-common/client.c mariadb-10.1.23.patch/sql-common/client.c
+--- mariadb-10.1.23/sql-common/client.c	2017-05-02 07:13:42.000000000 +0200
++++ mariadb-10.1.23.patch/sql-common/client.c	2017-05-04 09:27:38.883412269 +0200
+@@ -1768,7 +1768,7 @@
+ 
+ #if defined(HAVE_OPENSSL)
+ 
+-#if OPENSSL_VERSION_NUMBER >= 0x10002000L && !defined(HAVE_YASSL)
++#if defined(X509_CHECK_FLAG_ALWAYS_CHECK_SUBJECT) && !defined(HAVE_YASSL)
+ #include <openssl/x509v3.h>
+ #define HAVE_X509_check_host
+ #endif
diff --git a/packages/databases/mariadb/patches/mariadb-c11_atomics.patch b/packages/databases/mariadb/patches/mariadb-c11_atomics.patch
new file mode 100644
index 00000000000..3df7215f04d
--- /dev/null
+++ b/packages/databases/mariadb/patches/mariadb-c11_atomics.patch
@@ -0,0 +1,121 @@
+Description: Fix mips missing atomics primitives
+ On mips we don't have native support for 64bit atomic operations. Make use
+ of libatomic to emulate them.
+Author: Vicențiu Ciorbaru <vicentiu@mariadb.org>
+
+--- a/configure.cmake
++++ b/configure.cmake
+@@ -128,7 +128,7 @@ IF(UNIX)
+   ENDIF()
+   FIND_PACKAGE(Threads)
+ 
+-  SET(CMAKE_REQUIRED_LIBRARIES 
++  LIST(APPEND CMAKE_REQUIRED_LIBRARIES
+     ${LIBM} ${LIBNSL} ${LIBBIND} ${LIBCRYPT} ${LIBSOCKET} ${LIBDL} ${CMAKE_THREAD_LIBS_INIT} ${LIBRT} ${LIBEXECINFO})
+   # Need explicit pthread for gcc -fsanitize=address
+   IF(CMAKE_USE_PTHREADS_INIT AND CMAKE_C_FLAGS MATCHES "-fsanitize=")
+@@ -1028,7 +1028,26 @@ ELSEIF(NOT WITH_ATOMIC_OPS)
+     long long int *ptr= &var;
+     return (int)__atomic_load_n(ptr, __ATOMIC_SEQ_CST);
+   }"
+-  HAVE_GCC_C11_ATOMICS)
++  HAVE_GCC_C11_ATOMICS_WITHOUT_LIBATOMIC)
++  IF(HAVE_GCC_C11_ATOMICS_WITHOUT_LIBATOMIC)
++    SET(HAVE_GCC_C11_ATOMICS True)
++  ELSE()
++    SET(OLD_CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
++    LIST(APPEND CMAKE_REQUIRED_LIBRARIES "atomic")
++    CHECK_CXX_SOURCE_COMPILES("
++    int main()
++    {
++      long long int var= 1;
++      long long int *ptr= &var;
++      return (int)__atomic_load_n(ptr, __ATOMIC_SEQ_CST);
++    }"
++    HAVE_GCC_C11_ATOMICS_WITH_LIBATOMIC)
++    IF(HAVE_GCC_C11_ATOMICS_WITH_LIBATOMIC)
++      SET(HAVE_GCC_C11_ATOMICS True)
++    ELSE()
++      SET(CMAKE_REQUIRED_LIBRARIES ${OLD_CMAKE_REQUIRED_LIBRARIES})
++    ENDIF()
++  ENDIF()
+ ELSE()
+   MESSAGE(FATAL_ERROR "${WITH_ATOMIC_OPS} is not a valid value for WITH_ATOMIC_OPS!")
+ ENDIF()
+--- a/include/atomic/gcc_builtins.h
++++ b/include/atomic/gcc_builtins.h
+@@ -16,6 +16,7 @@
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+ 
++#if defined (HAVE_GCC_ATOMIC_BUILTINS)
+ #define make_atomic_add_body(S)                     \
+   v= __sync_fetch_and_add(a, v);
+ #define make_atomic_fas_body(S)                     \
+@@ -26,6 +27,20 @@
+   sav= __sync_val_compare_and_swap(a, cmp_val, set);\
+   if (!(ret= (sav == cmp_val))) *cmp= sav
+ 
++#elif defined(HAVE_GCC_C11_ATOMICS)
++
++#define make_atomic_add_body(S)                     \
++  v= __atomic_fetch_add(a, v, __ATOMIC_SEQ_CST) /* old value, as __sync_fetch_and_add */
++#define make_atomic_fas_body(S)                     \
++  v= __atomic_exchange_n(a, v, __ATOMIC_SEQ_CST)
++#define make_atomic_cas_body(S)                     \
++  int ## S sav;                                     \
++  ret= __atomic_compare_exchange_n(a, cmp, set,     \
++                                   0,               \
++                                   __ATOMIC_SEQ_CST,\
++                                   __ATOMIC_SEQ_CST);
++#endif
++
+ #ifdef MY_ATOMIC_MODE_DUMMY
+ #define make_atomic_load_body(S)   ret= *a
+ #define make_atomic_store_body(S)  *a= v
+--- a/include/atomic/nolock.h
++++ b/include/atomic/nolock.h
+@@ -17,7 +17,7 @@
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+ 
+ #if defined(__i386__) || defined(_MSC_VER) || defined(__x86_64__)   \
+-    || defined(HAVE_GCC_ATOMIC_BUILTINS) \
++    || defined(HAVE_GCC_ATOMIC_BUILTINS) || defined(HAVE_GCC_C11_ATOMICS) \
+     || defined(HAVE_SOLARIS_ATOMIC)
+ 
+ #  ifdef MY_ATOMIC_MODE_DUMMY
+@@ -41,7 +41,7 @@
+ #  elif __GNUC__
+ #    if defined(HAVE_SOLARIS_ATOMIC)
+ #      include "solaris.h"
+-#    elif defined(HAVE_GCC_ATOMIC_BUILTINS)
++#    elif defined(HAVE_GCC_ATOMIC_BUILTINS) || defined(HAVE_GCC_C11_ATOMICS)
+ #      include "gcc_builtins.h"
+ #    elif defined(__i386__) || defined(__x86_64__)
+ #      include "x86-gcc.h"
+--- a/mysys/CMakeLists.txt
++++ b/mysys/CMakeLists.txt
+@@ -78,6 +78,10 @@ IF(HAVE_BFD_H)
+   TARGET_LINK_LIBRARIES(mysys bfd)  
+ ENDIF(HAVE_BFD_H)
+ 
++IF(HAVE_GCC_C11_ATOMICS_WITH_LIBATOMIC)
++  TARGET_LINK_LIBRARIES(mysys atomic)
++ENDIF()
++
+ IF (WIN32)
+   TARGET_LINK_LIBRARIES(mysys IPHLPAPI)  
+ ENDIF(WIN32)
+--- a/sql/CMakeLists.txt
++++ b/sql/CMakeLists.txt
+@@ -171,6 +171,10 @@ TARGET_LINK_LIBRARIES(sql ${MYSQLD_STATI
+   ${SSL_LIBRARIES}
+   ${LIBSYSTEMD})
+ 
++IF(HAVE_GCC_C11_ATOMICS_WITH_LIBATOMIC)
++  TARGET_LINK_LIBRARIES(sql atomic)
++ENDIF()
++
+ IF(WIN32)
+   SET(MYSQLD_SOURCE main.cc nt_servc.cc nt_servc.h message.rc)
+   TARGET_LINK_LIBRARIES(sql psapi)
diff --git a/packages/databases/mariadb/patches/mariadb-mips-connect-unaligned.patch b/packages/databases/mariadb/patches/mariadb-mips-connect-unaligned.patch
new file mode 100644
index 00000000000..a7873de949c
--- /dev/null
+++ b/packages/databases/mariadb/patches/mariadb-mips-connect-unaligned.patch
@@ -0,0 +1,251 @@
+Description: Handle unaligned buffers in connect's TYPBLK class
+ On MIPS platforms (and probably others) unaligned memory access results in a
+ bus error. In the connect storage engine, block data for some data formats is
+ stored packed in memory and the TYPBLK class is used to read values from it.
+ Since TYPBLK does not have special handling for this packed memory, it can
+ quite easily result in unaligned memory accesses.
+ .
+ The simple way to fix this is to perform all accesses to the main buffer
+ through memcpy. With GCC and optimizations turned on, this call to memcpy is
+ completely optimized away on architectures where unaligned accesses are ok
+ (like x86).
+Author: James Cowgill <jcowgill@debian.org>
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/storage/connect/valblk.h
++++ b/storage/connect/valblk.h
+@@ -139,6 +139,7 @@ class VALBLK : public BLOCK {
+   int     Prec;             // Precision of float values
+   }; // end of class VALBLK
+ 
++
+ /***********************************************************************/
+ /*  Class TYPBLK: represents a block of typed values.                  */
+ /***********************************************************************/
+@@ -151,40 +152,40 @@ class TYPBLK : public VALBLK {
+   // Implementation
+   virtual bool   Init(PGLOBAL g, bool check);
+   virtual int    GetVlen(void) {return sizeof(TYPE);}
+-  virtual char   GetTinyValue(int n) {return (char)Typp[n];}
+-  virtual uchar  GetUTinyValue(int n) {return (uchar)Typp[n];}
+-  virtual short  GetShortValue(int n) {return (short)Typp[n];}
+-  virtual ushort GetUShortValue(int n) {return (ushort)Typp[n];}
+-  virtual int    GetIntValue(int n) {return (int)Typp[n];}
+-  virtual uint   GetUIntValue(int n) {return (uint)Typp[n];}
+-  virtual longlong GetBigintValue(int n) {return (longlong)Typp[n];}
+-  virtual ulonglong GetUBigintValue(int n) {return (ulonglong)Typp[n];}
+-  virtual double GetFloatValue(int n) {return (double)Typp[n];}
++  virtual char   GetTinyValue(int n) {return (char)UnalignedRead(n);}
++  virtual uchar  GetUTinyValue(int n) {return (uchar)UnalignedRead(n);}
++  virtual short  GetShortValue(int n) {return (short)UnalignedRead(n);}
++  virtual ushort GetUShortValue(int n) {return (ushort)UnalignedRead(n);}
++  virtual int    GetIntValue(int n) {return (int)UnalignedRead(n);}
++  virtual uint   GetUIntValue(int n) {return (uint)UnalignedRead(n);}
++  virtual longlong GetBigintValue(int n) {return (longlong)UnalignedRead(n);}
++  virtual ulonglong GetUBigintValue(int n) {return (ulonglong)UnalignedRead(n);}
++  virtual double GetFloatValue(int n) {return (double)UnalignedRead(n);}
+   virtual char  *GetCharString(char *p, int n);
+-  virtual void   Reset(int n) {Typp[n] = 0;}
++  virtual void   Reset(int n) {UnalignedWrite(n, 0);}
+ 
+   // Methods
+   using VALBLK::SetValue;
+   virtual void   SetValue(PSZ sp, int n);
+   virtual void   SetValue(char *sp, uint len, int n);
+   virtual void   SetValue(short sval, int n)
+-                  {Typp[n] = (TYPE)sval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)sval); SetNull(n, false);}
+   virtual void   SetValue(ushort sval, int n)
+-                  {Typp[n] = (TYPE)sval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)sval); SetNull(n, false);}
+   virtual void   SetValue(int lval, int n)
+-                  {Typp[n] = (TYPE)lval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)lval); SetNull(n, false);}
+   virtual void   SetValue(uint lval, int n)
+-                  {Typp[n] = (TYPE)lval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)lval); SetNull(n, false);}
+   virtual void   SetValue(longlong lval, int n)
+-                  {Typp[n] = (TYPE)lval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)lval); SetNull(n, false);}
+   virtual void   SetValue(ulonglong lval, int n)
+-                  {Typp[n] = (TYPE)lval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)lval); SetNull(n, false);}
+   virtual void   SetValue(double fval, int n)
+-                  {Typp[n] = (TYPE)fval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)fval); SetNull(n, false);}
+   virtual void   SetValue(char cval, int n)
+-                  {Typp[n] = (TYPE)cval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)cval); SetNull(n, false);}
+   virtual void   SetValue(uchar cval, int n)
+-                  {Typp[n] = (TYPE)cval; SetNull(n, false);}
++                  {UnalignedWrite(n, (TYPE)cval); SetNull(n, false);}
+   virtual void   SetValue(PVAL valp, int n);
+   virtual void   SetValue(PVBLK pv, int n1, int n2);
+   virtual void   SetMin(PVAL valp, int n);
+@@ -206,6 +207,17 @@ class TYPBLK : public VALBLK {
+   // Members
+   TYPE* const &Typp;
+   const char  *Fmt;
++
++  // Unaligned access
++  TYPE UnalignedRead(int n) const {
++    TYPE result;
++    memcpy(&result, Typp + n, sizeof(TYPE));
++    return result;
++  }
++
++  void UnalignedWrite(int n, TYPE value) {
++    memcpy(Typp + n, &value, sizeof(TYPE));
++  }
+   }; // end of class TYPBLK
+ 
+ /***********************************************************************/
+--- a/storage/connect/valblk.cpp
++++ b/storage/connect/valblk.cpp
+@@ -265,14 +265,14 @@ bool TYPBLK<TYPE>::Init(PGLOBAL g, bool
+ template <class TYPE>
+ char *TYPBLK<TYPE>::GetCharString(char *p, int n)
+   {
+-  sprintf(p, Fmt, Typp[n]);
++  sprintf(p, Fmt, UnalignedRead(n));
+   return p;
+   } // end of GetCharString
+ 
+ template <>
+ char *TYPBLK<double>::GetCharString(char *p, int n)
+   {
+-  sprintf(p, Fmt, Prec, Typp[n]);
++  sprintf(p, Fmt, Prec, UnalignedRead(n));
+   return p;
+   } // end of GetCharString
+ 
+@@ -288,7 +288,7 @@ void TYPBLK<TYPE>::SetValue(PVAL valp, i
+   ChkTyp(valp);
+ 
+   if (!(b = valp->IsNull()))
+-    Typp[n] = GetTypedValue(valp);
++    UnalignedWrite(n, GetTypedValue(valp));
+   else
+     Reset(n);
+ 
+@@ -350,9 +350,9 @@ void TYPBLK<TYPE>::SetValue(PSZ p, int n
+   ulonglong val = CharToNumber(p, strlen(p), maxval, Unsigned, &minus); 
+     
+   if (minus && val < maxval)
+-    Typp[n] = (TYPE)(-(signed)val);
++    UnalignedWrite(n, (TYPE)(-(signed)val));
+   else
+-    Typp[n] = (TYPE)val;
++    UnalignedWrite(n, (TYPE)val);
+ 
+   SetNull(n, false);
+   } // end of SetValue
+@@ -395,7 +395,7 @@ void TYPBLK<double>::SetValue(PSZ p, int
+     longjmp(g->jumper[g->jump_level], Type);
+     } // endif Check
+ 
+-  Typp[n] = atof(p);
++  UnalignedWrite(n, atof(p));
+   SetNull(n, false);
+   } // end of SetValue
+ 
+@@ -427,7 +427,7 @@ void TYPBLK<TYPE>::SetValue(PVBLK pv, in
+   ChkTyp(pv);
+ 
+   if (!(b = pv->IsNull(n2) && Nullable))
+-    Typp[n1] = GetTypedValue(pv, n2);
++    UnalignedWrite(n1, GetTypedValue(pv, n2));
+   else
+     Reset(n1);
+ 
+@@ -478,10 +478,10 @@ void TYPBLK<TYPE>::SetMin(PVAL valp, int
+   {
+   CheckParms(valp, n)
+   TYPE  tval = GetTypedValue(valp);
+-  TYPE& tmin = Typp[n];
++  TYPE  tmin = UnalignedRead(n);
+ 
+   if (tval < tmin)
+-    tmin = tval;
++    UnalignedWrite(n, tval);
+ 
+   } // end of SetMin
+ 
+@@ -493,10 +493,10 @@ void TYPBLK<TYPE>::SetMax(PVAL valp, int
+   {
+   CheckParms(valp, n)
+   TYPE  tval = GetTypedValue(valp);
+-  TYPE& tmin = Typp[n];
++  TYPE  tmin = UnalignedRead(n);
+ 
+   if (tval > tmin)
+-    tmin = tval;
++    UnalignedWrite(n, tval);
+ 
+   } // end of SetMax
+ 
+@@ -522,7 +522,7 @@ void TYPBLK<TYPE>::SetValues(PVBLK pv, i
+ template <class TYPE>
+ void TYPBLK<TYPE>::Move(int i, int j)
+   {
+-  Typp[j] = Typp[i];
++  UnalignedWrite(j, UnalignedRead(i));
+   MoveNull(i, j);
+   } // end of Move
+ 
+@@ -536,7 +536,7 @@ int TYPBLK<TYPE>::CompVal(PVAL vp, int n
+   ChkIndx(n);
+   ChkTyp(vp);
+ #endif   // _DEBUG
+-  TYPE mlv = Typp[n];
++  TYPE mlv = UnalignedRead(n);
+   TYPE vlv = GetTypedValue(vp);
+ 
+   return (vlv > mlv) ? 1 : (vlv < mlv) ? (-1) : 0;
+@@ -548,8 +548,8 @@ int TYPBLK<TYPE>::CompVal(PVAL vp, int n
+ template <class TYPE>
+ int TYPBLK<TYPE>::CompVal(int i1, int i2)
+   {
+-  TYPE lv1 = Typp[i1];
+-  TYPE lv2 = Typp[i2];
++  TYPE lv1 = UnalignedRead(i1);
++  TYPE lv2 = UnalignedRead(i2);
+ 
+   return (lv1 > lv2) ? 1 : (lv1 < lv2) ? (-1) : 0;
+   } // end of CompVal
+@@ -586,7 +586,7 @@ int TYPBLK<TYPE>::Find(PVAL vp)
+   TYPE n = GetTypedValue(vp);
+ 
+   for (i = 0; i < Nval; i++)
+-    if (n == Typp[i])
++    if (n == UnalignedRead(i))
+       break;
+ 
+   return (i < Nval) ? i : (-1);
+@@ -602,7 +602,7 @@ int TYPBLK<TYPE>::GetMaxLength(void)
+   int i, n, m;
+ 
+   for (i = n = 0; i < Nval; i++) {
+-    m = sprintf(buf, Fmt, Typp[i]);
++    m = sprintf(buf, Fmt, UnalignedRead(i));
+     n = MY_MAX(n, m);
+     } // endfor i
+ 
+@@ -1332,7 +1332,7 @@ char *DATBLK::GetCharString(char *p, int
+   char *vp;
+ 
+   if (Dvalp) {
+-    Dvalp->SetValue(Typp[n]);
++    Dvalp->SetValue(UnalignedRead(n));
+     vp = Dvalp->GetCharString(p);
+   } else
+     vp = TYPBLK<int>::GetCharString(p, n);
+@@ -1348,7 +1348,7 @@ void DATBLK::SetValue(PSZ p, int n)
+   if (Dvalp) {
+     // Decode the string according to format
+     Dvalp->SetValue_psz(p);
+-    Typp[n] = Dvalp->GetIntValue();
++    UnalignedWrite(n, Dvalp->GetIntValue());
+   } else
+     TYPBLK<int>::SetValue(p, n);
+ 
diff --git a/packages/databases/mariadb/patches/mariadb-mips-groonga-atomic.patch b/packages/databases/mariadb/patches/mariadb-mips-groonga-atomic.patch
new file mode 100644
index 00000000000..acf80522ddb
--- /dev/null
+++ b/packages/databases/mariadb/patches/mariadb-mips-groonga-atomic.patch
@@ -0,0 +1,28 @@
+Description: Ensure groonga is built with libatomic
+ MIPS (and possibly other) platforms require linking against libatomic to
+ support 64-bit atomic integers. Groonga was failing to do so and all related
+ tests were failing with an atomics relocation error on MIPS.
+Author: James Cowgill <jcowgill@debian.org>
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/storage/mroonga/vendor/groonga/CMakeLists.txt
++++ b/storage/mroonga/vendor/groonga/CMakeLists.txt
+@@ -234,6 +234,8 @@ endmacro()
+ include(build/ac_macros/check_headers.m4)
+ include(build/ac_macros/check_functions.m4)
+ 
++ac_check_lib(atomic __atomic_store_8)
++
+ ac_check_symbols(fpclassify math.h)
+ ac_check_lib(m fpclassify)
+ 
+--- a/storage/mroonga/vendor/groonga/lib/CMakeLists.txt
++++ b/storage/mroonga/vendor/groonga/lib/CMakeLists.txt
+@@ -62,6 +62,7 @@ endif()
+ set_target_properties(libgroonga PROPERTIES OUTPUT_NAME "groonga")
+ 
+ set(GRN_ALL_LIBRARIES
++    ${ATOMIC_LIBS}
+     ${EXECINFO_LIBS}
+     ${RT_LIBS}
+     ${PTHREAD_LIBS}
diff --git a/packages/databases/mariadb/patches/mariadb-mips-machine.patch b/packages/databases/mariadb/patches/mariadb-mips-machine.patch
new file mode 100644
index 00000000000..ecd34a0eb9a
--- /dev/null
+++ b/packages/databases/mariadb/patches/mariadb-mips-machine.patch
@@ -0,0 +1,25 @@
+Description: Fix DEFAULT_MACHINE on mips
+ The DEFAULT_MACHINE constant is calculated from the CMAKE_SYSTEM_PROCESSOR
+ variable which contains the processor which built mariadb. Since most Debian
+ buildds run on 64-bit hardware even though they build 32-bit binaries,
+ DEFAULT_MACHINE previously contained "mips64" on 32-bit builds. This confuses
+ some mroonga tests which rely on DEFAULT_MACHINE to detect 64-bitness.
+ .
+ This patch fixes the value of DEFAULT_MACHINE so it always contains just "mips"
+ on 32-bit mips builds.
+Author: James Cowgill <jcowgill@debian.org>
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/cmake/package_name.cmake
++++ b/cmake/package_name.cmake
+@@ -34,6 +34,10 @@ IF(NOT VERSION)
+       SET(DEFAULT_MACHINE "mips")
+     ENDIF()
+ 
++    IF(NOT 64BIT AND CMAKE_SYSTEM_PROCESSOR MATCHES "^mips64")
++      SET(DEFAULT_MACHINE "mips")
++    ENDIF()
++
+     IF(CMAKE_SYSTEM_NAME MATCHES "Windows")
+       SET(NEED_DASH_BETWEEN_PLATFORM_AND_MACHINE 0)
+       SET(DEFAULT_PLATFORM "win")
diff --git a/packages/databases/mariadb/patches/mariadb-mips-unstable-tests.patch b/packages/databases/mariadb/patches/mariadb-mips-unstable-tests.patch
new file mode 100644
index 00000000000..1361f81cd55
--- /dev/null
+++ b/packages/databases/mariadb/patches/mariadb-mips-unstable-tests.patch
@@ -0,0 +1,60 @@
+Description: Remove various tests from unstable-tests which now pass on MIPS
+Author: James Cowgill <jcowgill@debian.org>
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/mysql-test/unstable-tests
++++ b/mysql-test/unstable-tests
+@@ -159,7 +159,6 @@
+ 
+ federated.federated_bug_35333    : Modified in 10.1.22
+ federated.federated_innodb       : MDEV-10617, MDEV-10417 - Wrong checksum, timeouts, fails on Mips
+-federated.federated_partition    : MDEV-10417 - Fails on Mips
+ federated.federated_transactions : MDEV-10617, MDEV-10417 - Wrong checksum, timeouts, fails on Mips
+ federated.federatedx             : MDEV-10617 - Wrong checksum, timeouts
+ 
+@@ -290,10 +289,9 @@
+ 
+ #----------------------------------------------------------------
+ 
+-multi_source.gtid        : MDEV-10417 - Fails on Mips
+ multi_source.info_logs   : MDEV-10042 - Wrong result, MDEV-12629 - Valgrind warnings
+-multi_source.multisource : MDEV-10417 - Fails on Mips
+ multi_source.reset_slave : MDEV-10690 - wrong result
++multi_source.gtid        : MDEV-10620 - Timeout in wait condition
+ multi_source.simple      : MDEV-4633 - Wrong slave status output
+ multi_source.status_vars : MDEV-4632 - failed while waiting for Slave_received_heartbeats
+ 
+@@ -361,12 +359,9 @@
+ #----------------------------------------------------------------
+ 
+ rpl.last_insert_id                    : MDEV-10625 - warnings in error log
+-rpl.rpl_auto_increment                : MDEV-10417 - Fails on Mips
+-rpl.rpl_auto_increment_bug45679       : MDEV-10417 - Fails on Mips
+ rpl.rpl_auto_increment_update_failure : MDEV-10625 - warnings in error log
+ rpl.rpl_binlog_index                  : MDEV-9501 - Warning: failed registering on master
+ rpl.rpl_checksum_cache                : MDEV-12173 - Unexpected error
+-rpl.rpl_ddl                           : MDEV-10417 - Fails on Mips
+ rpl.rpl_domain_id_filter_restart      : MDEV-10684 - Wrong result
+ rpl.rpl_gtid_basic                    : MDEV-10681 - server startup problem
+ rpl.rpl_gtid_crash                    : MDEV-9501 - Warning: failed registering on master
+@@ -374,19 +369,17 @@
+ rpl.rpl_gtid_stop_start               : MDEV-10629 - Crash on shutdown, MDEV-12629 - Valgrind warnings
+ rpl.rpl_gtid_until                    : MDEV-10625 - warnings in error log
+ rpl.rpl_heartbeat_basic               : Modified in 10.1.22
+-rpl.rpl_innodb_bug30888               : MDEV-10417 - Fails on Mips
+ rpl.rpl_insert                        : MDEV-9329 - Fails on Ubuntu/s390x
+ rpl.rpl_insert_delayed                : MDEV-9329 - Fails on Ubuntu/s390x
+ rpl.rpl_invoked_features              : MDEV-10417 - Fails on Mips
+ rpl.rpl_mariadb_slave_capability      : MDEV-11018 - sporadic wrong events in binlog
+-rpl.rpl_mdev6020                      : MDEV-10630, MDEV-10417 - Timeouts, fails on Mips
++rpl.rpl_mdev6020                      : MDEV-10630, MDEV-10417 - Timeouts
+ rpl.rpl_mdev6386                      : Modified in 10.1.22
+ rpl.rpl_mysql_upgrade                 : Modified in 10.1.23
+ rpl.rpl_parallel                      : MDEV-10653 - Timeouts
+ rpl.rpl_parallel_optimistic           : MDEV-10511 - timeout
+ rpl.rpl_parallel_retry                : MDEV-11119 - Server crash
+ rpl.rpl_parallel_temptable            : MDEV-10356 - Crash in close_thread_tables
+-rpl.rpl_partition_innodb              : MDEV-10417 - Fails on Mips
+ rpl.rpl_password_boundaries           : MDEV-11534 - Slave IO warnings
+ rpl.rpl_row_log_innodb                : MDEV-10688 - Wrong result
+ rpl.rpl_row_sp001                     : MDEV-9329 - Fails on Ubuntu/s390x
diff --git a/packages/databases/sqlite/package.mk b/packages/databases/sqlite/package.mk
index bf7eb6c8dba..5483f80e454 100644
--- a/packages/databases/sqlite/package.mk
+++ b/packages/databases/sqlite/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="sqlite"
-PKG_VERSION="autoconf-3180000"
+PKG_VERSION="autoconf-3190200"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="PublicDomain"
@@ -43,6 +43,9 @@ PKG_AUTORECONF="yes"
 # data to help it make better index choices.
   CFLAGS="$CFLAGS -DSQLITE_ENABLE_STAT3"
 
+# build with -fPIC; otherwise linking fails on MIPS with: relocation R_MIPS_HI16 against `a local symbol' can not be used when making a shared object; recompile with -fPIC
+  CFLAGS="$CFLAGS -fPIC"
+
 # When this C-preprocessor macro is defined, SQLite includes some additional APIs
 # that provide convenient access to meta-data about tables and queries. The APIs that
 # are enabled by this option are:
diff --git a/packages/devel/autoconf-archive/package.mk b/packages/devel/autoconf-archive/package.mk
index ed5bcbcbce6..245694f4b3a 100644
--- a/packages/devel/autoconf-archive/package.mk
+++ b/packages/devel/autoconf-archive/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="autoconf-archive"
-PKG_VERSION="2016.09.16"
+PKG_VERSION="2017.03.21"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/devel/boost/package.mk b/packages/devel/boost/package.mk
deleted file mode 100644
index 5cc8ce6e636..00000000000
--- a/packages/devel/boost/package.mk
+++ /dev/null
@@ -1,78 +0,0 @@
-################################################################################
-#      This file is part of OpenELEC - http://www.openelec.tv
-#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
-#
-#  OpenELEC is free software: you can redistribute it and/or modify
-#  it under the terms of the GNU General Public License as published by
-#  the Free Software Foundation, either version 2 of the License, or
-#  (at your option) any later version.
-#
-#  OpenELEC is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License for more details.
-#
-#  You should have received a copy of the GNU General Public License
-#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
-################################################################################
-
-PKG_NAME="boost"
-PKG_VERSION="1_61_0"
-PKG_REV="1"
-PKG_ARCH="any"
-PKG_LICENSE="OSS"
-PKG_SITE="http://www.boost.org/"
-PKG_URL="$SOURCEFORGE_SRC/boost/boost/1.61.0/${PKG_NAME}_${PKG_VERSION}.tar.bz2"
-PKG_SOURCE_DIR="${PKG_NAME}_${PKG_VERSION}"
-PKG_DEPENDS_HOST=""
-PKG_DEPENDS_TARGET="toolchain boost:host Python:host zlib bzip2"
-PKG_PRIORITY="optional"
-PKG_SECTION="devel"
-PKG_SHORTDESC="boost: Peer-reviewed STL style libraries for C++"
-PKG_LONGDESC="Boost provides free peer-reviewed portable C++ source libraries. The emphasis is on libraries which work well with the C++ Standard Library. One goal is to establish existing practice and provide reference implementations so that the Boost libraries are suitable for eventual standardization. Some of the libraries have already been proposed for inclusion in the C++ Standards Committee's upcoming C++ Standard Library Technical Report."
-
-PKG_IS_ADDON="no"
-PKG_AUTORECONF="no"
-
-make_host() {
-  cd tools/build/src/engine
-    sh build.sh
-}
-
-makeinstall_host() {
-  mkdir -p $ROOT/$TOOLCHAIN/bin
-    cp bin.*/bjam $ROOT/$TOOLCHAIN/bin
-}
-
-pre_configure_target() {
-  export CFLAGS="$CFLAGS -fPIC"
-  export CXXFLAGS="$CXXFLAGS -fPIC"
-  export LDFLAGS="$LDFLAGS -fPIC"
-}
-
-configure_target() {
-  sh bootstrap.sh --prefix=/usr \
-                  --with-bjam=$ROOT/$TOOLCHAIN/bin/bjam \
-                  --with-python=$ROOT/$TOOLCHAIN/bin/python \
-
-  echo "using gcc : `$CC -v 2>&1  | tail -n 1 |awk '{print $3}'` : $CC  : <compileflags>\"$CFLAGS\" <linkflags>\"$LDFLAGS\" ;" \
-    > tools/build/src/user-config.jam
-}
-
-make_target() {
-  : # nothing todo, we use makeinstall_target()
-}
-
-makeinstall_target() {
-  $ROOT/$TOOLCHAIN/bin/bjam -d2 --toolset=gcc link=static \
-                                --prefix=$SYSROOT_PREFIX/usr \
-                                --ignore-site-config \
-                                --layout=system \
-                                --with-thread \
-                                --with-iostreams \
-                                --with-system \
-                                --with-serialization \
-                                --with-filesystem \
-                                --with-regex -sICU_PATH="$SYSROOT_PREFIX/usr" \
-                                install
-}
diff --git a/packages/devel/cmake/package.mk b/packages/devel/cmake/package.mk
index 6e2ba8242b9..6d6f9b51de6 100644
--- a/packages/devel/cmake/package.mk
+++ b/packages/devel/cmake/package.mk
@@ -17,12 +17,12 @@
 ################################################################################
 
 PKG_NAME="cmake"
-PKG_VERSION="3.7.2"
+PKG_VERSION="3.8.2"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="BSD"
 PKG_SITE="http://www.cmake.org/"
-PKG_URL="http://www.cmake.org/files/v3.7/$PKG_NAME-$PKG_VERSION.tar.gz"
+PKG_URL="http://www.cmake.org/files/v3.8/$PKG_NAME-$PKG_VERSION.tar.gz"
 PKG_DEPENDS_HOST="ccache:host libressl:host"
 PKG_PRIORITY="optional"
 PKG_SECTION="toolchain/devel"
diff --git a/packages/devel/flex/package.mk b/packages/devel/flex/package.mk
index d8e0d2c5327..84093df9c95 100644
--- a/packages/devel/flex/package.mk
+++ b/packages/devel/flex/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="flex"
-PKG_VERSION="2.6.3"
+PKG_VERSION="2.6.4"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/devel/lcms2/package.mk b/packages/devel/lcms2/package.mk
index 391db90c951..a036cfc1458 100644
--- a/packages/devel/lcms2/package.mk
+++ b/packages/devel/lcms2/package.mk
@@ -35,6 +35,10 @@ PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared \
                            --with-zlib --with-threads \
                            --without-jpeg --without-tiff"
 
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC" # position-independent objects for the static-only build (--enable-static --disable-shared); NOTE(review): presumably so the archive can be linked into shared objects - confirm
+}
+
 post_makeinstall_target() {
   rm -rf $INSTALL/usr/bin
 }
diff --git a/packages/devel/libcec/package.mk b/packages/devel/libcec/package.mk
index d76561c96a9..61ba198e0d1 100644
--- a/packages/devel/libcec/package.mk
+++ b/packages/devel/libcec/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libcec"
-PKG_VERSION="0a97062"
+PKG_VERSION="3953f8d"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/devel/libcec/patches/libcec-00-imx6-support.patch b/packages/devel/libcec/patches/libcec-00-imx6-support.patch
index 7c56fb15560..802cecf38de 100644
--- a/packages/devel/libcec/patches/libcec-00-imx6-support.patch
+++ b/packages/devel/libcec/patches/libcec-00-imx6-support.patch
@@ -446,7 +446,7 @@ diff -Naur libcec-5388d3a/src/libcec/adapter/IMX/IMXCECAdapterCommunication.cpp
 +}
 +
 +
-+cec_logical_addresses CIMXCECAdapterCommunication::GetLogicalAddresses(void)
++cec_logical_addresses CIMXCECAdapterCommunication::GetLogicalAddresses(void) const
 +{
 +  cec_logical_addresses addresses;
 +  addresses.Clear();
@@ -635,7 +635,7 @@ diff -Naur libcec-5388d3a/src/libcec/adapter/IMX/IMXCECAdapterCommunication.h li
 +    bool SetLineTimeout(uint8_t UNUSED(iTimeout)) { return true; }
 +    bool StartBootloader(void) { return false; }
 +    bool SetLogicalAddresses(const cec_logical_addresses &addresses);
-+    cec_logical_addresses GetLogicalAddresses(void);
++    cec_logical_addresses GetLogicalAddresses(void) const;
 +    bool PingAdapter(void) { return IsInitialised(); }
 +    uint16_t GetFirmwareVersion(void);
 +    uint32_t GetFirmwareBuildDate(void) { return 0; }
@@ -669,7 +669,7 @@ diff -Naur libcec-5388d3a/src/libcec/adapter/IMX/IMXCECAdapterCommunication.h li
 +    //cec_logical_addresses       m_logicalAddresses;
 +    cec_logical_address         m_logicalAddress;
 +
-+    P8PLATFORM::CMutex            m_mutex;
++    mutable P8PLATFORM::CMutex  m_mutex;
 +    P8PLATFORM::CCDevSocket       *m_dev;	/**< the device connection */
 +    bool                        m_bLogicalAddressRegistered;
 +    bool                        m_bInitialised;
diff --git a/packages/devel/libugpio/package.mk b/packages/devel/libugpio/package.mk
index 01c0ac3e73c..a13899b0183 100644
--- a/packages/devel/libugpio/package.mk
+++ b/packages/devel/libugpio/package.mk
@@ -32,3 +32,7 @@ PKG_IS_ADDON="no"
 PKG_AUTORECONF="yes"
 
 PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared"
+
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC" # position-independent objects for the static-only build (--enable-static --disable-shared); NOTE(review): presumably so the archive can be linked into shared objects - confirm
+}
diff --git a/packages/devel/netbsd-curses/package.mk b/packages/devel/netbsd-curses/package.mk
index afe10d21b9e..e3d30cc4b3f 100644
--- a/packages/devel/netbsd-curses/package.mk
+++ b/packages/devel/netbsd-curses/package.mk
@@ -36,6 +36,10 @@ PKG_AUTORECONF="no"
   export CFLAGS=`echo $CFLAGS | sed -e "s|-D_FORTIFY_SOURCE=.||g"`
   export LDFLAGS=`echo $LDFLAGS | sed -e "s|-D_FORTIFY_SOURCE=.||g"`
 
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC" # make_target forwards $CFLAGS to the all-static make; NOTE(review): presumably needed so the static libs can be linked into shared objects - confirm
+}
+
 make_target() {
   make HOSTCC="$HOST_CC" CFLAGS="$CFLAGS -D_GNU_SOURCE" PREFIX=/usr all-static
 }
diff --git a/packages/devel/newt/package.mk b/packages/devel/newt/package.mk
index 23592f19af4..3223a91b0ca 100644
--- a/packages/devel/newt/package.mk
+++ b/packages/devel/newt/package.mk
@@ -18,7 +18,7 @@
 ################################################################################
 
 PKG_NAME="newt"
-PKG_VERSION="0.52.19"
+PKG_VERSION="0.52.20"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="LGPL"
@@ -33,7 +33,7 @@ PKG_LONGDESC="Newt is a programming library for color text mode, widget based us
 PKG_IS_ADDON="no"
 PKG_AUTORECONF="yes"
 
-PKG_CONFIGURE_OPTS_TARGET="--without-python --without-tcl"
+PKG_CONFIGURE_OPTS_TARGET="--disable-nls --without-python --without-tcl"
 
 pre_configure_target() {
  # newt fails to build in subdirs
diff --git a/packages/devel/pkg-config/package.mk b/packages/devel/pkg-config/package.mk
index 29099c75549..0c357809273 100644
--- a/packages/devel/pkg-config/package.mk
+++ b/packages/devel/pkg-config/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pkg-config"
-PKG_VERSION="0.29.1"
+PKG_VERSION="0.29.2"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/devel/popt/package.mk b/packages/devel/popt/package.mk
index f3b0c0253bd..4e814914f1d 100644
--- a/packages/devel/popt/package.mk
+++ b/packages/devel/popt/package.mk
@@ -33,3 +33,7 @@ PKG_IS_ADDON="no"
 PKG_AUTORECONF="no"
 
 PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared"
+
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC" # position-independent objects for the static-only build (--enable-static --disable-shared); NOTE(review): presumably so the archive can be linked into shared objects - confirm
+}
diff --git a/packages/devel/readline/package.mk b/packages/devel/readline/package.mk
index d298c9deb36..512072813e9 100644
--- a/packages/devel/readline/package.mk
+++ b/packages/devel/readline/package.mk
@@ -38,6 +38,10 @@ PKG_CONFIGURE_OPTS_TARGET="bash_cv_wcwidth_broken=no \
                            --with-curses \
                            --without-purify"
 
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC" # emit position-independent code; NOTE(review): rationale not recorded here - presumably needed when linking into shared objects, confirm
+}
+
 post_makeinstall_target() {
   rm -rf $INSTALL/usr/share/readline
 }
diff --git a/packages/devel/swig/package.mk b/packages/devel/swig/package.mk
index 507a7b01f04..7d8930f748e 100644
--- a/packages/devel/swig/package.mk
+++ b/packages/devel/swig/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="swig"
-PKG_VERSION="3.0.10"
+PKG_VERSION="3.0.12"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/graphics/libdrm/package.mk b/packages/graphics/libdrm/package.mk
index 9a2ec9cbc4f..8cc3eb9637b 100644
--- a/packages/graphics/libdrm/package.mk
+++ b/packages/graphics/libdrm/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libdrm"
-PKG_VERSION="2.4.76"
+PKG_VERSION="2.4.81"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/graphics/libepoxy/package.mk b/packages/graphics/libepoxy/package.mk
index b2e799b9a34..4e597326fe9 100644
--- a/packages/graphics/libepoxy/package.mk
+++ b/packages/graphics/libepoxy/package.mk
@@ -22,7 +22,7 @@
 # in Xorg.log
 
 PKG_NAME="libepoxy"
-PKG_VERSION="1.4.1"
+PKG_VERSION="1.4.2"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/graphics/libjpeg-turbo/patches/libjpeg-turbo-mips.patch b/packages/graphics/libjpeg-turbo/patches/libjpeg-turbo-mips.patch
new file mode 100644
index 00000000000..17ffecdc7e3
--- /dev/null
+++ b/packages/graphics/libjpeg-turbo/patches/libjpeg-turbo-mips.patch
@@ -0,0 +1,21 @@
+From: =?utf-8?q?Ond=C5=99ej_Sur=C3=BD?= <ondrej@sury.org>
+Date: Thu, 20 Oct 2016 15:51:10 +0200
+Subject: Declare env on MIPS on first use (Courtesy of Aurelien Jarno)
+
+---
+ simd/jsimd_mips.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
+index 63b8115..a371a32 100644
+--- a/simd/jsimd_mips.c
++++ b/simd/jsimd_mips.c
+@@ -79,7 +79,7 @@ init_simd (void)
+ #endif
+ 
+   /* Force different settings through environment variables */
+-  env = getenv("JSIMD_FORCEDSPR2");
++  char *env = getenv("JSIMD_FORCEDSPR2");
+   if ((env != NULL) && (strcmp(env, "1") == 0))
+     simd_support = JSIMD_MIPS_DSPR2;
+   env = getenv("JSIMD_FORCENONE");
diff --git a/packages/graphics/libpng/package.mk b/packages/graphics/libpng/package.mk
index 398680034ee..e399168a661 100644
--- a/packages/graphics/libpng/package.mk
+++ b/packages/graphics/libpng/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libpng"
-PKG_VERSION="1.6.28"
+PKG_VERSION="1.6.29"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="OSS"
@@ -36,6 +36,12 @@ PKG_AUTORECONF="no"
 PKG_CMAKE_OPTS_HOST="-DPNG_SHARED=OFF -DPNG_STATIC=ON -DPNG_TESTS=OFF -DCMAKE_SYSTEM_PROCESSOR=$(uname -m)"
 PKG_CMAKE_OPTS_TARGET="-DPNG_SHARED=OFF -DPNG_STATIC=ON -DPNG_TESTS=OFF -DCMAKE_SYSTEM_PROCESSOR=$TARGET_ARCH"
 
+pre_configure_target() {
+  if [ "$TARGET_ARCH" = "x86_64" ]; then # only x86_64 targets get the extra define
+    CFLAGS+=" -DPNG_INTEL_SSE" # NOTE(review): presumably switches on libpng's Intel SSE code path - confirm against the libpng sources
+  fi
+}
+
 post_makeinstall_target() {
   sed -e "s:\([\"'= ]\)/usr:\\1$SYSROOT_PREFIX/usr:g" \
       -e "s:libs=\"-lpng16\":libs=\"-lpng16 -lz\":g" \
diff --git a/packages/graphics/mesa/package.mk b/packages/graphics/mesa/package.mk
index 25fb52949b9..cd335c80834 100644
--- a/packages/graphics/mesa/package.mk
+++ b/packages/graphics/mesa/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="mesa"
-PKG_VERSION="17.0.3"
+PKG_VERSION="17.0.7"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="OSS"
diff --git a/packages/lang/gcc/package.mk b/packages/lang/gcc/package.mk
index 10c3c132a38..ab5b157020d 100644
--- a/packages/lang/gcc/package.mk
+++ b/packages/lang/gcc/package.mk
@@ -34,6 +34,13 @@ PKG_LONGDESC="This package contains the GNU Compiler Collection. It includes com
 PKG_IS_ADDON="no"
 PKG_AUTORECONF="no"
 
+# Some targets need gcc/libatomic; they opt in by setting GCC_LIBATOMIC_SUPPORT="yes":
+if [ "$GCC_LIBATOMIC_SUPPORT" = "yes" ]; then
+  GCC_LIBATOMIC="--enable-libatomic"
+else # unset or any other value: libatomic stays disabled
+  GCC_LIBATOMIC="--disable-libatomic"
+fi
+
 GCC_COMMON_CONFIGURE_OPTS="--target=$TARGET_NAME \
                            --with-sysroot=$SYSROOT_PREFIX \
                            --with-gmp=$ROOT/$TOOLCHAIN \
@@ -53,13 +60,13 @@ GCC_COMMON_CONFIGURE_OPTS="--target=$TARGET_NAME \
                            --without-cloog \
                            --disable-libada \
                            --disable-libmudflap \
-                           --disable-libatomic \
                            --disable-libitm \
                            --disable-libquadmath \
                            --disable-libgomp \
                            --disable-libmpx"
 
 PKG_CONFIGURE_OPTS_BOOTSTRAP="$GCC_COMMON_CONFIGURE_OPTS \
+                              --disable-libatomic \
                               --enable-languages=c \
                               --disable-__cxa_atexit \
                               --disable-libsanitizer \
@@ -73,6 +80,7 @@ PKG_CONFIGURE_OPTS_BOOTSTRAP="$GCC_COMMON_CONFIGURE_OPTS \
                               $GCC_OPTS"
 
 PKG_CONFIGURE_OPTS_HOST="$GCC_COMMON_CONFIGURE_OPTS \
+                         $GCC_LIBATOMIC \
                          --enable-languages=c,c++ \
                          --enable-__cxa_atexit \
                          --enable-decimal-float \
diff --git a/packages/linux-drivers/dvbhdhomerun/package.mk b/packages/linux-drivers/dvbhdhomerun/package.mk
deleted file mode 100644
index e1a9d94d01a..00000000000
--- a/packages/linux-drivers/dvbhdhomerun/package.mk
+++ /dev/null
@@ -1,66 +0,0 @@
-################################################################################
-#      This file is part of OpenELEC - http://www.openelec.tv
-#      Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
-#
-#  OpenELEC is free software: you can redistribute it and/or modify
-#  it under the terms of the GNU General Public License as published by
-#  the Free Software Foundation, either version 2 of the License, or
-#  (at your option) any later version.
-#
-#  OpenELEC is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License for more details.
-#
-#  You should have received a copy of the GNU General Public License
-#  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
-################################################################################
-
-PKG_NAME="dvbhdhomerun"
-PKG_VERSION="20130704"
-PKG_REV="1"
-PKG_ARCH="any"
-PKG_LICENSE="GPL"
-PKG_SITE="http://sourceforge.net/projects/dvbhdhomerun/"
-PKG_URL="${DISTRO_SRC}/${PKG_NAME}-${PKG_VERSION}.tar.xz"
-#PKG_URL="$SOURCEFORGE_SRC/project/dvbhdhomerun/${PKG_NAME}_${PKG_VERSION}.tar.gz"
-#PKG_SOURCE_DIR="${PKG_NAME}_${PKG_VERSION}"
-PKG_DEPENDS_TARGET="toolchain linux libhdhomerun"
-PKG_NEED_UNPACK="$LINUX_DEPENDS"
-PKG_PRIORITY="optional"
-PKG_SECTION="driver/dvb"
-PKG_SHORTDESC="A linux DVB driver for the HDHomeRun TV tuner (http://www.silicondust.com)."
-PKG_LONGDESC="A linux DVB driver for the HDHomeRun TV tuner (http://www.silicondust.com)."
-
-PKG_IS_ADDON="no"
-PKG_AUTORECONF="no"
-
-PKG_CMAKE_SCRIPT_TARGET="userhdhomerun/CMakeLists.txt"
-
-pre_make_target() {
-  ( cd ../kernel
-    LDFLAGS="" make dvb_hdhomerun KERNEL_DIR=$(get_pkg_build linux)
-    fix_module_depends dvb_hdhomerun_core.ko "dvb_core"
-  )
-}
-
-pre_configure_target() {
-
-# use it here to be sure libhdhomerun is already built
-  PKG_CMAKE_OPTS_TARGET="-DLIBHDHOMERUN_PATH=$(ls -d $ROOT/$BUILD/libhdhomerun-*/)"
-
-# absolute path
-  LIBHDHOMERUN_PATH=$(ls -d $ROOT/$BUILD/libhdhomerun-*/)
-  sed -i "s|SET(LIBHDHOMERUN_PATH .*)|SET(LIBHDHOMERUN_PATH $LIBHDHOMERUN_PATH)|g" ../userhdhomerun/CMakeLists.txt
-  sed -i "s|/etc/dvbhdhomerun|/tmp/dvbhdhomerun|g" ../userhdhomerun/hdhomerun_tuner.cpp
-  sed -i "s|/etc/dvbhdhomerun|/tmp/dvbhdhomerun|g" ../userhdhomerun/hdhomerun_controller.cpp
-}
-
-makeinstall_target() {
-  cd $ROOT/$PKG_BUILD
-    mkdir -p $INSTALL/usr/lib/modules/$(get_module_dir)/hdhomerun
-      cp kernel/*.ko $INSTALL/usr/lib/modules/$(get_module_dir)/hdhomerun/
-
-    mkdir -p $INSTALL/usr/bin
-      cp -PR .$TARGET_NAME/userhdhomerun $INSTALL/usr/bin
-}
diff --git a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-001-conf_file.patch b/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-001-conf_file.patch
deleted file mode 100644
index d93002d43fe..00000000000
--- a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-001-conf_file.patch
+++ /dev/null
@@ -1,52 +0,0 @@
-diff -uNr dvbhdhomerun-0.0.15-orig/userhdhomerun/conf_inifile.cpp dvbhdhomerun-0.0.15/userhdhomerun/conf_inifile.cpp
---- dvbhdhomerun-0.0.15-orig/userhdhomerun/conf_inifile.cpp	2013-02-17 22:37:34.000000000 +0100
-+++ dvbhdhomerun-0.0.15/userhdhomerun/conf_inifile.cpp	2013-03-02 10:23:46.000000000 +0100
-@@ -8,6 +8,38 @@
- 
- using namespace std;
- 
-+// http://stackoverflow.com/questions/6089231/getting-std-ifstream-to-handle-lf-cr-and-crlf
-+std::istream& safeGetline(std::istream& is, std::string& t)
-+{
-+    t.clear();
-+
-+    // The characters in the stream are read one-by-one using a std::streambuf.
-+    // That is faster than reading them one-by-one using the std::istream.
-+    // Code that uses streambuf this way must be guarded by a sentry object.
-+    // The sentry object performs various tasks,
-+    // such as thread synchronization and updating the stream state.
-+
-+    std::istream::sentry se(is);
-+    std::streambuf* sb = is.rdbuf();
-+
-+    for(;;) {
-+        int c = sb->sbumpc();
-+        switch (c) {
-+        case '\r':
-+            c = sb->sgetc();
-+            if(c == '\n')
-+                sb->sbumpc();
-+            return is;
-+        case '\n':
-+        case EOF:
-+            return is;
-+        default:
-+            t += (char)c;
-+        }
-+    }
-+}
-+
-+
- bool ConfIniFile::OpenIniFile(const string& _filename)
- {
-    m_sectionKeyValue.clear();
-@@ -17,7 +49,8 @@
-    if(conffile.is_open()) {
-       string line;
-       string section;
--      while(getline(conffile, line)) {
-+      //while(getline(conffile, line)) {
-+      while(safeGetline(conffile, line)) {
-          if(line.empty()) {
-             //LOG() << " ignore, empty";
-          } 
diff --git a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-002-auto_detect.patch b/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-002-auto_detect.patch
deleted file mode 100644
index 9a926b9960c..00000000000
--- a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-002-auto_detect.patch
+++ /dev/null
@@ -1,34 +0,0 @@
-diff -uNr dvbhdhomerun-0.0.15-orig/userhdhomerun/hdhomerun_tuner.cpp dvbhdhomerun-0.0.15/userhdhomerun/hdhomerun_tuner.cpp
---- dvbhdhomerun-0.0.15-orig/userhdhomerun/hdhomerun_tuner.cpp	2013-02-17 22:37:34.000000000 +0100
-+++ dvbhdhomerun-0.0.15/userhdhomerun/hdhomerun_tuner.cpp	2013-03-02 10:25:15.000000000 +0100
-@@ -97,12 +97,28 @@
-          string type(tmp);
-          LOG() << "Type of device: " << type << endl;
-          if(type == "hdhomerun_dvbt") {
--            LOG() << "Notice, setting to DVB-C!! Use /etc/dvbhdhomerun to change that." << endl;
--            m_type = HdhomerunTuner::DVBC;
-+            m_type = HdhomerunTuner::DVBT;
-          }
-          else if(type == "hdhomerun_atsc") {
-             m_type = HdhomerunTuner::ATSC;
-          }
-+         else if(type == "hdhomerun3_dvbt") {
-+            m_type = HdhomerunTuner::DVBT;
-+         }        
-+         else if(type.find("dvbt") != string::npos) {
-+            m_type = HdhomerunTuner::DVBT;
-+         }
-+         else if(type.find("dvbc") != string::npos) {
-+            m_type = HdhomerunTuner::DVBC;
-+         }
-+         else if(type.find("atsc") != string::npos) {
-+            m_type = HdhomerunTuner::ATSC;
-+         }
-+
-+         if (m_type != HdhomerunTuner::NOT_SET) {
-+            LOG() << "Auto detecting tuner type set to \"" << type 
-+                  << "\" based on auto detecting" << endl;
-+         }
-       }
-       else {
-          ERR() << "get_model_str from HDHomeRun failed!" << endl;
diff --git a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-hdhomerun_discover_find_devices_custom_v2.patch b/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-hdhomerun_discover_find_devices_custom_v2.patch
deleted file mode 100644
index 4ecdee32539..00000000000
--- a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-hdhomerun_discover_find_devices_custom_v2.patch
+++ /dev/null
@@ -1,12 +0,0 @@
-diff -Naur dvbhdhomerun-20130704/userhdhomerun/hdhomerun_controller.cpp dvbhdhomerun-20130704.patch/userhdhomerun/hdhomerun_controller.cpp
---- dvbhdhomerun-20130704/userhdhomerun/hdhomerun_controller.cpp	2013-02-17 22:37:34.000000000 +0100
-+++ dvbhdhomerun-20130704.patch/userhdhomerun/hdhomerun_controller.cpp	2015-09-23 14:13:00.659818567 +0200
-@@ -71,7 +71,7 @@
-   // ...really fragile API design...
-   memset(devices, 0, sizeof(devices));
- 
--  int numOfDevices = hdhomerun_discover_find_devices_custom(0, HDHOMERUN_DEVICE_TYPE_TUNER, HDHOMERUN_DEVICE_ID_WILDCARD, devices, m_maxDevices);
-+  int numOfDevices = hdhomerun_discover_find_devices_custom_v2(0, HDHOMERUN_DEVICE_TYPE_TUNER, HDHOMERUN_DEVICE_ID_WILDCARD, devices, m_maxDevices);
-   LOG() << "Num of devices = " << numOfDevices << endl;
- 
-   if(numOfDevices == 0) {
diff --git a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-linux-3.8.4.patch b/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-linux-3.8.4.patch
deleted file mode 100644
index ccde5f0e40d..00000000000
--- a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-linux-3.8.4.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/kernel/dvb_hdhomerun_init.c b/kernel/dvb_hdhomerun_init.c
-index d02a322..bd97d5e 100644
---- a/kernel/dvb_hdhomerun_init.c
-+++ b/kernel/dvb_hdhomerun_init.c
-@@ -143,7 +143,7 @@ static int dvb_hdhomerun_stop_feed(struct dvb_demux_feed *feed)
- 	return ret;
- }
- 
--static int __devinit dvb_hdhomerun_register(struct dvb_hdhomerun *hdhomerun)
-+static int  dvb_hdhomerun_register(struct dvb_hdhomerun *hdhomerun)
- {
- 	struct dvb_adapter *dvb_adapter;
- 	struct dvb_demux *dvbdemux;
-@@ -284,7 +284,7 @@ static void dvb_hdhomerun_unregister(struct dvb_hdhomerun *hdhomerun)
- }
- 
- 
--static int __devinit dvb_hdhomerun_probe(struct platform_device *plat_dev)
-+static int dvb_hdhomerun_probe(struct platform_device *plat_dev)
- {
- 	int ret;
- 	struct dvb_hdhomerun *hdhomerun;
diff --git a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-linux-4.2.patch b/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-linux-4.2.patch
deleted file mode 100644
index d0ecedc5432..00000000000
--- a/packages/linux-drivers/dvbhdhomerun/patches/dvbhdhomerun-linux-4.2.patch
+++ /dev/null
@@ -1,73 +0,0 @@
-From 198aef39dee0357524c88ecc0665312c2c72a0d9 Mon Sep 17 00:00:00 2001
-From: HappyHeyoka <hh.kde.crash@gmail.com>
-Date: Sat, 25 Jul 2015 19:47:50 +1000
-Subject: [PATCH] Track changes to kernel include 'dvb_frontend.h' where
- fe_status_t has gone in line with kernel coding style. Changes should be
- backwards compatible (famous last words)
-
----
- kernel/dvb_hdhomerun_control_messages.h | 2 +-
- kernel/dvb_hdhomerun_fe.c               | 6 +++---
- userhdhomerun/hdhomerun_control.cpp     | 4 ++--
- 3 files changed, 6 insertions(+), 6 deletions(-)
-
-diff --git a/kernel/dvb_hdhomerun_control_messages.h b/kernel/dvb_hdhomerun_control_messages.h
-index 45230b1..c22fc80 100644
---- a/kernel/dvb_hdhomerun_control_messages.h
-+++ b/kernel/dvb_hdhomerun_control_messages.h
-@@ -60,7 +60,7 @@ struct dvbhdhomerun_control_mesg {
- 	unsigned int type;
- 	union {
- 		unsigned int frequency;
--		fe_status_t fe_status;
-+		enum fe_status frontend_status;
- 		int16_t signal_strength;
- 		struct dmx_pes_filter_params dmx_pes_filter;
- 		struct hdhomerun_dvb_demux_feed demux_feed;
-diff --git a/kernel/dvb_hdhomerun_fe.c b/kernel/dvb_hdhomerun_fe.c
-index a96799b..58be54c 100644
---- a/kernel/dvb_hdhomerun_fe.c
-+++ b/kernel/dvb_hdhomerun_fe.c
-@@ -49,7 +49,7 @@ struct dvb_hdhomerun_fe_state {
- 
- extern int hdhomerun_debug_mask;
- 
--static int dvb_hdhomerun_fe_read_status(struct dvb_frontend* fe, fe_status_t* status)
-+static int dvb_hdhomerun_fe_read_status(struct dvb_frontend* fe, enum fe_status* status)
- {
- 	struct dvbhdhomerun_control_mesg mesg;
- 	struct dvb_hdhomerun_fe_state* state = fe->demodulator_priv;
-@@ -60,7 +60,7 @@ static int dvb_hdhomerun_fe_read_status(struct dvb_frontend* fe, fe_status_t* st
- 	mesg.id = state->id;
- 	hdhomerun_control_post_and_wait(&mesg);
- 
--	*status = mesg.u.fe_status;
-+	*status = mesg.u.frontend_status;
- 
- 	return 0;
- }
-@@ -203,7 +203,7 @@ static int dvb_hdhomerun_fe_tune(struct dvb_frontend *fe, bool re_tune,
- #else
- static int dvb_hdhomerun_fe_tune(struct dvb_frontend *fe, struct dvb_frontend_parameters *params,
- #endif
--					unsigned int mode_flags, unsigned int *delay, fe_status_t *status)
-+					unsigned int mode_flags, unsigned int *delay, enum fe_status *status)
- {
-    int ret;
- 	DEBUG_FUNC(1);
-diff --git a/userhdhomerun/hdhomerun_control.cpp b/userhdhomerun/hdhomerun_control.cpp
-index 63b12f8..912b49d 100644
---- a/userhdhomerun/hdhomerun_control.cpp
-+++ b/userhdhomerun/hdhomerun_control.cpp
-@@ -219,9 +219,9 @@ void Control::FE_READ_Status(struct dvbhdhomerun_control_mesg& _mesg)
- 
-   HdhomerunTuner* tuner = m_hdhomerun->GetTuner(_mesg.id);
-   if(tuner) {
--     fe_status_t status = (fe_status_t)tuner->ReadStatus();
-+     fe_status status = (fe_status)tuner->ReadStatus();
-   
--     _mesg.u.fe_status = status;
-+     _mesg.u.frontend_status = status;
-   }
-   else {
-      ERR() << "Tuner id does not exist!" << _mesg.id << endl;
diff --git a/packages/linux-drivers/gpu-aml/patches/gpu-aml-0002-use-rfc-date-in-version-string.patch b/packages/linux-drivers/gpu-aml/patches/gpu-aml-0002-use-rfc-date-in-version-string.patch
new file mode 100644
index 00000000000..b21447b9805
--- /dev/null
+++ b/packages/linux-drivers/gpu-aml/patches/gpu-aml-0002-use-rfc-date-in-version-string.patch
@@ -0,0 +1,13 @@
+diff --git a/mali/Kbuild b/mali/Kbuild
+index 7cc2225..6ce6270 100755
+--- a/mali/Kbuild
++++ b/mali/Kbuild
+@@ -312,7 +312,7 @@ VERSION_STRINGS += REPO_URL=$(REPO_URL)
+ VERSION_STRINGS += REVISION=$(DRIVER_REV)
+ VERSION_STRINGS += CHANGED_REVISION=$(CHANGED_REVISION)
+ VERSION_STRINGS += CHANGE_DATE=$(CHANGE_DATE)
+-VERSION_STRINGS += BUILD_DATE=$(shell date)
++VERSION_STRINGS += BUILD_DATE=$(shell date -R)
+ ifdef CONFIG_MALI400_DEBUG
+ VERSION_STRINGS += BUILD=debug
+ else
diff --git a/packages/linux-drivers/wetekdvb/package.mk b/packages/linux-drivers/wetekdvb/package.mk
index dac85e4cf8f..d4a3fea45f9 100644
--- a/packages/linux-drivers/wetekdvb/package.mk
+++ b/packages/linux-drivers/wetekdvb/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="wetekdvb"
-PKG_VERSION="20170116"
+PKG_VERSION="20170602"
 PKG_REV="1"
 PKG_ARCH="arm aarch64"
 PKG_LICENSE="nonfree"
diff --git a/packages/linux-firmware/bcm2835-firmware/package.mk b/packages/linux-firmware/bcm2835-firmware/package.mk
index 3f9bd7c40af..6e98724e7ae 100644
--- a/packages/linux-firmware/bcm2835-firmware/package.mk
+++ b/packages/linux-firmware/bcm2835-firmware/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="bcm2835-firmware"
-PKG_VERSION="e5ca26f"
+PKG_VERSION="89ec375"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="nonfree"
diff --git a/packages/linux/config/default-network-drivers.config b/packages/linux/config/default-network-drivers.config
index dc33a26c783..381f4f5e69d 100644
--- a/packages/linux/config/default-network-drivers.config
+++ b/packages/linux/config/default-network-drivers.config
@@ -5,13 +5,13 @@ CONFIG_NET=y
 CONFIG_NETDEVICES=y
 CONFIG_NET_CORE=y
 CONFIG_MACVLAN=m
-CONFIG_MACSEC=m
-CONFIG_NETCONSOLE=m
+CONFIG_MACSEC=y
+CONFIG_NETCONSOLE=y
 CONFIG_NETPOLL=y
 CONFIG_NET_POLL_CONTROLLER=y
-CONFIG_TUN=m
+CONFIG_TUN=y
 CONFIG_VETH=m
-CONFIG_VIRTIO_NET=m
+CONFIG_VIRTIO_NET=y
 
 ### PPP support ###
 
diff --git a/packages/linux/config/default-networking.config b/packages/linux/config/default-networking.config
index d2fd326ed51..796c58396fd 100644
--- a/packages/linux/config/default-networking.config
+++ b/packages/linux/config/default-networking.config
@@ -41,3 +41,5 @@ CONFIG_RFKILL=y
 CONFIG_RFKILL_PM=y
 CONFIG_RFKILL_LEDS=y
 
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_FQ_CODEL=m
diff --git a/packages/linux/config/extra-filesystem.config b/packages/linux/config/extra-filesystem.config
index 2f3fea229c3..f09fa833eca 100644
--- a/packages/linux/config/extra-filesystem.config
+++ b/packages/linux/config/extra-filesystem.config
@@ -17,6 +17,8 @@ CONFIG_F2FS_FS=m
 CONFIG_F2FS_STAT_FS=y
 CONFIG_F2FS_CHECK_FS=y
 
+CONFIG_FSCACHE=y
+
 CONFIG_AUTOFS4_FS=y
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
diff --git a/packages/linux/config/extra-lan-pci.config b/packages/linux/config/extra-lan-pci.config
index 7586257fe00..2252c882817 100644
--- a/packages/linux/config/extra-lan-pci.config
+++ b/packages/linux/config/extra-lan-pci.config
@@ -3,83 +3,83 @@
 CONFIG_NET=y
 CONFIG_NETDEVICES=y
 CONFIG_PCI=y
-CONFIG_MII=m
+CONFIG_MII=y
 
 CONFIG_ETHERNET=y
-CONFIG_MDIO=m
+CONFIG_MDIO=y
 
 CONFIG_NET_VENDOR_3COM=y
-CONFIG_VORTEX=m
+CONFIG_VORTEX=y
 
 CONFIG_NET_VENDOR_AMD=y
-CONFIG_PCNET32=m
+CONFIG_PCNET32=y
 
 CONFIG_NET_VENDOR_ARC=y
 
 CONFIG_NET_VENDOR_ATHEROS=y
-CONFIG_ATL2=m
-CONFIG_ATL1=m
-CONFIG_ATL1E=m
-CONFIG_ATL1C=m
-CONFIG_ALX=m
+CONFIG_ATL2=y
+CONFIG_ATL1=y
+CONFIG_ATL1E=y
+CONFIG_ATL1C=y
+CONFIG_ALX=y
 
 CONFIG_NET_VENDOR_BROADCOM=y
-CONFIG_B44=m
+CONFIG_B44=y
 CONFIG_B44_PCI_AUTOSELECT=y
 CONFIG_B44_PCICORE_AUTOSELECT=y
 CONFIG_B44_PCI=y
-CONFIG_BNX2=m
-CONFIG_CNIC=m
-CONFIG_TIGON3=m
-CONFIG_DNET=m
+CONFIG_BNX2=y
+CONFIG_CNIC=y
+CONFIG_TIGON3=y
+CONFIG_DNET=y
 
 CONFIG_NET_VENDOR_DEC=y
 
 CONFIG_NET_TULIP=y
-CONFIG_ULI526X=m
+CONFIG_ULI526X=y
 
 CONFIG_NET_VENDOR_EZCHIP=y
 
 CONFIG_NET_VENDOR_INTEL=y
-CONFIG_E100=m
-CONFIG_E1000=m
-CONFIG_E1000E=m
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
 CONFIG_E1000E_HWTS=y
-CONFIG_IGB=m
+CONFIG_IGB=y
 CONFIG_IGB_HWMON=y
-CONFIG_IXGB=m
-CONFIG_IXGBE=m
+CONFIG_IXGB=y
+CONFIG_IXGBE=y
 CONFIG_IXGBE_HWMON=y
 CONFIG_IXGBEVF=y
 
 CONFIG_NET_VENDOR_I825XX=y
-CONFIG_JME=m
+CONFIG_JME=y
 
 CONFIG_NET_VENDOR_MARVELL=y
-CONFIG_SKGE=m
+CONFIG_SKGE=y
 CONFIG_SKGE_GENESIS=y
-CONFIG_SKY2=m
+CONFIG_SKY2=y
 
 CONFIG_NET_VENDOR_NETRONOME=y
 
 CONFIG_NET_VENDOR_NVIDIA=y
-CONFIG_FORCEDETH=m
+CONFIG_FORCEDETH=y
 
 CONFIG_NET_VENDOR_QLOGIC=y
-CONFIG_QLA3XXX=m
+CONFIG_QLA3XXX=y
 
 CONFIG_NET_VENDOR_REALTEK=y
-CONFIG_8139TOO=m
-CONFIG_R8169=m
+CONFIG_8139TOO=y
+CONFIG_R8169=y
 
 CONFIG_NET_VENDOR_SIS=y
-CONFIG_SIS900=m
-CONFIG_SIS190=m
+CONFIG_SIS900=y
+CONFIG_SIS190=y
 
 CONFIG_NET_VENDOR_SMSC=y
-CONFIG_SMSC911X=m
-CONFIG_SMSC9420=m
+CONFIG_SMSC911X=y
+CONFIG_SMSC9420=y
 
 CONFIG_NET_VENDOR_VIA=y
-CONFIG_VIA_RHINE=m
-CONFIG_VIA_VELOCITY=m
+CONFIG_VIA_RHINE=y
+CONFIG_VIA_VELOCITY=y
diff --git a/packages/linux/config/extra-lan-phy.config b/packages/linux/config/extra-lan-phy.config
index 0533d4bd556..dce936a811b 100644
--- a/packages/linux/config/extra-lan-phy.config
+++ b/packages/linux/config/extra-lan-phy.config
@@ -5,13 +5,13 @@ CONFIG_NETDEVICES=y
 CONFIG_PHYLIB=y
 CONFIG_SWPHY=y
 
-CONFIG_AT803X_PHY=m
-CONFIG_AMD_PHY=m
-CONFIG_MARVELL_PHY=m
-CONFIG_SMSC_PHY=m
-CONFIG_BCM_NET_PHYLIB=m
-CONFIG_BROADCOM_PHY=m
-CONFIG_BCM87XX_PHY=m
-CONFIG_REALTEK_PHY=m
-CONFIG_FIXED_PHY=m
-CONFIG_VITESSE_PHY=m
+CONFIG_AT803X_PHY=y
+CONFIG_AMD_PHY=y
+CONFIG_MARVELL_PHY=y
+CONFIG_SMSC_PHY=y
+CONFIG_BCM_NET_PHYLIB=y
+CONFIG_BROADCOM_PHY=y
+CONFIG_BCM87XX_PHY=y
+CONFIG_REALTEK_PHY=y
+CONFIG_FIXED_PHY=y
+CONFIG_VITESSE_PHY=y
diff --git a/packages/linux/config/extra-lan-usb.config b/packages/linux/config/extra-lan-usb.config
index 4df80a8a49a..72d14fb3a86 100644
--- a/packages/linux/config/extra-lan-usb.config
+++ b/packages/linux/config/extra-lan-usb.config
@@ -2,7 +2,7 @@
 
 CONFIG_NET=y
 CONFIG_NETDEVICES=y
-CONFIG_MII=m
+CONFIG_MII=y
 CONFIG_USB=y
 
 CONFIG_USB_NET_DRIVERS=y
diff --git a/packages/linux/config/extra-nfs.config b/packages/linux/config/extra-nfs.config
index 9c28b306f39..52b2eda0d7d 100644
--- a/packages/linux/config/extra-nfs.config
+++ b/packages/linux/config/extra-nfs.config
@@ -6,7 +6,7 @@ CONFIG_FILE_LOCKING=y
 CONFIG_MANDATORY_FILE_LOCKING=y
 CONFIG_NETWORK_FILESYSTEMS=y
 
-CONFIG_NFS_FS=m
+CONFIG_NFS_FS=y
 CONFIG_NFS_V2=y
 CONFIG_NFS_V3=y
 # CONFIG_NFS_V3_ACL is not set
@@ -19,6 +19,7 @@ CONFIG_PNFS_FLEXFILE_LAYOUT=m
 CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org"
 CONFIG_NFS_V4_1_MIGRATION=y
 CONFIG_ROOT_NFS=y
+CONFIG_NFS_FSCACHE=y
 # CONFIG_NFS_USE_LEGACY_DNS is not set
 CONFIG_NFS_USE_KERNEL_DNS=y
 # CONFIG_NFSD is not set
diff --git a/packages/linux/config/extra-sound-soc-rpi.config b/packages/linux/config/extra-sound-soc-rpi.config
index 94f12a5156e..44cc30aa4f2 100644
--- a/packages/linux/config/extra-sound-soc-rpi.config
+++ b/packages/linux/config/extra-sound-soc-rpi.config
@@ -36,3 +36,15 @@ CONFIG_SND_PISOUND=m
 
 CONFIG_SND_SOC_I2C_AND_SPI=m
 
+CONFIG_SND_SOC_ADAU7002=m
+CONFIG_SND_SOC_CS4265=m
+CONFIG_SND_SOC_STA32X=m
+CONFIG_SND_SOC_WM8804_I2C=m
+
+CONFIG_SND_SOC_ARIZONA=m
+CONFIG_MFD_ARIZONA=y
+CONFIG_REGULATOR=y
+CONFIG_REGULATOR_ARIZONA=m
+CONFIG_EXTCON=m
+CONFIG_EXTCON_ARIZONA=m
+
diff --git a/packages/linux/config/extra-wlan-usb.config b/packages/linux/config/extra-wlan-usb.config
index e990c712403..f13d1fd8f2a 100644
--- a/packages/linux/config/extra-wlan-usb.config
+++ b/packages/linux/config/extra-wlan-usb.config
@@ -72,8 +72,8 @@ CONFIG_WLAN_VENDOR_REALTEK=y
 CONFIG_RTL8187=m
 CONFIG_RTL8187_LEDS=y
 CONFIG_RTL_CARDS=m
-CONFIG_RTL8XXXU=m
-CONFIG_RTL8XXXU_UNTESTED=y
+# CONFIG_RTL8XXXU is not set
+# CONFIG_RTL8XXXU_UNTESTED is not set
 CONFIG_RTL8192CU=m
 CONFIG_RTLWIFI=m
 CONFIG_RTLWIFI_USB=m
diff --git a/packages/linux/firmware/rtl_nic/rtl8105e-1.fw b/packages/linux/firmware/rtl_nic/rtl8105e-1.fw
new file mode 100644
index 00000000000..4c2cbd0e60c
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8105e-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8106e-1.fw b/packages/linux/firmware/rtl_nic/rtl8106e-1.fw
new file mode 100644
index 00000000000..85694cb6bf5
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8106e-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8106e-2.fw b/packages/linux/firmware/rtl_nic/rtl8106e-2.fw
new file mode 100644
index 00000000000..ac042757e13
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8106e-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8107e-1.fw b/packages/linux/firmware/rtl_nic/rtl8107e-1.fw
new file mode 100644
index 00000000000..c071c2f0252
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8107e-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8107e-2.fw b/packages/linux/firmware/rtl_nic/rtl8107e-2.fw
new file mode 100644
index 00000000000..76fc0544f9f
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8107e-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168d-1.fw b/packages/linux/firmware/rtl_nic/rtl8168d-1.fw
new file mode 100644
index 00000000000..99e002de1fb
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168d-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168d-2.fw b/packages/linux/firmware/rtl_nic/rtl8168d-2.fw
new file mode 100644
index 00000000000..7022ab0af47
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168d-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168e-1.fw b/packages/linux/firmware/rtl_nic/rtl8168e-1.fw
new file mode 100644
index 00000000000..d203bd5d0dc
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168e-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168e-2.fw b/packages/linux/firmware/rtl_nic/rtl8168e-2.fw
new file mode 100644
index 00000000000..7ea5984cff6
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168e-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168e-3.fw b/packages/linux/firmware/rtl_nic/rtl8168e-3.fw
new file mode 100644
index 00000000000..0f888a32676
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168e-3.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168f-1.fw b/packages/linux/firmware/rtl_nic/rtl8168f-1.fw
new file mode 100644
index 00000000000..bf7883163e3
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168f-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168f-2.fw b/packages/linux/firmware/rtl_nic/rtl8168f-2.fw
new file mode 100644
index 00000000000..c3424929bde
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168f-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168g-1.fw b/packages/linux/firmware/rtl_nic/rtl8168g-1.fw
new file mode 100644
index 00000000000..2c628b03a7f
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168g-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168g-2.fw b/packages/linux/firmware/rtl_nic/rtl8168g-2.fw
new file mode 100644
index 00000000000..944d44e6a66
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168g-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168g-3.fw b/packages/linux/firmware/rtl_nic/rtl8168g-3.fw
new file mode 100644
index 00000000000..0c97d7ea143
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168g-3.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168h-1.fw b/packages/linux/firmware/rtl_nic/rtl8168h-1.fw
new file mode 100644
index 00000000000..296813ab073
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168h-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8168h-2.fw b/packages/linux/firmware/rtl_nic/rtl8168h-2.fw
new file mode 100644
index 00000000000..df92848abbd
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8168h-2.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8402-1.fw b/packages/linux/firmware/rtl_nic/rtl8402-1.fw
new file mode 100644
index 00000000000..82fa35d7099
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8402-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8411-1.fw b/packages/linux/firmware/rtl_nic/rtl8411-1.fw
new file mode 100644
index 00000000000..72772dbc834
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8411-1.fw differ
diff --git a/packages/linux/firmware/rtl_nic/rtl8411-2.fw b/packages/linux/firmware/rtl_nic/rtl8411-2.fw
new file mode 100644
index 00000000000..e3789fe576c
Binary files /dev/null and b/packages/linux/firmware/rtl_nic/rtl8411-2.fw differ
diff --git a/packages/linux/package.mk b/packages/linux/package.mk
index ec2b4efbe28..18393ca43a8 100644
--- a/packages/linux/package.mk
+++ b/packages/linux/package.mk
@@ -22,7 +22,7 @@ PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kernel.org"
 PKG_DEPENDS_HOST="ccache:host"
-PKG_DEPENDS_TARGET="toolchain cpio:host xz:host pciutils kmod wireless-regdb keyutils irqbalance"
+PKG_DEPENDS_TARGET="toolchain cpio:host xz:host pciutils kmod wireless-regdb keyutils"
 PKG_DEPENDS_INIT="toolchain cpu-firmware:init"
 PKG_NEED_UNPACK="$LINUX_DEPENDS"
 PKG_PRIORITY="optional"
@@ -54,7 +54,7 @@ case "$LINUX" in
     PKG_PATCH_DIRS="linux-4.8 imx6-4.8"
     ;;
   rpi)
-    PKG_VERSION="a22fc2f"
+    PKG_VERSION="31e73f0"
     PKG_GIT_URL="https://github.com/raspberrypi/linux.git"
     PKG_GIT_BRANCH="rpi-4.9.y"
     PKG_PATCH_DIRS="linux-4.9 rpi-4.9"
@@ -65,7 +65,7 @@ case "$LINUX" in
     PKG_PATCH_DIRS="linux-4.8"
     ;;
   *)
-    PKG_VERSION="4.9.20"
+    PKG_VERSION="4.9.30"
     PKG_URL="http://www.kernel.org/pub/linux/kernel/v4.x/$PKG_NAME-$PKG_VERSION.tar.xz"
     PKG_PATCH_DIRS="linux-4.9"
     ;;
@@ -81,7 +81,7 @@ PKG_MAKE_OPTS_HOST="headers_check"
 [ "$NFS_SUPPORT" = yes ]             && KERNEL_EXTRA_CONFIG+=" nfs"
 [ "$SAMBA_SUPPORT" = yes ]           && KERNEL_EXTRA_CONFIG+=" samba"
 [ "$BLUETOOTH_SUPPORT" = yes ]       && KERNEL_EXTRA_CONFIG+=" bluetooth"
-[ "$UVESAFB_SUPPORT" = yes ]         && KERNEL_EXTRA_CONFIG+=" uvesafb"
+[ "$UVESAFB_SUPPORT" = yes ]         && KERNEL_EXTRA_CONFIG+=" uvesafb" && PKG_DEPENDS_INIT+=" v86d:init"
 
 post_patch() {
   if [ -f $PROJECT_DIR/$PROJECT/$PKG_NAME/$PKG_NAME.$TARGET_ARCH.conf ]; then
diff --git a/packages/linux/patches/linux-4.9/linux-999-i915-use-legacy-turbo.patch b/packages/linux/patches/linux-4.9/linux-999-i915-use-legacy-turbo.patch
deleted file mode 100644
index a8df9d68d2f..00000000000
--- a/packages/linux/patches/linux-4.9/linux-999-i915-use-legacy-turbo.patch
+++ /dev/null
@@ -1,27 +0,0 @@
-From aefcd1a6b1ec22e4e0d26eb932b618c5d12f7e9c Mon Sep 17 00:00:00 2001
-From: fritsch <peter.fruehberger@gmail.com>
-Date: Sun, 21 Feb 2016 12:39:16 +0100
-Subject: [PATCH] i915_irq: enable legacy turbo (4.6)
-
----
- drivers/gpu/drm/i915/i915_irq.c | 7 +------
- 1 file changed, 1 insertion(+), 6 deletions(-)
-
-diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
-index 1c21220..6507643 100644
---- a/drivers/gpu/drm/i915/i915_irq.c
-+++ b/drivers/gpu/drm/i915/i915_irq.c
-@@ -4557,12 +4557,7 @@ void intel_irq_init(struct drm_i915_private *dev_priv)
- 	INIT_WORK(&dev_priv->rps.work, gen6_pm_rps_work);
- 	INIT_WORK(&dev_priv->l3_parity.error_work, ivybridge_parity_work);
- 
--	/* Let's track the enabled rps events */
--	if (IS_VALLEYVIEW(dev_priv))
--		/* WaGsvRC0ResidencyMethod:vlv */
--		dev_priv->pm_rps_events = GEN6_PM_RP_DOWN_EI_EXPIRED | GEN6_PM_RP_UP_EI_EXPIRED;
--	else
--		dev_priv->pm_rps_events = GEN6_PM_RPS_EVENTS;
-+	dev_priv->pm_rps_events = GEN6_PM_RPS_EVENTS;
- 
- 	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
- 			  i915_hangcheck_elapsed);
diff --git a/packages/linux/system.d/module-load.service b/packages/linux/system.d/module-load.service
index 1e2b89d3429..fbab6394c9d 100644
--- a/packages/linux/system.d/module-load.service
+++ b/packages/linux/system.d/module-load.service
@@ -4,7 +4,7 @@ DefaultDependencies=false
 
 [Service]
 Type=oneshot
-ExecStart=/bin/sh -c "for module in $(cat /usr/lib/modules-load.d/*.conf 2>/dev/null) ; do /sbin/modprobe $module; done"
+ExecStart=/bin/sh -c "for module in $(cat /usr/lib/modules-load.d/*.conf /storage/.config/modules-load.d/*.conf 2>/dev/null) ; do /sbin/modprobe $module; done"
 RemainAfterExit=yes
 
 [Install]
diff --git a/packages/mediacenter/kodi-platform/package.mk b/packages/mediacenter/kodi-platform/package.mk
index 493782da5a0..1e878baef8b 100644
--- a/packages/mediacenter/kodi-platform/package.mk
+++ b/packages/mediacenter/kodi-platform/package.mk
@@ -33,7 +33,9 @@ PKG_LONGDESC="kodi-platform:"
 PKG_IS_ADDON="no"
 PKG_AUTORECONF="no"
 
-PKG_CMAKE_OPTS_TARGET="-DCMAKE_MODULE_PATH=$SYSROOT_PREFIX/usr/share/kodi \
+PKG_CMAKE_OPTS_TARGET="-DCMAKE_INSTALL_LIBDIR:STRING=lib \
+                       -DCMAKE_INSTALL_LIBDIR_NOARCH:STRING=lib \
+                       -DCMAKE_INSTALL_PREFIX_TOOLCHAIN=$SYSROOT_PREFIX/usr \
                        -DBUILD_SHARED_LIBS=0"
 
 post_makeinstall_target() {
diff --git a/packages/mediacenter/kodi-platform/patches/kodi-platform-01_crosscompile-badness.patch b/packages/mediacenter/kodi-platform/patches/kodi-platform-01_crosscompile-badness.patch
index 4ff62af8bd6..711a11f8590 100644
--- a/packages/mediacenter/kodi-platform/patches/kodi-platform-01_crosscompile-badness.patch
+++ b/packages/mediacenter/kodi-platform/patches/kodi-platform-01_crosscompile-badness.patch
@@ -2,12 +2,12 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt
 index 2765341..1bd4fc2 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -26,7 +26,7 @@ else()
+@@ -22,7 +22,7 @@
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
  endif()
  
 -set(kodiplatform_INCLUDE_DIRS ${TINYXML_INCLUDE_DIR} "${CMAKE_INSTALL_PREFIX}/include/kodi")
-+set(kodiplatform_INCLUDE_DIRS ${TINYXML_INCLUDE_DIR} "${CMAKE_INSTALL_PREFIX_TOOLCHAIN}/include/kodi")
++set(kodiplatform_INCLUDE_DIRS ${TINYXML_INCLUDE_DIR})
  IF(WIN32)
    LIST(APPEND kodiplatform_INCLUDE_DIRS "${CMAKE_INSTALL_PREFIX}/include/kodi/windows")
  ENDIF(WIN32)
@@ -15,12 +15,23 @@ diff --git a/kodiplatform-config.cmake.in b/kodiplatform-config.cmake.in
 index 3fc5273..60bdf1b 100644
 --- a/kodiplatform-config.cmake.in
 +++ b/kodiplatform-config.cmake.in
-@@ -10,7 +10,7 @@
+@@ -10,16 +10,16 @@
  #
  # propagate these properties from one build system to the other
  set (kodiplatform_VERSION "@kodiplatform_VERSION_MAJOR@.@kodiplatform_VERSION_MINOR@")
 -set (kodiplatform_INCLUDE_DIRS @kodiplatform_INCLUDE_DIRS@ @CMAKE_INSTALL_PREFIX@/include)
-+set (kodiplatform_INCLUDE_DIRS @kodiplatform_INCLUDE_DIRS@ @CMAKE_INSTALL_PREFIX_TOOLCHAIN@/include)
++set (kodiplatform_INCLUDE_DIRS @kodiplatform_INCLUDE_DIRS@ @CMAKE_INSTALL_PREFIX_TOOLCHAIN@/include/kodi)
  set (kodiplatform_LIBRARY_DIRS "@CMAKE_LIBRARY_OUTPUT_DIRECTORY@")
  set (kodiplatform_LINKER_FLAGS "@kodiplatform_LINKER_FLAGS@")
  set (kodiplatform_CONFIG_VARS "@kodiplatform_CONFIG_VARS@")
+ 
+ # libraries come from the build tree where this file was generated
+ if(WIN32)
+-  set (kodiplatform_LIBRARY "@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/kodiplatform.lib")
++  set (kodiplatform_LIBRARY "@CMAKE_INSTALL_PREFIX_TOOLCHAIN@/@CMAKE_INSTALL_LIBDIR@/kodiplatform.lib")
+ else(WIN32)
+-  set (kodiplatform_LIBRARY "-L@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ -lkodiplatform")
++  set (kodiplatform_LIBRARY "-L@CMAKE_INSTALL_PREFIX_TOOLCHAIN@/@CMAKE_INSTALL_LIBDIR@ -lkodiplatform")
+ endif(WIN32)
+ set (kodiplatform_LIBRARIES ${kodiplatform_LIBRARY} "@kodiplatform_LIBRARIES@")
+ mark_as_advanced (kodiplatform_LIBRARY)
diff --git a/packages/mediacenter/kodi-platform/patches/kodi-platform-02_no-multi-lib.patch b/packages/mediacenter/kodi-platform/patches/kodi-platform-02_no-multi-lib.patch
deleted file mode 100644
index 319d40a998b..00000000000
--- a/packages/mediacenter/kodi-platform/patches/kodi-platform-02_no-multi-lib.patch
+++ /dev/null
@@ -1,37 +0,0 @@
-diff -Naur kodi-platform-15edaf7.orig/CMakeLists.txt kodi-platform-15edaf7/CMakeLists.txt
---- kodi-platform-15edaf7.orig/CMakeLists.txt	2015-10-22 17:44:43.034540766 -0700
-+++ kodi-platform-15edaf7/CMakeLists.txt	2015-10-22 17:46:38.851326343 -0700
-@@ -9,7 +9,6 @@
- find_package(TinyXML REQUIRED)
- find_package(Threads REQUIRED)
- find_package(p8-platform REQUIRED)
--include(UseMultiArch.cmake)
- include(CheckAtomic.cmake)
- 
- set(kodiplatform_NAME kodiplatform)
-@@ -43,7 +42,7 @@
- set_target_properties(kodiplatform PROPERTIES VERSION ${kodiplatform_VERSION_MAJOR}.${kodiplatform_VERSION_MINOR}.${kodiplatform_VERSION_PATCH}
-                                               SOVERSION ${kodiplatform_VERSION_MAJOR}.0)
- 
--install(TARGETS kodiplatform DESTINATION ${CMAKE_INSTALL_LIBDIR})
-+install(TARGETS kodiplatform DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
- install(FILES src/util/XMLUtils.h
-         DESTINATION include/kodi/util)
- 
-@@ -57,14 +56,14 @@
-                                  ${CMAKE_INSTALL_PREFIX}/include)
- 
-   install(FILES ${CMAKE_BINARY_DIR}/kodiplatform.pc
--          DESTINATION ${CMAKE_INSTALL_LIBDIR_NOARCH}/pkgconfig)
-+          DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/pkgconfig)
- ENDIF(NOT WIN32)
- 
- # config mode
- configure_file (kodiplatform-config.cmake.in
-                 kodiplatform-config.cmake @ONLY)
- install(FILES ${CMAKE_BINARY_DIR}/kodiplatform-config.cmake
--        DESTINATION ${CMAKE_INSTALL_LIBDIR_NOARCH}/kodiplatform)
-+        DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/kodiplatform)
- 
- # Quell warnings with in-tree builds
- set(KODI_BUILD_DIR ${KODI_BUILD_DIR})
diff --git a/packages/mediacenter/kodi/config/repository.openelec.tv/addon.xml b/packages/mediacenter/kodi/config/repository.openelec.tv/addon.xml
index d5437d03931..0ffc6ec93ba 100644
--- a/packages/mediacenter/kodi/config/repository.openelec.tv/addon.xml
+++ b/packages/mediacenter/kodi/config/repository.openelec.tv/addon.xml
@@ -1,8 +1,8 @@
 ﻿<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 <addon id="repository.openelec.tv"
-		name="[COLOR FF757677]Open[/COLOR][COLOR FF8ABEE2]ELEC[/COLOR] Add-ons (official)"
-		version="7.0.0"
-		provider-name="Team [COLOR FF757677]Open[/COLOR][COLOR FF8ABEE2]ELEC[/COLOR]">
+		name="OpenELEC Add-ons (official)"
+		version="8.2.0"
+		provider-name="Team OpenELEC">
 	<extension point="xbmc.addon.repository"
 		name="OpenELEC Add-ons (official)">
 		<info>@ADDON_URL@/addons.xml</info>
@@ -12,6 +12,6 @@
 	<extension point="xbmc.addon.metadata">
 		<summary>OpenELEC Add-ons (official)</summary>
 		<description>The OpenELEC official repository contains Kodi PVR Clients, Screensavers, Visualisations, the unofficial repo installer, and more. Add-ons in this repository are maintained and supported by OpenELEC staff and sponsors. If you find a broken or non-working add-on please report it via the forums.</description>
-		<platform>all</platform>	
+		<platform>all</platform>
 	</extension>
 </addon>
diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index 97460d22554..2e1c1a0d6fa 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -31,13 +31,13 @@ PKG_LONGDESC="Kodi Media Center (which was formerly named Xbox Media Center or X
 
 case "$KODIPLAYER_DRIVER" in
   bcm2835-firmware)
-    PKG_VERSION="02fffe1"
+    PKG_VERSION="c1d9c5c"
     PKG_GIT_URL="https://github.com/OpenELEC/xbmc.git"
     PKG_GIT_BRANCH="krypton_rbp_backports"
     PKG_KEEP_CHECKOUT="no"
     ;;
   *)
-    PKG_VERSION="17.1-Krypton"
+    PKG_VERSION="17.3-Krypton"
     PKG_GIT_URL="https://github.com/xbmc/xbmc.git"
     PKG_GIT_BRANCH="Krypton"
     PKG_KEEP_CHECKOUT="no"
@@ -59,7 +59,6 @@ PKG_CMAKE_OPTS_BOOTSTRAP="-DCORE_SOURCE_DIR=$ROOT/$PKG_BUILD"
 PKG_CMAKE_OPTS_HOST="-DCORE_SOURCE_DIR=$ROOT/$PKG_BUILD"
 PKG_CMAKE_OPTS_TARGET="-DNATIVEPREFIX=$ROOT/$TOOLCHAIN \
                        -DDEPENDS_PATH=$ROOT/$PKG_BUILD/depends \
-                       -DWITH_ARCH=$TARGET_ARCH \
                        -DCMAKE_BUILD_TYPE=none \
                        -DPYTHON_INCLUDE_DIRS=$SYSROOT_PREFIX/usr/include/python2.7 \
                        -DGIT_VERSION=$PKG_VERSION \
@@ -80,6 +79,11 @@ PKG_CMAKE_OPTS_TARGET="-DNATIVEPREFIX=$ROOT/$TOOLCHAIN \
                        -DENABLE_XSLT=OFF \
                        -DENABLE_DBUS=ON"
 
+if [ "$TARGET_ARCH" = "x86_64" ]; then
+  PKG_CMAKE_OPTS_TARGET+=" -DWITH_CPU=$TARGET_ARCH"
+else
+  PKG_CMAKE_OPTS_TARGET+=" -DWITH_ARCH=$TARGET_ARCH"
+fi
 
 if [ "$DISPLAYSERVER" = "x11" ]; then
   PKG_DEPENDS_TARGET+=" libX11 libXext libdrm libXrandr"
@@ -375,11 +379,9 @@ post_makeinstall_target() {
 post_install() {
 # enable default services
   enable_service kodi-autostart.service
-  enable_service kodi-cleanlogs.service
   enable_service kodi-halt.service
   enable_service kodi-poweroff.service
   enable_service kodi-reboot.service
-  enable_service kodi-waitonnetwork.service
   enable_service kodi.service
   enable_service kodi-lirc-suspend.service
 }
diff --git a/packages/mediacenter/kodi/patches/kodi-998.03-cmake-sse.patch b/packages/mediacenter/kodi/patches/kodi-998.03-cmake-sse.patch
new file mode 100644
index 00000000000..140a87e8dd6
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-998.03-cmake-sse.patch
@@ -0,0 +1,110 @@
+diff -Naur kodi-17.1-Krypton/project/cmake/modules/FindSSE.cmake kodi-17.1-Krypton.patch/project/cmake/modules/FindSSE.cmake
+--- kodi-17.1-Krypton/project/cmake/modules/FindSSE.cmake	2017-03-20 17:17:49.000000000 +0100
++++ kodi-17.1-Krypton.patch/project/cmake/modules/FindSSE.cmake	2017-04-19 14:00:34.000000000 +0200
+@@ -2,43 +2,46 @@
+ # the project is compiled.
+ include(TestCXXAcceptsFlag)
+ 
+-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
++if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
++  CHECK_CXX_ACCEPTS_FLAG("-msse" _SSE_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-msse2" _SSE2_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-msse3" _SSE3_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-mssse3" _SSSE3_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-msse4.1" _SSE41_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-msse4.2" _SSE42_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-mavx" _AVX_OK)
++  CHECK_CXX_ACCEPTS_FLAG("-mavx2" _AVX2_OK)
++endif()
++
++if(CMAKE_SYSTEM_NAME MATCHES "Linux" AND NOT CMAKE_CROSSCOMPILING)
+    if(CPU MATCHES "x86_64" OR CPU MATCHES "i.86")
+      exec_program(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)
+ 
+      string(REGEX REPLACE "^.*(sse).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "sse" "${_SSE_THERE}" _SSE_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-msse" _SSE_OK)
+ 
+      string(REGEX REPLACE "^.*(sse2).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "sse2" "${_SSE_THERE}" _SSE2_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-msse2" _SSE2_OK)
+ 
+      # SSE3 is also known as the Prescott New Instructions (PNI)
+      # it's labeled as pni in /proc/cpuinfo
+      string(REGEX REPLACE "^.*(pni).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "pni" "${_SSE_THERE}" _SSE3_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-msse3" _SSE3_OK)
+ 
+      string(REGEX REPLACE "^.*(ssse3).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "ssse3" "${_SSE_THERE}" _SSSE3_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-mssse3" _SSSE3_OK)
+ 
+      string(REGEX REPLACE "^.*(sse4_1).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "sse4_1" "${_SSE_THERE}" _SSE41_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-msse4.1" _SSE41_OK)
+ 
+      string(REGEX REPLACE "^.*(sse4_2).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "sse4_2" "${_SSE_THERE}" _SSE42_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-msse4.2" _SSE42_OK)
+ 
+      string(REGEX REPLACE "^.*(avx).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "avx" "${_SSE_THERE}" _AVX_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-mavx" _AVX_OK)
+ 
+      string(REGEX REPLACE "^.*(avx2).*$" "\\1" _SSE_THERE ${CPUINFO})
+      string(COMPARE EQUAL "avx2" "${_SSE_THERE}" _AVX2_TRUE)
+-     CHECK_CXX_ACCEPTS_FLAG("-mavx2" _AVX2_OK)
+    endif()
+ elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    if(NOT CPU MATCHES "arm")
+@@ -46,35 +49,27 @@
+ 
+       string(REGEX REPLACE "^.*[^S](SSE).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "SSE" "${_SSE_THERE}" _SSE_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-msse" _SSE_OK)
+ 
+       string(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "SSE2" "${_SSE_THERE}" _SSE2_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-msse2" _SSE2_OK)
+ 
+       string(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "SSE3" "${_SSE_THERE}" _SSE3_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-msse3" _SSE3_OK)
+ 
+       string(REGEX REPLACE "^.*(SSSE3).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "SSSE3" "${_SSE_THERE}" _SSSE3_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-mssse3" _SSSE3_OK)
+ 
+       string(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "SSE4.1" "${_SSE_THERE}" _SSE41_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-msse4.1" _SSE41_OK)
+ 
+       string(REGEX REPLACE "^.*(SSE4.2).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "SSE4.2" "${_SSE_THERE}" _SSE42_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-msse4.2" _SSE42_OK)
+ 
+       string(REGEX REPLACE "^.*(AVX).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "AVX" "${_SSE_THERE}" _AVX_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-mavx" _AVX_OK)
+ 
+       string(REGEX REPLACE "^.*(AVX2).*$" "\\1" _SSE_THERE ${CPUINFO})
+       string(COMPARE EQUAL "AVX2" "${_SSE_THERE}" _AVX2_TRUE)
+-      CHECK_CXX_ACCEPTS_FLAG("-mavx2" _AVX2_OK)
+    endif()
+ elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    # TODO
+@@ -82,6 +77,14 @@
+    set(_SSE_OK   true)
+    set(_SSE2_TRUE true)
+    set(_SSE2_OK   true)
++elseif(CMAKE_SYSTEM_NAME MATCHES "Linux" AND CMAKE_CROSSCOMPILING)
++   # TODO: detect target CPU features; for now assume SSE through SSE4.1 when cross-compiling
++   set(_SSE_TRUE true)
++   set(_SSE2_TRUE true)
++   set(_SSE3_TRUE    true)
++   set(_SSSE3_TRUE   true)
++   set(_SSE41_TRUE   true)
++#   set(_SSE42_TRUE   true)
+ endif()
+ 
+ include(FindPackageHandleStandardArgs)
diff --git a/packages/mediacenter/kodi/patches/kodi-999.90-PR11619.patch b/packages/mediacenter/kodi/patches/kodi-999.90-PR11619.patch
new file mode 100644
index 00000000000..1f547fb1432
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-999.90-PR11619.patch
@@ -0,0 +1,72 @@
+From e7435af4142f7fa544af0df858cd864a54bc84a9 Mon Sep 17 00:00:00 2001
+From: fritsch <fritsch@kodi.tv>
+Date: Sat, 21 Jan 2017 17:59:43 +0100
+Subject: [PATCH 1/4] Increase CHUNK_SIZE to 128 KB
+
+---
+ xbmc/filesystem/FileCache.cpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xbmc/filesystem/FileCache.cpp b/xbmc/filesystem/FileCache.cpp
+index 8e1c95c..c5ffcab 100644
+--- a/xbmc/filesystem/FileCache.cpp
++++ b/xbmc/filesystem/FileCache.cpp
+@@ -43,7 +43,7 @@
+ 
+ using namespace XFILE;
+ 
+-#define READ_CACHE_CHUNK_SIZE (64*1024)
++#define READ_CACHE_CHUNK_SIZE (128*1024)
+ 
+ class CWriteRate
+ {
+
+From 4147f05c5f2e39c84685cdcedbb208899bfae816 Mon Sep 17 00:00:00 2001
+From: fritsch <Peter.Fruehberger@gmail.com>
+Date: Sun, 5 Feb 2017 21:59:24 +0100
+Subject: [PATCH 2/4] NFSFile: Return max chunksize possible
+
+---
+ xbmc/filesystem/NFSFile.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xbmc/filesystem/NFSFile.h b/xbmc/filesystem/NFSFile.h
+index c48e3a9..4187580 100644
+--- a/xbmc/filesystem/NFSFile.h
++++ b/xbmc/filesystem/NFSFile.h
+@@ -147,7 +147,7 @@ namespace XFILE
+     //implement iocontrol for seek_possible for preventing the stat in File class for
+     //getting this info ...
+     virtual int IoControl(EIoControl request, void* param){ if(request == IOCTRL_SEEK_POSSIBLE) return 1;return -1;};    
+-    virtual int  GetChunkSize() {return 1;}
++    virtual int  GetChunkSize() {return gNfsConnection.GetMaxReadChunkSize();}
+     
+     virtual bool OpenForWrite(const CURL& url, bool bOverWrite = false);
+     virtual bool Delete(const CURL& url);
+
+From 06440afd24df440b039f982b13843f0ce8f929a2 Mon Sep 17 00:00:00 2001
+From: fritsch <Peter.Fruehberger@gmail.com>
+Date: Sat, 22 Apr 2017 08:06:16 +0200
+Subject: [PATCH 4/4] DVDDemuxFFmpeg: Store probe_buffer on heap
+
+---
+ xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
+index 66ee2ed..b753c0b 100644
+--- a/xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
++++ b/xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
+@@ -295,10 +295,10 @@ bool CDVDDemuxFFmpeg::Open(CDVDInputStream* pInput, bool streaminfo, bool filein
+       if (trySPDIFonly || (iformat && strcmp(iformat->name, "wav") == 0))
+       {
+         AVProbeData pd;
+-        uint8_t probe_buffer[FFMPEG_FILE_BUFFER_SIZE + AVPROBE_PADDING_SIZE];
++        std::unique_ptr<uint8_t[]> probe_buffer (new uint8_t[FFMPEG_FILE_BUFFER_SIZE + AVPROBE_PADDING_SIZE]);
+ 
+         // init probe data
+-        pd.buf = probe_buffer;
++        pd.buf = probe_buffer.get();
+         pd.filename = strFile.c_str();
+ 
+         // av_probe_input_buffer might have changed the buffer_size beyond our allocated amount
diff --git a/packages/mediacenter/kodi/patches/kodi-999.90-PR11626.patch b/packages/mediacenter/kodi/patches/kodi-999.90-PR11626.patch
deleted file mode 100644
index a37b98807d9..00000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.90-PR11626.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-From f2c9f4a02fa52c974e63e2822bf53e23befeb726 Mon Sep 17 00:00:00 2001
-From: Memphiz <memphis@machzwo.de>
-Date: Mon, 6 Feb 2017 22:51:24 +0100
-Subject: [PATCH] [touchscreen/pictures] - allow to back out from fullscreen
- picture mode by mapping longpress
-
----
- system/keymaps/touchscreen.xml | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/system/keymaps/touchscreen.xml b/system/keymaps/touchscreen.xml
-index c3135ae..2eb245c 100644
---- a/system/keymaps/touchscreen.xml
-+++ b/system/keymaps/touchscreen.xml
-@@ -80,6 +80,7 @@
-       <swipe direction="left" pointers="2">NextPicture</swipe>
-       <tap pointers="2">Pause</tap>
-       <tap pointers="3">Info</tap>
-+      <longpress>Back</longpress>
-     </touch>
-   </SlideShow>
-   <ScreenCalibration>
diff --git a/packages/mediacenter/kodi/patches/kodi-999.90-PR11906.patch b/packages/mediacenter/kodi/patches/kodi-999.90-PR11906.patch
deleted file mode 100644
index aaa0de36fd6..00000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.90-PR11906.patch
+++ /dev/null
@@ -1,450 +0,0 @@
-From cbd96778cf947793f709c0e8508a085f5f101f2e Mon Sep 17 00:00:00 2001
-From: Kai Sommerfeld <kai.sommerfeld@gmx.com>
-Date: Sun, 19 Mar 2017 15:24:13 +0100
-Subject: [PATCH 1/3] [PVR] Guide window: Fix event/channel selection after
- channel group switch.
-
----
- xbmc/epg/GUIEPGGridContainer.cpp       | 53 +++++++++++++++++-----------------
- xbmc/epg/GUIEPGGridContainer.h         |  1 -
- xbmc/epg/GUIEPGGridContainerModel.cpp  |  3 ++
- xbmc/epg/GUIEPGGridContainerModel.h    |  1 +
- xbmc/pvr/windows/GUIWindowPVRGuide.cpp | 15 +++++-----
- 5 files changed, 38 insertions(+), 35 deletions(-)
-
-diff --git a/xbmc/epg/GUIEPGGridContainer.cpp b/xbmc/epg/GUIEPGGridContainer.cpp
-index 44ff0dd..ec3c4b04 100644
---- a/xbmc/epg/GUIEPGGridContainer.cpp
-+++ b/xbmc/epg/GUIEPGGridContainer.cpp
-@@ -346,14 +346,6 @@ void CGUIEPGGridContainer::RenderItem(float posX, float posY, CGUIListItem *item
-   g_graphicsContext.RestoreOrigin();
- }
- 
--void CGUIEPGGridContainer::ResetCoordinates()
--{
--  m_channelCursor = 0;
--  m_channelOffset = 0;
--  m_blockCursor = 0;
--  m_blockOffset = 0;
--}
--
- bool CGUIEPGGridContainer::OnAction(const CAction &action)
- {
-   switch (action.GetID())
-@@ -603,9 +595,23 @@ void CGUIEPGGridContainer::UpdateItems()
- 
-   if (prevSelectedEpgTag && (oldChannelIndex != 0 || oldBlockIndex != 0))
-   {
--    if (m_gridModel->GetGridItem(newChannelIndex, newBlockIndex)->GetEPGInfoTag() != prevSelectedEpgTag)
-+    if (newChannelIndex >= m_gridModel->ChannelItemsSize() ||
-+        newBlockIndex >= m_gridModel->GetBlockCount() ||
-+        m_gridModel->GetGridItem(newChannelIndex, newBlockIndex)->GetEPGInfoTag() != prevSelectedEpgTag)
-+    {
-       m_gridModel->FindChannelAndBlockIndex(channelUid, broadcastUid, eventOffset, newChannelIndex, newBlockIndex);
- 
-+      if (newChannelIndex == CGUIEPGGridContainerModel::INVALID_INDEX ||
-+          newBlockIndex == CGUIEPGGridContainerModel::INVALID_INDEX)
-+      {
-+        // previous selection is no longer in grid, goto channel 0 and now
-+        SetInvalid();
-+        GoToChannel(0);
-+        GoToNow();
-+        return;
-+      }
-+    }
-+
-     // restore previous selection.
-     if (newChannelIndex == oldChannelIndex && newBlockIndex == oldBlockIndex)
-     {
-@@ -1382,18 +1388,18 @@ void CGUIEPGGridContainer::SetTimelineItems(const std::unique_ptr<CFileItemList>
- 
- void CGUIEPGGridContainer::GoToChannel(int channelIndex)
- {
--  if (channelIndex > m_gridModel->ChannelItemsSize() - m_channelsPerPage)
--  {
--    // last page
--    ScrollToChannelOffset(m_gridModel->ChannelItemsSize() - m_channelsPerPage);
--    SetChannel(channelIndex - (m_gridModel->ChannelItemsSize() - m_channelsPerPage));
--  }
--  else if (channelIndex < m_channelsPerPage)
-+  if (channelIndex < m_channelsPerPage)
-   {
-     // first page
-     ScrollToChannelOffset(0);
-     SetChannel(channelIndex);
-   }
-+  else if (channelIndex > m_gridModel->ChannelItemsSize() - m_channelsPerPage)
-+  {
-+    // last page
-+    ScrollToChannelOffset(m_gridModel->ChannelItemsSize() - m_channelsPerPage);
-+    SetChannel(channelIndex - (m_gridModel->ChannelItemsSize() - m_channelsPerPage));
-+  }
-   else
-   {
-     ScrollToChannelOffset(channelIndex - m_channelCursor);
-@@ -1403,17 +1409,12 @@ void CGUIEPGGridContainer::GoToChannel(int channelIndex)
- 
- void CGUIEPGGridContainer::GoToBlock(int blockIndex)
- {
--  if (blockIndex > m_gridModel->GetBlockCount() - m_blocksPerPage)
-+  int lastPage = m_gridModel->GetBlockCount() - m_blocksPerPage;
-+  if (blockIndex > lastPage)
-   {
--    // last block
--    ScrollToBlockOffset(m_gridModel->GetBlockCount() - m_blocksPerPage);
--    SetBlock(blockIndex - (m_gridModel->GetBlockCount() - m_blocksPerPage));
--  }
--  else if (blockIndex < m_blocksPerPage)
--  {
--    // first block
--    ScrollToBlockOffset(0);
--    SetBlock(blockIndex);
-+    // last page
-+    ScrollToBlockOffset(lastPage);
-+    SetBlock(blockIndex - lastPage);
-   }
-   else
-   {
-diff --git a/xbmc/epg/GUIEPGGridContainer.h b/xbmc/epg/GUIEPGGridContainer.h
-index cf6a901..72be679 100644
---- a/xbmc/epg/GUIEPGGridContainer.h
-+++ b/xbmc/epg/GUIEPGGridContainer.h
-@@ -84,7 +84,6 @@ namespace EPG
-     void SetTimelineItems(const std::unique_ptr<CFileItemList> &items, const CDateTime &gridStart, const CDateTime &gridEnd);
-     void SetChannel(const PVR::CPVRChannelPtr &channel);
-     void SetChannel(const std::string &channel);
--    void ResetCoordinates();
- 
-   protected:
-     bool OnClick(int actionID);
-diff --git a/xbmc/epg/GUIEPGGridContainerModel.cpp b/xbmc/epg/GUIEPGGridContainerModel.cpp
-index 1abc733..69fd12d 100644
---- a/xbmc/epg/GUIEPGGridContainerModel.cpp
-+++ b/xbmc/epg/GUIEPGGridContainerModel.cpp
-@@ -272,6 +272,9 @@ void CGUIEPGGridContainerModel::FindChannelAndBlockIndex(int channelUid, unsigne
-   const CDateTimeSpan blockDuration(0, 0, MINSPERBLOCK, 0);
-   bool bFoundPrevChannel = false;
- 
-+  newChannelIndex = INVALID_INDEX;
-+  newBlockIndex = INVALID_INDEX;
-+
-   for (size_t channel = 0; channel < m_channelItems.size(); ++channel)
-   {
-     CDateTime gridCursor(m_gridStart); //reset cursor for new channel
-diff --git a/xbmc/epg/GUIEPGGridContainerModel.h b/xbmc/epg/GUIEPGGridContainerModel.h
-index 06e66ec..716816e 100644
---- a/xbmc/epg/GUIEPGGridContainerModel.h
-+++ b/xbmc/epg/GUIEPGGridContainerModel.h
-@@ -53,6 +53,7 @@ namespace EPG
-     void Refresh(const std::unique_ptr<CFileItemList> &items, const CDateTime &gridStart, const CDateTime &gridEnd, int iRulerUnit, int iBlocksPerPage, float fBlockSize);
-     void SetInvalid();
- 
-+    static const int INVALID_INDEX = -1;
-     void FindChannelAndBlockIndex(int channelUid, unsigned int broadcastUid, int eventOffset, int &newChannelIndex, int &newBlockIndex) const;
- 
-     void FreeChannelMemory(int keepStart, int keepEnd);
-diff --git a/xbmc/pvr/windows/GUIWindowPVRGuide.cpp b/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-index 93e875c..2193f951 100644
---- a/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-+++ b/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-@@ -508,10 +508,16 @@ void CGUIWindowPVRGuide::GetViewNextItems(CFileItemList &items)
- 
- bool CGUIWindowPVRGuide::RefreshTimelineItems()
- {
--  if (m_bRefreshTimelineItems)
-+  bool bRefreshTimelineItems;
-   {
-+    CSingleLock lock(m_critSection);
-+
-+    bRefreshTimelineItems = m_bRefreshTimelineItems;
-     m_bRefreshTimelineItems = false;
-+  }
- 
-+  if (bRefreshTimelineItems)
-+  {
-     CGUIEPGGridContainer* epgGridContainer = GetGridControl();
-     if (epgGridContainer)
-     {
-@@ -557,18 +563,11 @@ bool CGUIWindowPVRGuide::RefreshTimelineItems()
- void CGUIWindowPVRGuide::GetViewTimelineItems(CFileItemList &items)
- {
-   bool bRefresh = false;
--
-   {
-     CSingleLock lock(m_critSection);
- 
--    // group change detected reset grid coordinates and refresh grid items
-     if (!m_bRefreshTimelineItems && *m_cachedChannelGroup != *GetChannelGroup())
-     {
--      CGUIEPGGridContainer* epgGridContainer = GetGridControl();
--      if (!epgGridContainer)
--        return;
--
--      epgGridContainer->ResetCoordinates();
-       m_bRefreshTimelineItems = true;
-       bRefresh = true;
-     }
-
-From 3bcba5bb2a8956ff6254f1c8308fd3215966ce84 Mon Sep 17 00:00:00 2001
-From: Kai Sommerfeld <kai.sommerfeld@gmx.com>
-Date: Sun, 19 Mar 2017 22:48:52 +0100
-Subject: [PATCH 2/3] [PVR] Guide window: Implement asynchronous channel group
- switching.
-
----
- xbmc/pvr/windows/GUIWindowPVRGuide.cpp | 61 +++++++++++++++++++++++++---------
- xbmc/pvr/windows/GUIWindowPVRGuide.h   |  6 ++++
- 2 files changed, 52 insertions(+), 15 deletions(-)
-
-diff --git a/xbmc/pvr/windows/GUIWindowPVRGuide.cpp b/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-index 2193f951..b74cc36 100644
---- a/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-+++ b/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-@@ -19,6 +19,7 @@
-  */
- 
- #include "ContextMenuManager.h"
-+#include "dialogs/GUIDialogBusy.h"
- #include "epg/GUIEPGGridContainer.h"
- #include "GUIUserMessages.h"
- #include "epg/EpgContainer.h"
-@@ -41,8 +42,7 @@ using namespace PVR;
- using namespace EPG;
- 
- CGUIWindowPVRGuide::CGUIWindowPVRGuide(bool bRadio) :
--  CGUIWindowPVRBase(bRadio, bRadio ? WINDOW_RADIO_GUIDE : WINDOW_TV_GUIDE, "MyPVRGuide.xml"),
--  m_cachedChannelGroup(new CPVRChannelGroup)
-+  CGUIWindowPVRBase(bRadio, bRadio ? WINDOW_RADIO_GUIDE : WINDOW_TV_GUIDE, "MyPVRGuide.xml")
- {
-   m_bRefreshTimelineItems = false;
-   g_EpgContainer.RegisterObserver(this);
-@@ -68,15 +68,15 @@ void CGUIWindowPVRGuide::Init()
-     epgGridContainer->GoToNow();
-   }
- 
--  m_bRefreshTimelineItems = true;
-   StartRefreshTimelineItemsThread();
-+  m_bRefreshTimelineItems = true; // force data update on window open/re-open
- }
- 
- void CGUIWindowPVRGuide::ClearData()
- {
-   {
-     CSingleLock lock(m_critSection);
--    m_cachedChannelGroup.reset(new CPVRChannelGroup);
-+    m_cachedChannelGroup.reset();
-     m_newTimeline.reset();
-   }
- 
-@@ -97,7 +97,6 @@ void CGUIWindowPVRGuide::OnInitWindow()
- void CGUIWindowPVRGuide::OnDeinitWindow(int nextWindowID)
- {
-   StopRefreshTimelineItemsThread();
--  m_bRefreshTimelineItems = false;
- 
-   CGUIWindowPVRBase::OnDeinitWindow(nextWindowID);
- }
-@@ -112,7 +111,10 @@ void CGUIWindowPVRGuide::StartRefreshTimelineItemsThread()
- void CGUIWindowPVRGuide::StopRefreshTimelineItemsThread()
- {
-   if (m_refreshTimelineItemsThread)
--    m_refreshTimelineItemsThread->StopThread(false);
-+  {
-+    m_bRefreshTimelineItems = false;
-+    m_refreshTimelineItemsThread->Stop();
-+  }
- }
- 
- void CGUIWindowPVRGuide::Notify(const Observable &obs, const ObservableMessage msg)
-@@ -562,20 +564,22 @@ bool CGUIWindowPVRGuide::RefreshTimelineItems()
- 
- void CGUIWindowPVRGuide::GetViewTimelineItems(CFileItemList &items)
- {
--  bool bRefresh = false;
-+  bool bRefreshTimelineItems = false;
-+
-   {
-     CSingleLock lock(m_critSection);
- 
--    if (!m_bRefreshTimelineItems && *m_cachedChannelGroup != *GetChannelGroup())
-+    if (m_cachedChannelGroup && *m_cachedChannelGroup != *GetChannelGroup())
-     {
-+      // channel group change and not very first open of this window. force immediate update.
-       m_bRefreshTimelineItems = true;
--      bRefresh = true;
-+      bRefreshTimelineItems = true;
-     }
-   }
- 
--  // never call RefreshTimelineItems with locked mutex!
--  if (bRefresh)
--    RefreshTimelineItems();
-+  // never call DoRefresh with locked mutex!
-+  if (bRefreshTimelineItems)
-+    m_refreshTimelineItemsThread->DoRefresh();
- 
-   {
-     CSingleLock lock(m_critSection);
-@@ -704,8 +708,23 @@ bool CGUIWindowPVRGuide::OnContextButtonDeleteTimer(CFileItem *item, CONTEXT_BUT
- 
- CPVRRefreshTimelineItemsThread::CPVRRefreshTimelineItemsThread(CGUIWindowPVRGuide *pGuideWindow)
- : CThread("epg-grid-refresh-timeline-items"),
--  m_pGuideWindow(pGuideWindow)
-+  m_pGuideWindow(pGuideWindow),
-+  m_ready(true),
-+  m_done(false)
-+{
-+}
-+
-+void CPVRRefreshTimelineItemsThread::Stop()
- {
-+  StopThread(false);
-+  m_ready.Set(); // wake up the worker thread to let it exit
-+}
-+
-+void CPVRRefreshTimelineItemsThread::DoRefresh()
-+{
-+  m_ready.Set(); // wake up the worker thread
-+  m_done.Reset();
-+  CGUIDialogBusy::WaitOnEvent(m_done, 100, false);
- }
- 
- void CPVRRefreshTimelineItemsThread::Process()
-@@ -717,12 +736,19 @@ void CPVRRefreshTimelineItemsThread::Process()
- 
-   while (!m_bStop)
-   {
-+    m_done.Reset();
-+
-     if (m_pGuideWindow->RefreshTimelineItems() && !m_bStop)
-     {
-       CGUIMessage m(GUI_MSG_REFRESH_LIST, m_pGuideWindow->GetID(), 0, ObservableMessageEpg);
-       KODI::MESSAGING::CApplicationMessenger::GetInstance().SendGUIMessage(m);
-     }
- 
-+    if (m_bStop)
-+      break;
-+
-+    m_done.Set();
-+
-     // in order to fill the guide window asap, use a short update interval until we the
-     // same amount of epg events for BOOSTED_SLEEPS_THRESHOLD + 1 times in a row .
-     if (iUpdatesWithoutChange < BOOSTED_SLEEPS_THRESHOLD)
-@@ -736,11 +762,16 @@ void CPVRRefreshTimelineItemsThread::Process()
- 
-       iLastEpgItemsCount = iCurrentEpgItemsCount;
- 
--      Sleep(1000); // boosted update cycle
-+      m_ready.WaitMSec(1000); // boosted update cycle
-     }
-     else
-     {
--      Sleep(5000); // normal update cycle
-+      m_ready.WaitMSec(5000); // normal update cycle
-     }
-+
-+    m_ready.Reset();
-   }
-+
-+  m_ready.Reset();
-+  m_done.Set();
- }
-diff --git a/xbmc/pvr/windows/GUIWindowPVRGuide.h b/xbmc/pvr/windows/GUIWindowPVRGuide.h
-index d5a41fc..c4f0682 100644
---- a/xbmc/pvr/windows/GUIWindowPVRGuide.h
-+++ b/xbmc/pvr/windows/GUIWindowPVRGuide.h
-@@ -21,6 +21,7 @@
- 
- #include <atomic>
- #include <memory>
-+#include "threads/Event.h"
- #include "threads/Thread.h"
- #include "GUIWindowPVRBase.h"
- 
-@@ -99,7 +100,12 @@ namespace PVR
- 
-     virtual void Process();
- 
-+    void DoRefresh();
-+    void Stop();
-+
-   private:
-     CGUIWindowPVRGuide *m_pGuideWindow;
-+    CEvent m_ready;
-+    CEvent m_done;
-   };
- }
-
-From cf2f80904d7713c021f996b94d60142548536dc0 Mon Sep 17 00:00:00 2001
-From: Kai Sommerfeld <kai.sommerfeld@gmx.com>
-Date: Tue, 21 Mar 2017 23:09:05 +0100
-Subject: [PATCH 3/3] [PVR] Guide window: Optimize re-opening of the window
- (re-use item list if no changes came in while window was not active).
-
----
- xbmc/pvr/windows/GUIWindowPVRGuide.cpp | 25 +++++++++++++++++++------
- 1 file changed, 19 insertions(+), 6 deletions(-)
-
-diff --git a/xbmc/pvr/windows/GUIWindowPVRGuide.cpp b/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-index b74cc36..9c4d338 100644
---- a/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-+++ b/xbmc/pvr/windows/GUIWindowPVRGuide.cpp
-@@ -51,6 +51,8 @@ CGUIWindowPVRGuide::CGUIWindowPVRGuide(bool bRadio) :
- CGUIWindowPVRGuide::~CGUIWindowPVRGuide(void)
- {
-   g_EpgContainer.UnregisterObserver(this);
-+
-+  m_bRefreshTimelineItems = false;
-   StopRefreshTimelineItemsThread();
- }
- 
-@@ -68,8 +70,13 @@ void CGUIWindowPVRGuide::Init()
-     epgGridContainer->GoToNow();
-   }
- 
-+  if (!m_refreshTimelineItemsThread)
-+  {
-+    CSingleLock lock(m_critSection);
-+    m_bRefreshTimelineItems = true; // force data update on first window open
-+  }
-+
-   StartRefreshTimelineItemsThread();
--  m_bRefreshTimelineItems = true; // force data update on window open/re-open
- }
- 
- void CGUIWindowPVRGuide::ClearData()
-@@ -98,6 +105,16 @@ void CGUIWindowPVRGuide::OnDeinitWindow(int nextWindowID)
- {
-   StopRefreshTimelineItemsThread();
- 
-+  {
-+    CSingleLock lock(m_critSection);
-+    if (m_vecItems && !m_newTimeline)
-+    {
-+      // speedup: save a copy of current items for reuse when re-opening the window
-+      m_newTimeline.reset(new CFileItemList);
-+      m_newTimeline->Copy(*m_vecItems);
-+    }
-+  }
-+
-   CGUIWindowPVRBase::OnDeinitWindow(nextWindowID);
- }
- 
-@@ -111,16 +128,12 @@ void CGUIWindowPVRGuide::StartRefreshTimelineItemsThread()
- void CGUIWindowPVRGuide::StopRefreshTimelineItemsThread()
- {
-   if (m_refreshTimelineItemsThread)
--  {
--    m_bRefreshTimelineItems = false;
-     m_refreshTimelineItemsThread->Stop();
--  }
- }
- 
- void CGUIWindowPVRGuide::Notify(const Observable &obs, const ObservableMessage msg)
- {
--  if (IsActive() &&
--      m_viewControl.GetCurrentControl() == GUIDE_VIEW_TIMELINE &&
-+  if (m_viewControl.GetCurrentControl() == GUIDE_VIEW_TIMELINE &&
-       (msg == ObservableMessageEpg ||
-        msg == ObservableMessageEpgContainer ||
-        msg == ObservableMessageChannelGroupReset ||
diff --git a/packages/mediacenter/kodi/patches/kodi-999.90-PR11917.patch b/packages/mediacenter/kodi/patches/kodi-999.90-PR11917.patch
deleted file mode 100644
index 8546fb1566b..00000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.90-PR11917.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-From 0cfdd9e73c2b24c404217294d2274cd75003c14a Mon Sep 17 00:00:00 2001
-From: Kai Sommerfeld <kai.sommerfeld@gmx.com>
-Date: Wed, 29 Mar 2017 20:07:57 +0200
-Subject: [PATCH] [PVR] Krypton: Quick and dirty fix for trac #17374.
-
----
- xbmc/Application.cpp    | 3 +++
- xbmc/pvr/PVRManager.cpp | 3 ---
- 2 files changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index b8ff91b..947f093 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -2932,6 +2932,9 @@ void CApplication::Stop(int exitCode)
-     CLog::Log(LOGNOTICE, "stop player");
-     m_pPlayer->ClosePlayer();
- 
-+    // quick and dirty Krypton-only fix for http://trac.kodi.tv/ticket/17374
-+    g_PVRManager.SetWakeupCommand();
-+
-     StopServices();
- 
- #ifdef HAS_ZEROCONF
-diff --git a/xbmc/pvr/PVRManager.cpp b/xbmc/pvr/PVRManager.cpp
-index 52a8e1b..18f8fbd 100644
---- a/xbmc/pvr/PVRManager.cpp
-+++ b/xbmc/pvr/PVRManager.cpp
-@@ -493,9 +493,6 @@ void CPVRManager::Unload()
- 
- void CPVRManager::Shutdown()
- {
--  // set system wakeup data
--  SetWakeupCommand();
--
-   Unload();
- 
-   // release addons
diff --git a/packages/mediacenter/kodi/patches/kodi-999.90-PR12008.patch b/packages/mediacenter/kodi/patches/kodi-999.90-PR12008.patch
new file mode 100644
index 00000000000..39ca76087e8
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-999.90-PR12008.patch
@@ -0,0 +1,99 @@
+From f35298ec0a9e5dfbd38a7a64f3b5eb3bb3c219af Mon Sep 17 00:00:00 2001
+From: fritsch <Peter.Fruehberger@gmail.com>
+Date: Mon, 24 Apr 2017 20:05:07 +0200
+Subject: [PATCH] AESinkPULSE: Return to PA's delay infrastructure
+
+---
+ xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp | 27 +++++----------------------
+ xbmc/cores/AudioEngine/Sinks/AESinkPULSE.h   |  2 --
+ 2 files changed, 5 insertions(+), 24 deletions(-)
+
+diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp
+index 5b86767..718a497 100644
+--- a/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp
++++ b/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp
+@@ -22,7 +22,6 @@
+ #include "AESinkPULSE.h"
+ #include "utils/log.h"
+ #include "Util.h"
+-#include "utils/TimeUtils.h"
+ #include "guilib/LocalizeStrings.h"
+ #include "Application.h"
+ #include "cores/AudioEngine/Engines/ActiveAE/ActiveAE.h"
+@@ -486,8 +485,6 @@ CAESinkPULSE::CAESinkPULSE()
+   m_MainLoop = NULL;
+   m_BytesPerSecond = 0;
+   m_BufferSize = 0;
+-  m_filled_bytes = 0;
+-  m_lastPackageStamp = 0;
+   m_Channels = 0;
+   m_Stream = NULL;
+   m_Context = NULL;
+@@ -511,8 +508,6 @@ bool CAESinkPULSE::Initialize(AEAudioFormat &format, std::string &device)
+   m_passthrough = false;
+   m_BytesPerSecond = 0;
+   m_BufferSize = 0;
+-  m_filled_bytes = 0;
+-  m_lastPackageStamp = 0;
+   m_Channels = 0;
+   m_Stream = NULL;
+   m_Context = NULL;
+@@ -790,8 +785,6 @@ void CAESinkPULSE::Deinitialize()
+   m_IsAllocated = false;
+   m_passthrough = false;
+   m_periodSize = 0;
+-  m_filled_bytes = 0;
+-  m_lastPackageStamp = 0;
+ 
+   if (m_Stream)
+     Drain();
+@@ -833,22 +826,14 @@ void CAESinkPULSE::GetDelay(AEDelayStatus& status)
+   }
+ 
+   pa_threaded_mainloop_lock(m_MainLoop);
+-  const pa_timing_info* pti = pa_stream_get_timing_info(m_Stream);
+-  // only incorporate local sink delay + internal PA transport delay
+-  double sink_delay = (pti->configured_sink_usec / 1000000.0);
+-  double transport_delay = pti->transport_usec / 1000000.0;
++  pa_usec_t r_usec;
++  int negative;
+ 
+-  uint64_t diff = CurrentHostCounter() - m_lastPackageStamp;
+-  unsigned int bytes_played = (unsigned int) ((double) diff * (double) m_BytesPerSecond  / (double) CurrentHostFrequency() + 0.5);
+-
+-  int buffer_delay = m_filled_bytes - bytes_played;
+-  if (buffer_delay < 0)
+-    buffer_delay = 0;
++  if (pa_stream_get_latency(m_Stream, &r_usec, &negative) < 0 || negative)
++    r_usec = 0;
+ 
+   pa_threaded_mainloop_unlock(m_MainLoop);
+-
+-  double delay = buffer_delay / (double) m_BytesPerSecond + sink_delay + transport_delay;
+-  status.SetDelay(delay);
++  status.SetDelay(r_usec / 1000000.0);
+ }
+ 
+ double CAESinkPULSE::GetCacheTotal()
+@@ -886,8 +871,6 @@ unsigned int CAESinkPULSE::AddPackets(uint8_t **data, unsigned int frames, unsig
+     CLog::Log(LOGERROR, "CPulseAudioDirectSound::AddPackets - pa_stream_write failed\n");
+     return 0;
+   }
+-  m_lastPackageStamp = CurrentHostCounter();
+-  m_filled_bytes = m_BufferSize - (free - length);
+   unsigned int res = (unsigned int)(length / m_format.m_frameSize);
+ 
+   return res;
+diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.h b/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.h
+index ac0525f..0bdf5d2 100644
+--- a/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.h
++++ b/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.h
+@@ -70,8 +70,6 @@ class CAESinkPULSE : public IAESink
+   pa_cvolume m_Volume;
+   bool m_volume_needs_update;
+   uint32_t m_periodSize;
+-  uint64_t m_lastPackageStamp;
+-  uint64_t m_filled_bytes;
+ 
+   pa_context *m_Context;
+   pa_threaded_mainloop *m_MainLoop;
diff --git a/packages/mediacenter/kodi/patches/kodi-999.99-SMB3_support.patch b/packages/mediacenter/kodi/patches/kodi-999.99-SMB3_support.patch
new file mode 100644
index 00000000000..a9270379965
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-999.99-SMB3_support.patch
@@ -0,0 +1,17 @@
+diff --git a/xbmc/filesystem/SMBFile.cpp b/xbmc/filesystem/SMBFile.cpp
+index 08fc390..dd1dcb3 100644
+--- a/xbmc/filesystem/SMBFile.cpp
++++ b/xbmc/filesystem/SMBFile.cpp
+@@ -136,6 +136,9 @@ void CSMB::Init()
+         if (g_advancedSettings.m_sambadoscodepage.length() > 0)
+           fprintf(f, "\tdos charset = %s\n", g_advancedSettings.m_sambadoscodepage.c_str());
+ 
++        // Allow better than SMB1/NT1 to be negotiated (if supported); written unconditionally
++        fprintf(f, "\tclient max protocol = SMB3\n");
++
+         fclose(f);
+       }
+     }
+-- 
+2.7.4
+
diff --git a/packages/mediacenter/kodi/scripts/kodi.sh b/packages/mediacenter/kodi/scripts/kodi.sh
index b7b33e320d7..35ae97eea1a 100755
--- a/packages/mediacenter/kodi/scripts/kodi.sh
+++ b/packages/mediacenter/kodi/scripts/kodi.sh
@@ -92,6 +92,13 @@ fi
 # clean up any stale cores. just in case
 rm -f /storage/.cache/cores/*
 
+# clean 0 byte database files (quoted paths; an unmatched glob is skipped by the -f test)
+for file in /storage/.kodi/userdata/Database/*.db; do
+  if [ -f "$file" ] && [ ! -s "$file" ]; then
+    rm -f "$file"
+  fi
+done
+
 /usr/lib/kodi/kodi.bin $SAVED_ARGS
 RET=$?
 
diff --git a/packages/mediacenter/kodi/system.d/kodi-cleanlogs.service b/packages/mediacenter/kodi/system.d/kodi-cleanlogs.service
deleted file mode 100644
index 6f1ae9c683c..00000000000
--- a/packages/mediacenter/kodi/system.d/kodi-cleanlogs.service
+++ /dev/null
@@ -1,13 +0,0 @@
-[Unit]
-Description=Kodi clean debug logs
-ConditionKernelCommandLine=!debugging
-ConditionPathExists=!/storage/.cache/debug.openelec
-Before=kodi.service
-
-[Service]
-Type=oneshot
-ExecStart=-/bin/sh -c 'rm -rf /storage/.kodi/userdata/addon_data/*/*.log /storage/.kodi/userdata/addon_data/*/log/*'
-RemainAfterExit=yes
-
-[Install]
-WantedBy=kodi.service
diff --git a/packages/mediacenter/kodi/system.d/kodi-waitonnetwork.service b/packages/mediacenter/kodi/system.d/kodi-waitonnetwork.service
deleted file mode 100644
index 9785da7fdc5..00000000000
--- a/packages/mediacenter/kodi/system.d/kodi-waitonnetwork.service
+++ /dev/null
@@ -1,19 +0,0 @@
-[Unit]
-Description=Wait on network
-Requisite=connman.service
-After=connman.service
-Before=network-online.target
-DefaultDependencies=no
-Conflicts=shutdown.target
-ConditionPathExists=/storage/.cache/openelec/network_wait
-
-[Service]
-Type=oneshot
-EnvironmentFile=/storage/.cache/openelec/network_wait
-ExecStartPre=/bin/sh -c 'echo "waiting on Network to come online ... (max. $WAIT_NETWORK_TIME sec.)"'
-ExecStart=/usr/sbin/connmand-wait-online --timeout=${WAIT_NETWORK_TIME}
-StandardOutput=tty
-RemainAfterExit=yes
-
-[Install]
-WantedBy=network-online.target
diff --git a/packages/mediacenter/kodi/system.d/kodi.service b/packages/mediacenter/kodi/system.d/kodi.service
index d70ff33c2fa..7ad30e1c8a0 100644
--- a/packages/mediacenter/kodi/system.d/kodi.service
+++ b/packages/mediacenter/kodi/system.d/kodi.service
@@ -14,6 +14,7 @@ Environment=KODI_HOME=/usr/share/kodi/
 EnvironmentFile=-/run/openelec/kodi.conf
 EnvironmentFile=-/run/openelec/debug/kodi.conf
 ExecStartPre=-/usr/lib/kodi/kodi-config
+ExecStartPre=-/bin/sh -c 'rm -rf /storage/.kodi/userdata/addon_data/*/*.log /storage/.kodi/userdata/addon_data/*/log/*'
 ExecStart=/usr/lib/kodi/kodi.sh --standalone -fs $KODI_ARGS $KODI_DEBUG
 ExecStop=/bin/kill -TERM $MAINPID
 TimeoutStopSec=5
diff --git a/packages/mediacenter/p8-platform/package.mk b/packages/mediacenter/p8-platform/package.mk
index b156578e2e8..d047d0b4d2e 100644
--- a/packages/mediacenter/p8-platform/package.mk
+++ b/packages/mediacenter/p8-platform/package.mk
@@ -34,8 +34,8 @@ PKG_IS_ADDON="no"
 PKG_AUTORECONF="no"
 
 PKG_CMAKE_OPTS_TARGET="-DCMAKE_INSTALL_PREFIX=/usr \
-                       -DCMAKE_INSTALL_LIBDIR=lib \
-                       -DCMAKE_INSTALL_LIBDIR_NOARCH=lib \
+                       -DCMAKE_INSTALL_LIBDIR:STRING=lib \
+                       -DCMAKE_INSTALL_LIBDIR_NOARCH:STRING=lib \
                        -DCMAKE_INSTALL_PREFIX_TOOLCHAIN=$SYSROOT_PREFIX/usr \
                        -DCMAKE_PREFIX_PATH=$SYSROOT_PREFIX/usr \
                        -DBUILD_SHARED_LIBS=0"
diff --git a/packages/mediacenter/p8-platform/patches/p8-platform-01-fix.patch b/packages/mediacenter/p8-platform/patches.bk/p8-platform-01-fix.patch
similarity index 100%
rename from packages/mediacenter/p8-platform/patches/p8-platform-01-fix.patch
rename to packages/mediacenter/p8-platform/patches.bk/p8-platform-01-fix.patch
diff --git a/packages/mediacenter/p8-platform/patches/p8-platform-01-revert-cc-badness.patch b/packages/mediacenter/p8-platform/patches/p8-platform-01-revert-cc-badness.patch
new file mode 100644
index 00000000000..c060921a361
--- /dev/null
+++ b/packages/mediacenter/p8-platform/patches/p8-platform-01-revert-cc-badness.patch
@@ -0,0 +1,28 @@
+From f91594676d1f75530addd87363ccbc6510efb84e Mon Sep 17 00:00:00 2001
+From: Stefan Saraev <stefan@saraev.ca>
+Date: Fri, 8 May 2015 11:19:42 +0300
+Subject: [PATCH] revert cc badness
+
+this reverts upstream commit 68f8418
+---
+ CMakeLists.txt |    6 ------
+ 1 file changed, 6 deletions(-)
+
+diff -Naur a/CMakeLists.txt b/CMakeLists.txt
+--- a/CMakeLists.txt	2016-01-05 23:58:40.000000000 +0100
++++ b/CMakeLists.txt	2016-01-06 01:26:52.004076744 +0100
+@@ -22,12 +22,6 @@
+                    src/windows/os-threads.cpp)
+ endif()
+ 
+-set(p8-platform_INCLUDE_DIRS "${CMAKE_INSTALL_PREFIX}/include/p8-platform")
+-IF(WIN32)
+-  LIST(APPEND p8-platform_INCLUDE_DIRS "${CMAKE_INSTALL_PREFIX}/include/p8-platform/windows")
+-ENDIF(WIN32)
+-set(p8-platform_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+-
+ if(NOT ${CORE_SYSTEM_NAME} STREQUAL "")
+   if(${CORE_SYSTEM_NAME} STREQUAL "osx" OR ${CORE_SYSTEM_NAME} STREQUAL "ios")
+     list(APPEND p8-platform_LIBRARIES "-framework CoreVideo -framework IOKit")
+-- 
+1.7.10.4
diff --git a/packages/mediacenter/p8-platform/patches/p8-platform-02-cmake_install_prefix_toolchain.patch b/packages/mediacenter/p8-platform/patches/p8-platform-02-cmake_install_prefix_toolchain.patch
new file mode 100644
index 00000000000..6f2a694a71e
--- /dev/null
+++ b/packages/mediacenter/p8-platform/patches/p8-platform-02-cmake_install_prefix_toolchain.patch
@@ -0,0 +1,32 @@
+diff -Naur a/p8-platform-config.cmake.in b/p8-platform-config.cmake.in
+--- a/p8-platform-config.cmake.in	2016-01-19 20:51:52.000000000 +0100
++++ b/p8-platform-config.cmake.in	2017-01-03 19:48:27.000000000 +0100
+@@ -10,16 +10,16 @@
+ #
+ # propagate these properties from one build system to the other
+ set (p8-platform_VERSION "@p8-platform_VERSION_MAJOR@.@p8-platform_VERSION_MINOR@")
+-set (p8-platform_INCLUDE_DIRS @p8-platform_INCLUDE_DIRS@ @CMAKE_INSTALL_PREFIX@/include)
++set (p8-platform_INCLUDE_DIRS @p8-platform_INCLUDE_DIRS@ @CMAKE_INSTALL_PREFIX_TOOLCHAIN@/include/p8-platform)
+ set (p8-platform_LIBRARY_DIRS "@CMAKE_LIBRARY_OUTPUT_DIRECTORY@")
+ set (p8-platform_LINKER_FLAGS "@p8-platform_LINKER_FLAGS@")
+ set (p8-platform_CONFIG_VARS "@p8-platform_CONFIG_VARS@")
+ 
+ # libraries come from the build tree where this file was generated
+ if(WIN32)
+-  set (p8-platform_LIBRARY "@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/p8-platform.lib")
++  set (p8-platform_LIBRARY "@CMAKE_INSTALL_PREFIX_TOOLCHAIN@/@CMAKE_INSTALL_LIBDIR@/p8-platform.lib")
+ else(WIN32)
+-  set (p8-platform_LIBRARY "-L@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ -lp8-platform")
++  set (p8-platform_LIBRARY "-L@CMAKE_INSTALL_PREFIX_TOOLCHAIN@/@CMAKE_INSTALL_LIBDIR@ -lp8-platform")
+ endif(WIN32)
+ set (p8-platform_LIBRARIES ${p8-platform_LIBRARY} "@p8-platform_LIBRARIES@")
+ mark_as_advanced (p8-platform_LIBRARY)
+diff -Naur a/p8-platform.pc.in b/p8-platform.pc.in
+--- a/p8-platform.pc.in	2016-01-19 20:51:52.000000000 +0100
++++ b/p8-platform.pc.in	2017-01-03 19:48:13.000000000 +0100
+@@ -7,4 +7,4 @@
+ Description: @p8-platform_DESCRIPTION@ @p8-platform_VERSION_MAJOR@.@p8-platform_VERSION_MINOR@
+ Version: @p8-platform_VERSION_MAJOR@.@p8-platform_VERSION_MINOR@.@p8-platform_VERSION_PATCH@
+ Libs: -L${libdir} -lp8-platform
+-Cflags: -I${includedir}
++Cflags: -I${includedir}/p8-platform
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-152-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-152-pfcd_hevc_optimisations.patch
index 2ba89df63d2..a306368b42d 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-152-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-152-pfcd_hevc_optimisations.patch
@@ -1,6 +1,6 @@
 diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 --- ffmpeg-3.2.4/ffmpeg.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/ffmpeg.c	2017-03-22 22:42:34.812798482 +0100
++++ ffmpeg-3.2.4.patch/ffmpeg.c	2017-05-28 20:42:45.712088573 +0200
 @@ -23,6 +23,11 @@
   * multimedia converter based on the FFmpeg libraries
   */
@@ -39,7 +39,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
  #if HAVE_SYS_RESOURCE_H
  #include <sys/time.h>
  #include <sys/types.h>
-@@ -160,6 +184,169 @@
+@@ -160,6 +184,182 @@
  static void free_input_threads(void);
  #endif
  
@@ -87,7 +87,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +  mmal_buffer_header_release(buffer);
 +}
 +
-+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h)
 +{
 +    MMAL_COMPONENT_T* display;
 +    MMAL_DISPLAYREGION_T region =
@@ -98,7 +98,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +        .fullscreen = 0,
 +        .dest_rect = {x, y, w, h}
 +    };
-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
 +
 +    bcm_host_init();  // TODO is this needed?
 +    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
@@ -108,7 +108,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +
 +    {
 +        MMAL_ES_FORMAT_T* format = display->input[0]->format;
-+        format->encoding = MMAL_ENCODING_I420;
++        format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420;
 +        format->es->video.width = geo.stride_y;
 +        format->es->video.height = geo.height_y;
 +        format->es->video.crop.x = 0;
@@ -125,7 +125,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +    mmal_port_enable(display->input[0],display_cb_input);
 +    mmal_port_enable(display->control,display_cb_control);
 +
++    printf("Allocated display %zux%zu in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
++    printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
 +
 +    return display;
 +}
@@ -155,12 +155,24 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +#ifdef RPI_ZERO_COPY
 +{
 +    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
++    if (fr_buf == NULL) {
++        mmal_buffer_header_release(buf);
++        return;
++    }
 +
 +    buf->user_data = fr_buf;
 +    buf->data = av_rpi_zc_vc_handle(fr_buf);
-+    buf->alloc_size =
-+        buf->length = av_rpi_zc_numbytes(fr_buf);
-+
++    buf->offset = av_rpi_zc_offset(fr_buf);
++    buf->length = av_rpi_zc_length(fr_buf);
++    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
++#if 0
++    {
++        unsigned int n;
++        for (n = 0; n < fr->width; n += 128) {
++            memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2);
++        }
++    }
++#endif
 +    ++rpi_display_count;
 +}
 +#else
@@ -195,6 +207,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +
 +static void display_exit(MMAL_COMPONENT_T* display)
 +{
++//    sleep(120);
 +    if (display) {
 +        mmal_component_destroy(display);
 +    }
@@ -209,7 +222,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
  /* sub2video hack:
     Convert subtitles to video with alpha to insert them in filter graphs.
     This is a temporary solution until libavfilter gets real subtitles support.
-@@ -549,6 +736,11 @@
+@@ -549,6 +749,11 @@
          avformat_close_input(&input_files[i]->ctx);
          av_freep(&input_files[i]);
      }
@@ -221,7 +234,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
      for (i = 0; i < nb_input_streams; i++) {
          InputStream *ist = input_streams[i];
  
-@@ -561,6 +753,9 @@
+@@ -561,6 +766,9 @@
          av_freep(&ist->hwaccel_device);
          av_freep(&ist->dts_buffer);
  
@@ -231,7 +244,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
          avcodec_free_context(&ist->dec_ctx);
  
          av_freep(&input_streams[i]);
-@@ -591,6 +786,7 @@
+@@ -591,6 +799,7 @@
      }
      term_exit();
      ffmpeg_exited = 1;
@@ -239,7 +252,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
  }
  
  void remove_avoptions(AVDictionary **a, AVDictionary *b)
-@@ -1019,6 +1215,15 @@
+@@ -1019,6 +1228,15 @@
      if (ost->source_index >= 0)
          ist = input_streams[ost->source_index];
  
@@ -247,7 +260,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
 +    if (next_picture && ist != NULL)
 +    {
 +        if (!rpi_display)
-+           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
++            rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height);
 +        display_frame(ist->dec_ctx, rpi_display, next_picture);
 +    }
 +#endif
@@ -255,7 +268,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
      if (filter->inputs[0]->frame_rate.num > 0 &&
          filter->inputs[0]->frame_rate.den > 0)
          duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
-@@ -2707,6 +2912,12 @@
+@@ -2707,6 +2925,12 @@
          ist->dec_ctx->opaque                = ist;
          ist->dec_ctx->get_format            = get_format;
          ist->dec_ctx->get_buffer2           = get_buffer;
@@ -270,7 +283,7 @@ diff -Naur ffmpeg-3.2.4/ffmpeg.c ffmpeg-3.2.4.patch/ffmpeg.c
          av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
 diff -Naur ffmpeg-3.2.4/libavcodec/allcodecs.c ffmpeg-3.2.4.patch/libavcodec/allcodecs.c
 --- ffmpeg-3.2.4/libavcodec/allcodecs.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/allcodecs.c	2017-03-22 22:42:34.814798487 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/allcodecs.c	2017-05-28 20:42:45.714088580 +0200
 @@ -687,6 +687,7 @@
      REGISTER_PARSER(H261,               h261);
      REGISTER_PARSER(H263,               h263);
@@ -281,7 +294,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/allcodecs.c ffmpeg-3.2.4.patch/libavcodec/all
      REGISTER_PARSER(MLP,                mlp);
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/cabac.h ffmpeg-3.2.4.patch/libavcodec/arm/cabac.h
 --- ffmpeg-3.2.4/libavcodec/arm/cabac.h	2016-03-29 04:25:10.000000000 +0200
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/cabac.h	2017-03-22 22:42:34.815798490 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/arm/cabac.h	2017-05-28 20:42:45.715088584 +0200
 @@ -26,13 +26,34 @@
  #include "libavutil/internal.h"
  #include "libavcodec/cabac.h"
@@ -462,7 +475,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/cabac.h ffmpeg-3.2.4.patch/libavcodec/arm
  #endif /* AVCODEC_ARM_CABAC_H */
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevc_cabac.h ffmpeg-3.2.4.patch/libavcodec/arm/hevc_cabac.h
 --- ffmpeg-3.2.4/libavcodec/arm/hevc_cabac.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/hevc_cabac.h	2017-03-22 22:42:34.816798492 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/arm/hevc_cabac.h	2017-05-28 20:42:45.716088588 +0200
 @@ -0,0 +1,491 @@
 +/*
 + * This file is part of FFmpeg.
@@ -957,15 +970,521 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevc_cabac.h ffmpeg-3.2.4.patch/libavcode
 +#endif /* AVCODEC_ARM_HEVC_CABAC_H */
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_deblock_neon.S ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_deblock_neon.S
 --- ffmpeg-3.2.4/libavcodec/arm/hevcdsp_deblock_neon.S	2016-03-29 04:25:10.000000000 +0200
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_deblock_neon.S	2017-03-22 22:42:34.817798495 +0100
-@@ -383,3 +383,127 @@
++++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_deblock_neon.S	2017-05-28 20:42:45.718088595 +0200
+@@ -15,7 +15,7 @@
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with FFmpeg; if not, write to the Free Software
+- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ 
+@@ -31,6 +31,9 @@
+         bxeq     lr
+ .endm
+ 
++@ Uses: d2, d4, d18, d19
++@ Returns: d2, d4
++@ Modifies: d0-d7, d22-d25
+ .macro hevc_loop_filter_chroma_body
+         vsubl.u8  q3, d4, d2
+         vsubl.u8  q11, d18, d19
+@@ -49,6 +52,33 @@
+         vqmovun.s16 d4, q2
+ .endm
+ 
++
++@ Uses r2[0:7], r2[8:15]
++@ Modifies: d0-d7, d22-d25
++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1
++        vsubl.u8  q3, \Q0, \P0
++        vsubl.u8  q11, \P1, \Q1
++        vshl.i16  q3, #2
++        vadd.i16  q11, q3
++
++        @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all)
++        vdup.16   d0, r2
++        vmovl.u8  q0, d0
++        vuzp.16   d0, d1
++
++        vrshr.s16 q11, q11, #3
++        vneg.s16  q12, q0
++        vmovl.u8  q2, \Q0
++        vmin.s16  q11, q11, q0
++        vmax.s16  q11, q11, q12
++        vaddw.u8  q1, q11, \P0
++        vsub.i16  q2, q11
++        vqmovun.s16 \P0, q1
++        vqmovun.s16 \Q0, q2
++.endm
++
++
++
+ .macro hevc_loop_filter_luma_start
+         ldr     r12, [r3]
+         ldr      r3, [r3, #4]
+@@ -60,15 +90,17 @@
+         lsr      r3, #16
+ .endm
+ 
+-.macro hevc_loop_filter_luma_body
++@ Uses: r2, r3, r12
++@ Modifies: r5, r6, r7, r8, r9
++function hevc_loop_filter_luma_body
++        vmovl.u8  q15, d23
++        vmovl.u8  q14, d22
++        vmovl.u8  q13, d21
++        vmovl.u8  q12, d20
++        vmovl.u8  q11, d19
++        vmovl.u8  q10, d18
++        vmovl.u8  q9, d17
+         vmovl.u8  q8, d16
+-        vmovl.u8  q9, d18
+-        vmovl.u8  q10, d20
+-        vmovl.u8  q11, d22
+-        vmovl.u8  q12, d24
+-        vmovl.u8  q13, d26
+-        vmovl.u8  q14, d28
+-        vmovl.u8  q15, d30
+ 
+         vadd.i16   q7, q9, q11
+         vadd.i16   q6, q14, q12
+@@ -77,7 +109,6 @@
+         vabd.s16   q7, q7, q10
+         vabd.s16   q6, q6, q13
+ 
+-
+         vdup.16    q0, r2
+         vmov       q4, q7
+         vmov       q5, q6
+@@ -152,7 +183,7 @@
+ 
+         and        r9, r8, r7
+         cmp        r9, #0
+-        beq        weakfilter_\@
++        beq        weakfilter_
+ 
+         vadd.i16  q2, q11, q12
+         vadd.i16  q4, q9, q8
+@@ -210,11 +241,11 @@
+         vbit      q13, q3, q5
+         vbit      q14, q2, q5
+ 
+-weakfilter_\@:
++weakfilter_:
+         mvn       r8, r8
+         and       r9, r8, r7
+         cmp       r9, #0
+-        beq       ready_\@
++        beq       ready_
+ 
+         vdup.16    q4, r2
+ 
+@@ -275,75 +306,345 @@
+         vbit      q11, q0, q5
+         vbit      q12, q4, q5
+ 
+-ready_\@:
++ready_:
+         vqmovun.s16 d16, q8
+-        vqmovun.s16 d18, q9
+-        vqmovun.s16 d20, q10
+-        vqmovun.s16 d22, q11
+-        vqmovun.s16 d24, q12
+-        vqmovun.s16 d26, q13
+-        vqmovun.s16 d28, q14
+-        vqmovun.s16 d30, q15
+-.endm
++        vqmovun.s16 d17, q9
++        vqmovun.s16 d18, q10
++        vqmovun.s16 d19, q11
++        vqmovun.s16 d20, q12
++        vqmovun.s16 d21, q13
++        vqmovun.s16 d22, q14
++        vqmovun.s16 d23, q15
++        mov       pc, lr
++endfunc
++
++@ ff_hevc_v_loop_filter_luma2_neon_8(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
++function ff_hevc_v_loop_filter_luma2_neon_8, export=1
++        hevc_loop_filter_luma_start
++        push     {r4-r10,lr}       @ 8 regs = 32 bytes
++
++        ldr      r4, [sp, #40]
++        b        v_loop_luma_common
++endfunc
++
+ 
+ function ff_hevc_v_loop_filter_luma_neon, export=1
+         hevc_loop_filter_luma_start
+-        push     {r5-r11}
++        push     {r4-r10,lr}
++
++        sub      r4, r0, #4
++v_loop_luma_common:
++        @ Why this isn't a bitmask to start with I have no idea...
++        @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
++        ldr      r5, [sp, #32]
++        ldrh     r10, [r5]
++        ldr      r5, [sp, #36]
++        ldrh     r5, [r5]
++        orr      r10, r10, r5, lsl #16  @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1]
++
+         vpush    {d8-d15}
+-        sub      r0, #4
+-        vld1.8   {d16}, [r0], r1
+-        vld1.8   {d18}, [r0], r1
+-        vld1.8   {d20}, [r0], r1
+-        vld1.8   {d22}, [r0], r1
+-        vld1.8   {d24}, [r0], r1
+-        vld1.8   {d26}, [r0], r1
+-        vld1.8   {d28}, [r0], r1
+-        vld1.8   {d30}, [r0], r1
+-        sub      r0, r0, r1, lsl #3
+-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+-        hevc_loop_filter_luma_body
+-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+-        vst1.8   {d16}, [r0], r1
+-        vst1.8   {d18}, [r0], r1
+-        vst1.8   {d20}, [r0], r1
+-        vst1.8   {d22}, [r0], r1
+-        vst1.8   {d24}, [r0], r1
+-        vst1.8   {d26}, [r0], r1
+-        vst1.8   {d28}, [r0], r1
+-        vst1.8   {d30}, [r0]
++
++        @ Uses slightly fewer instructions to do laned loads than unlaned
++        @ and transpose.  This also means that we can use the same code for
++        @ both split & unsplit deblock
++        vld4.8  {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
++        vld4.8  {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
++
++        vld4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
++        vld4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
++
++        vld4.8  {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
++        vld4.8  {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
++
++        vld4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
++        vld4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
++
++        vld4.8  {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
++        vld4.8  {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
++
++        vld4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
++        vld4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
++
++        vld4.8  {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
++        vld4.8  {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
++
++        vld4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
++        vld4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
++
++        bl hevc_loop_filter_luma_body
++
++        neg     r1, r1
++
++        @ no_p[1]
++        tst     r10, #0xff00
++        itt ne
++        addne    r4, r4, r1, lsl #2
++        bne     1f
++        vst4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
++        vst4.8  {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
++        vst4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
++        vst4.8  {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
++
++1:
++        @ no_q[1]
++        tst     r10, #0xff000000
++        itt ne
++        addne    r0, r0, r1, lsl #2
++        bne     2f
++        vst4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
++        vst4.8  {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
++        vst4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
++        vst4.8  {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
++
++2:
++        @ no_p[0]
++        tst     r10, #0xff
++        bne     3f
++        vst4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
++        vst4.8  {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
++        vst4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
++        vst4.8  {d16[0],d17[0],d18[0],d19[0]}, [r4:32]
++
++3:
++        @ no_q[0]
++        tst     r10, #0xff0000
++        bne     4f
++        vst4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
++        vst4.8  {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
++        vst4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
++        vst4.8  {d20[0],d21[0],d22[0],d23[0]}, [r0:32]
++
++4:
++bypasswrite:
+         vpop     {d8-d15}
+-        pop      {r5-r11}
+-        bx lr
++        pop      {r4-r10,pc}
+ endfunc
+ 
++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,     [r0]
++@                                 ptrdiff_t stride, [r1]
++@                                 int beta,         [r2]
++@                                 int32_t *tc,      [r3]
++@                                 uint8_t *no_p,    sp[0]
++@                                 uint8_t *no_q);   sp[4]
++@
++@ Src should always be on an 8 byte boundary & all in the same slice
++
+ function ff_hevc_h_loop_filter_luma_neon, export=1
+         hevc_loop_filter_luma_start
+-        push     {r5-r11}
++        push     {r4-r10,lr}
++
+         vpush    {d8-d15}
+         sub      r0, r0, r1, lsl #2
++
+         vld1.8  {d16}, [r0], r1
++        vld1.8  {d17}, [r0], r1
+         vld1.8  {d18}, [r0], r1
++        vld1.8  {d19}, [r0], r1
+         vld1.8  {d20}, [r0], r1
++        vld1.8  {d21}, [r0], r1
+         vld1.8  {d22}, [r0], r1
+-        vld1.8  {d24}, [r0], r1
+-        vld1.8  {d26}, [r0], r1
+-        vld1.8  {d28}, [r0], r1
+-        vld1.8  {d30}, [r0], r1
+-        sub        r0, r0, r1, lsl #3
+-        add        r0, r1
+-        hevc_loop_filter_luma_body
+-        vst1.8   {d18}, [r0], r1
+-        vst1.8   {d20}, [r0], r1
+-        vst1.8   {d22}, [r0], r1
+-        vst1.8   {d24}, [r0], r1
+-        vst1.8   {d26}, [r0], r1
+-        vst1.8   {d28}, [r0]
+-bypasswrite:
++        vld1.8  {d23}, [r0]
++
++        bl hevc_loop_filter_luma_body
++
+         vpop     {d8-d15}
+-        pop      {r5-r11}
+-        bx lr
++
++        neg     r1, r1
++        add     r0, r0, r1
++
++        @ Why this isn't a bitmask to start with I have no idea...
++        @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
++        ldr      r5, [sp, #32]
++        ldrh     r10, [r5]
++        ldr      r5, [sp, #36]
++        ldrh     r5, [r5]
++        orrs     r10, r10, r5, lsl #16  @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
++        bne      1f
++
++        vst1.8  {d22}, [r0], r1
++        vst1.8  {d21}, [r0], r1
++        vst1.8  {d20}, [r0], r1
++        vst1.8  {d19}, [r0], r1
++        vst1.8  {d18}, [r0], r1
++        vst1.8  {d17}, [r0]
++
++        pop      {r4-r10,pc}
++
++@ Partial write
++1:
++        vmov     r2, r3, d22
++        vmov     r4, r5, d21
++        vmov     r6, r7, d20
++
++        tst      r10, #0xff0000
++        ittt eq
++        streq    r2, [r0]
++        streq    r4, [r0, r1]
++        streq    r6, [r0, r1, lsl # 1]
++
++        add      r0, r0, #4
++        tst      r10, #0xff000000
++        ittt eq
++        streq    r3, [r0]
++        streq    r5, [r0, r1]
++        streq    r7, [r0, r1, lsl # 1]
++
++        vmov     r2, r3, d19
++        vmov     r4, r5, d18
++        vmov     r6, r7, d17
++        add      r0, r0, r1
++        add      r0, r0, r1, lsl # 1
++
++        tst      r10, #0xff00
++        ittt eq
++        streq    r3, [r0]
++        streq    r5, [r0, r1]
++        streq    r7, [r0, r1, lsl # 1]
++
++        tst      r10, #0xff
++        ittt eq
++        streq    r2, [r0, #-4]!
++        streq    r4, [r0, r1]
++        streq    r6, [r0, r1, lsl # 1]
++
++        pop      {r4-r10,pc}
++
+ endfunc
+ 
++@ void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src_r,      // r0
++@                                      unsigned int stride,  // r1
++@                                      uint32_t tc4,         // r2
++@                                      unsigned int no_f);   // r3
++@
++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
++function ff_hevc_h_loop_filter_uv_neon_8, export=1
++        sub      r0, r0, r1, lsl #1
++        vld2.8   {d16,d17}, [r0], r1
++        vld2.8   {d18,d19}, [r0], r1
++        vld2.8   {d26,d27}, [r0], r1
++        vld2.8   {d28,d29}, [r0]
++        sub      r0, r0, r1, lsl #1
++        hevc_loop_filter_uv_body d16, d18, d26, d28
++        lsr      r2, r2, #16
++        hevc_loop_filter_uv_body d17, d19, d27, d29
++        cmp      r3, #0
++        bne      1f
++        vst2.8   {d18,d19}, [r0], r1
++        vst2.8   {d26,d27}, [r0]
++        bx       lr
++
++        @ At least one no_f bit is set
++        @ Which means we need to break this apart in an ugly fashion
++1:      vzip.8   d18, d19
++        vzip.8   d26, d27
++        sub      r1, r1, #8
++
++        tst      r3, #1
++        bne      1f
++        vst1.8   {d18}, [r0]
++1:      add      r0, r0, #8
++        tst      r3, #2
++        bne      2f
++        vst1.8   {d19}, [r0]
++2:      add      r0, r0, r1
++
++        tst      r3, #4
++        bne      1f
++        vst1.8   {d26}, [r0]
++1:      add      r0, r0, #8
++        tst      r3, #8
++        it ne
++        bxne     lr
++        vst1.8   {d27}, [r0]
++        bx       lr
++
++endfunc
++
++
++@ void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r,      // r0
++@                                       unsigned int stride,  // r1
++@                                       uint32_t tc4,         // r2
++@                                       uint8_t * src_l,      // r3
++@                                       unsigned int no_f);   // sp[0]
++@
++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
++function ff_hevc_v_loop_filter_uv2_neon_8, export=1
++        vld4.8   {d16[0], d17[0], d18[0], d19[0]}, [r3], r1
++        vld4.8   {d26[0], d27[0], d28[0], d29[0]}, [r0], r1
++
++        vld4.8   {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
++        vld4.8   {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
++
++        vld4.8   {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
++        vld4.8   {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
++
++        vld4.8   {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
++        vld4.8   {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
++
++        vld4.8   {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
++        vld4.8   {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
++
++        vld4.8   {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
++        vld4.8   {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
++
++        vld4.8   {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
++        vld4.8   {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
++
++        vld4.8   {d16[7], d17[7], d18[7], d19[7]}, [r3]
++        vld4.8   {d26[7], d27[7], d28[7], d29[7]}, [r0]
++
++        hevc_loop_filter_uv_body d16, d18, d26, d28
++        lsr      r2, r2, #16
++        hevc_loop_filter_uv_body d17, d19, d27, d29
++
++        neg      r1, r1
++
++        ldr      r2, [sp, #0]
++
++        @ p[1]
++        tst      r2, #2
++        itt ne
++        addne    r3, r3, r1, lsl #2
++        bne      1f
++        vst4.8   {d16[7], d17[7], d18[7], d19[7]}, [r3], r1
++        vst4.8   {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
++        vst4.8   {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
++        vst4.8   {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
++
++1:
++        @ q[1]
++        tst      r2, #8
++        itt ne
++        addne    r0, r0, r1, lsl #2
++        bne 2f
++        vst4.8   {d26[7], d27[7], d28[7], d29[7]}, [r0], r1
++        vst4.8   {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
++        vst4.8   {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
++        vst4.8   {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
++
++2:
++        @ p[0]
++        tst      r2, #1
++        bne      3f
++        vst4.8   {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
++        vst4.8   {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
++        vst4.8   {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
++        vst4.8   {d16[0], d17[0], d18[0], d19[0]}, [r3]
++
++3:
++        @ q[0]
++        tst      r2, #4
++        it ne
++        bxne     lr
++        vst4.8   {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
++        vst4.8   {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
++        vst4.8   {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
++        vst4.8   {d26[0], d27[0], d28[0], d29[0]}, [r0]
++
++        bx       lr
++endfunc
++
++
+ function ff_hevc_v_loop_filter_chroma_neon, export=1
+         hevc_loop_filter_chroma_start
+         sub      r0, #4
+@@ -383,3 +684,128 @@
          vst1.8   {d4}, [r0]
          bx       lr
  endfunc
 +
-+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
 + */
 +function ff_hevc_deblocking_boundary_strengths_neon, export=1
 +        add         ip, sp, #4*4
@@ -1086,9 +1605,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_deblock_neon.S ffmpeg-3.2.4.patch
 +90:     mov         a3, #1
 +        b           11b
 +endfunc
++
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_epel_neon.S ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_epel_neon.S
 --- ffmpeg-3.2.4/libavcodec/arm/hevcdsp_epel_neon.S	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_epel_neon.S	2017-03-22 22:42:34.818798498 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_epel_neon.S	2017-05-28 20:42:45.719088598 +0200
 @@ -0,0 +1,337 @@
 +/*
 + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
@@ -1429,8 +1949,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_epel_neon.S ffmpeg-3.2.4.patch/li
 +       .byte 2, 10, 58, 2
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_init_neon.c
 --- ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c	2016-03-29 04:25:10.000000000 +0200
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_init_neon.c	2017-03-22 22:42:34.818798498 +0100
-@@ -22,6 +22,8 @@
++++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_init_neon.c	2017-05-28 20:42:45.720088602 +0200
+@@ -22,11 +22,26 @@
  #include "libavutil/arm/cpu.h"
  #include "libavcodec/hevcdsp.h"
  #include "hevcdsp_arm.h"
@@ -1439,7 +1959,25 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
  
  void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
  void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-@@ -43,6 +45,21 @@
+ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+ void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++#ifdef RPI
++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++                             unsigned int _stride, unsigned int beta, const int32_t tc[2],
++                             const uint8_t no_p[2], const uint8_t no_q[2],
++                             uint8_t * _pix_l);
++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
++                             unsigned int no_f);
++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                             uint8_t * src_l,
++                             unsigned int no_f);
++#endif
++
+ void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+ void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+ void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+@@ -43,6 +58,31 @@
  void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride);
  
@@ -1457,11 +1995,21 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
 +void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
 +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
 +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++
++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height,
++                                   const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo);
++
++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++
 +
  #define PUT_PIXELS(name) \
      void name(int16_t *dst, uint8_t *src, \
                                  ptrdiff_t srcstride, int height, \
-@@ -58,6 +75,15 @@
+@@ -58,6 +98,15 @@
  PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
  PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
  #undef PUT_PIXELS
@@ -1477,7 +2025,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
  
  static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
                                     int height, int width);
-@@ -142,6 +168,132 @@
+@@ -142,14 +191,239 @@
      put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
  }
  
@@ -1523,6 +2071,50 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
 +    }
 +}
 +
++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height)
++{
++    // Width 32 already dealt with; the NEON width 16 code works in double lines, so it
++    // is only used for even heights. Strides are passed dst first, as in its prototype.
++    if (width == 16 && (height & 1) == 0) {
++        ff_hevc_sao_band_c_neon_8(_dst, _src, stride_dst, stride_src,
++                                          sao_offset_val_u, sao_left_class_u,
++                                          sao_offset_val_v, sao_left_class_v,
++                                          width, height);
++    }
++    else
++    {
++        const int shift  = 3; // BIT_DEPTH - 5
++        int k, y, x;
++        pixel *dst = (pixel *)_dst;
++        pixel *src = (pixel *)_src;
++        int8_t offset_table_u[32] = { 0 }; // offset per band, indexed by sample >> shift
++        int8_t offset_table_v[32] = { 0 }; // same, for the odd (second) samples of each pair
++
++        stride_src /= sizeof(pixel);
++        stride_dst /= sizeof(pixel);
++
++        for (k = 0; k < 4; k++) // four consecutive bands from sao_left_class_u, wrapping mod 32
++            offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
++        for (k = 0; k < 4; k++) // likewise from sao_left_class_v
++            offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
++
++        for (y = 0; y < height; y++) {
++            for (x = 0; x < width * 2; x += 2) // two interleaved samples per step
++            {
++                dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]);
++                dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]);
++            }
++            dst += stride_dst;
++            src += stride_src;
++
++        }
++    }
++}
++
 +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
 +                                          int16_t *_sao_offset_val, int eo, int width, int height)
@@ -1601,6 +2193,54 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
 +        }
 +    }
 +}
++
++
++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height)
++{
++    const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
++
++    if (width == 32 && (height & 7) == 0) {
++        ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo);
++    }
++    else
++    {
++        static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++        static const int8_t pos[4][2][2] = {
++            { { -1,  0 }, {  1, 0 } }, // horizontal
++            { {  0, -1 }, {  0, 1 } }, // vertical
++            { { -1, -1 }, {  1, 1 } }, // 45 degree
++            { {  1, -1 }, { -1, 1 } }, // 135 degree
++        };
++        int8_t sao_offset_val_u[8];  // padding of 3 for vld
++        int8_t sao_offset_val_v[8];  // padding of 3 for vld
++        pixel *dst = (pixel *)_dst;
++        pixel *src = (pixel *)_src;
++        int a_stride, b_stride;
++        int x, y;
++
++        for (x = 0; x < 5; x++) {
++            sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]];
++            sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]];
++        }
++
++        a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
++        b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
++        for (y = 0; y < height; y++) {
++            for (x = 0; x < width * 2; x += 2) {
++                int diff0u = CMP(src[x], src[x + a_stride]);
++                int diff1u = CMP(src[x], src[x + b_stride]);
++                int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
++                int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
++                dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]);
++                dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]);
++            }
++            src += stride_src;
++            dst += stride_dst;
++        }
++    }
++}
 +#undef CMP
 +
 +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
@@ -1610,18 +2250,36 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
  av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
  {
      if (bit_depth == 8) {
-@@ -161,6 +313,10 @@
+         int x;
+         c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
++        c->hevc_v_loop_filter_luma_c   = ff_hevc_v_loop_filter_luma_neon;
+         c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
++        c->hevc_h_loop_filter_luma_c   = ff_hevc_h_loop_filter_luma_neon;
+         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
+         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
++#ifdef RPI
++        c->hevc_v_loop_filter_luma2    = ff_hevc_v_loop_filter_luma2_neon_8;
++        c->hevc_h_loop_filter_uv       = ff_hevc_h_loop_filter_uv_neon_8;
++        c->hevc_v_loop_filter_uv2      = ff_hevc_v_loop_filter_uv2_neon_8;
++#endif
+         c->idct[0]                     = ff_hevc_transform_4x4_neon_8;
+         c->idct[1]                     = ff_hevc_transform_8x8_neon_8;
+         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_neon_8;
+@@ -161,6 +435,13 @@
          c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
          c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
          c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
 +        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
 +          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
++          c->sao_band_filter_c[x]      = ff_hevc_sao_band_c_neon_wrapper;
 +          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
++          c->sao_edge_filter_c[x]      = ff_hevc_sao_edge_c_neon_wrapper;
 +        }
++        c->sao_band_filter_c[2]        = ff_hevc_sao_band_c_neon_8;  // width=32
          put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
          put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
          put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
-@@ -201,7 +357,21 @@
+@@ -201,7 +482,21 @@
              c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
              c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
              c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
@@ -1643,7 +2301,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
          c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
          c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
          c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
-@@ -221,4 +391,9 @@
+@@ -221,4 +516,9 @@
          c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
          c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
      }
@@ -1655,8 +2313,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_init_neon.c ffmpeg-3.2.4.patch/li
  }
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_sao_neon.S
 --- ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_sao_neon.S	2017-03-22 22:42:34.819798500 +0100
-@@ -0,0 +1,510 @@
++++ ffmpeg-3.2.4.patch/libavcodec/arm/hevcdsp_sao_neon.S	2017-05-28 20:42:45.721088605 +0200
+@@ -0,0 +1,862 @@
 +/*
 + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
 + *
@@ -1782,24 +2440,186 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +
 +function ff_hevc_sao_band_w64_neon_8, export=1
 +        init_sao_band
++
++        push      {r4, lr}
++        subs      r12, #1
++        mov       r4, r1
++        it ne
++        addne     r4, r3
++
 +1:      subs      r12, #1
-+        pld       [r1, r3]
-+        vld1.8    {q8-q9}, [r1, :128]!
-+        vshr.u8  q12, q8, #3
-+        vshr.u8  q13, q9, #3
-+        vld1.8    {q10-q11}, [r1, :128], r3
-+        vshr.u8  q14, q10, #3
-+        vshr.u8  q15, q11, #3
-+        sub       r1, #32
++        vldm      r1, {q8-q11}
++        pld       [r4]
++        vshr.u8   q12, q8, #3
++        vshr.u8   q13, q9, #3
++        add       r1, r3
++        vshr.u8   q14, q10, #3
++        vshr.u8   q15, q11, #3
 +        sao_band_64
-+        vst1.8    {q8-q9}, [r0, :128]!
-+        vst1.8    {q10-q11}, [r0, :128], r2
++        it ne
++        addne     r4, r3
++        vstm      r0, {q8-q11}
++        add       r0, r2
++        bpl       1b
++
++        pop       {r4, pc}
++endfunc
++
++
++@ ff_hevc_sao_band_c_neon_8(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++
++@ As this is often done in-place on the frame buffer it is worth preloading
++@ the pixel values but we want to beware of loading outside our buffer to avoid
++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU)
++
++function ff_hevc_sao_band_c_neon_8, export=1
++        mov     r12, sp
++        push   {r4-r8, lr}  // 24 bytes
++
++        ldm     r12, {r4-r7}
++
++        add     r4, #2
++        add     r6, #2
++        vld1.16 {d16}, [r4]    @ Unaligned
++        lsl     r5, r5, #3
++        vld1.16 {d18}, [r6]
++        pld     [r1]
++        vmov.i8  d17, #0
++        mov     r4, r1
++        vmov.i8  d19, #0
++        lsl     r7, r7, #3
++        vdup.8  q1, r5
++        ldr     r5, [r12, #16]  @ width
++        vdup.8  q2, r7
++        ldr     r12, [r12, #20]
++        vqmovn.s16 d0, q8
++        cmp     r5, #16         @ At some point we may want a table lookup
++        vqmovn.s16 d1, q9
++        vmov.i8 q3, #128
++        beq     16f
++
++        @ d0 U lookup
++        @ d1 V lookup
++        @ q1 U raw offset
++        @ q2 V raw offset
++        @ q3 #128
++
++        @ r4 = r1 = src - Intended for preload pointer
++        @ r12 = height
++
++        @ Might (unlikely) be called with height == 1
++        subs      r12, #1
++        it ne
++        addne     r4, r3
++
++1:
++        subs      r12, #1
++        vld2.8    {q8-q9}, [r1, :128]!
++        vsub.u8   q12, q8, q1
++        vld2.8    {q10-q11}, [r1, :128], r3
++        vsub.u8   q14, q10, q1
++        vsub.u8   q13, q9, q2
++        sub       r1, #32
++        vsub.u8   q15, q11, q2
++        pld       [r4]
++        vshr.u8   q12, #3
++        vadd.s8   q8, q3
++        vshr.u8   q13, #3
++        vadd.s8   q9, q3
++
++        vtbl.8   d24, {d0}, d24
++        vshr.u8  q14, #3
++        vtbl.8   d25, {d0}, d25
++        vshr.u8  q15, #3
++        vtbl.8   d26, {d1}, d26
++        vadd.s8  q10, q3
++        vtbl.8   d27, {d1}, d27
++        vadd.s8  q11, q3
++        vtbl.8   d28, {d0}, d28
++        vqadd.s8 q8, q12
++        vtbl.8   d29, {d0}, d29
++        vqadd.s8 q9, q13
++        vtbl.8   d30, {d1}, d30
++        vqadd.s8 q10, q14
++        vtbl.8   d31, {d1}, d31
++        vsub.s8  q8, q3
++        vqadd.s8 q11, q15
++        vsub.s8  q9, q3
++        vsub.s8  q10, q3
++        vsub.s8  q11, q3
++
++        it ne
++        addne     r4, r3        @ Do not inc on final pass
++        vst2.8    {q8-q9}, [r0, :128]!
++        vst2.8    {q10-q11}, [r0, :128], r2
 +        sub       r0, #32
-+        bne       1b
++        bpl       1b
++
++        pop    {r4-r8, pc}
++
++@ -- width 16 (UV pairs) --
++16:
++        subs    r12, #2
++        it ne
++        addne   r4, r4, r3, lsl #1
++
++1:
++        subs      r12, #2
++        vld2.8    {q8-q9}, [r1, :128], r3
++        vsub.u8   q12, q8, q1
++        vld2.8    {q10-q11}, [r1, :128], r3
++        vsub.u8   q14, q10, q1
++        vsub.u8   q13, q9, q2
++        pld       [r4]
++        vsub.u8   q15, q11, q2
++        pld       [r4, r3]
++        vshr.u8  q12, #3
++        vadd.s8  q8, q3
++        vshr.u8  q13, #3
++        vadd.s8  q9, q3
++
++        vtbl.8   d24, {d0}, d24
++        vshr.u8  q14, #3
++        vtbl.8   d25, {d0}, d25
++        vshr.u8  q15, #3
++        vtbl.8   d26, {d1}, d26
++        vadd.s8  q10, q3
++        vtbl.8   d27, {d1}, d27
++        vadd.s8  q11, q3
++        vtbl.8   d28, {d0}, d28
++        vqadd.s8 q8, q12
++        vtbl.8   d29, {d0}, d29
++        vqadd.s8 q9, q13
++        vtbl.8   d30, {d1}, d30
++        vqadd.s8 q10, q14
++        vtbl.8   d31, {d1}, d31
++        vsub.s8  q8, q3
++        vqadd.s8 q11, q15
++        vsub.s8  q9, q3
++        vsub.s8  q10, q3
++        vsub.s8  q11, q3
++
++        it ne
++        addne   r4, r4, r3, lsl #1
++        vst2.8    {q8-q9}, [r0, :128], r2
++        vst2.8    {q10-q11}, [r0, :128], r2
++        bpl       1b
++
++        pop    {r4-r8, pc}
 +
-+        bx lr
 +endfunc
 +
++
 +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
 +        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
 +        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
@@ -1809,71 +2629,120 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
 +.endm
 +
-+.macro table64
-+        vmov.s8 q13, #2 // 2 to all elements
-+        vmov.32  d24[0], r4  // load offset table from general registers
-+        vmov.32  d24[1], r5  // load rest of offset table
-+
-+        vadd.s8 q0, q13
-+        vadd.s8 q1, q13
-+        vadd.s8 q2, q13
-+        vadd.s8 q3, q13
-+
-+        vmov.u8  q15, #128 // s8 #-128
-+        vtbl.8   d0, {d24}, d0
-+        vadd.s8  q13,  q4, q15
-+        vtbl.8   d1, {d24}, d1
-+        vadd.s8  q14,  q5, q15
-+        vtbl.8   d2, {d24}, d2
-+        vqadd.s8 q0, q13
-+        vtbl.8   d3, {d24}, d3
-+        vqadd.s8 q1, q14
-+        vtbl.8   d4, {d24}, d4
-+        vadd.s8  q13,  q6, q15
-+        vtbl.8   d5, {d24}, d5
-+        vadd.s8  q14,  q7, q15
-+        vtbl.8   d6, {d24}, d6
-+        vqadd.s8 q2, q13
-+        vtbl.8   d7, {d24}, d7
-+        vqadd.s8 q3, q14
-+        vsub.s8   q0, q15
-+        vsub.s8   q1, q15
-+        vsub.s8   q2, q15
-+        vsub.s8   q3, q15
-+        vst1.8  {q0-q1}, [r0, :128]!
-+        vst1.8  {q2-q3}, [r0, :128], r2
-+        sub     r0, #32
-+.endm
 +
 +// input
 +// a in q0 - q3
 +// c in q4 - q7
 +// b in q8 - q11
-+// offset table in r7 and r5
++// offset table r4,r5 and r6,r7
++//   r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C
 +// output in q0 - q3
 +// clobbers q12 - q15
-+.macro edge_w64_body
-+        diff32 q12, q13, q0, q1, q0, q1, q4, q5
-+        diff32 q0, q1, q14, q15, q8, q9, q4, q5
 +
-+        vadd.s8  q0, q12 //diff0 + diff1
-+        vadd.s8  q1, q13
++@ a <- c <- b
++@
++@ It appears that Neon can stall if you try and use results too soon so we try to
++@ spread our instruction out
++
++.macro edgeidx64
++
++        vcgt.u8 q12, q4, q0  // c > a -> -1 , otherwise 0
++        vcgt.u8 q13, q5, q1
++        vcgt.u8 q14, q6, q2
++        vcgt.u8 q15, q7, q3
++
++        vcgt.u8 q0, q0, q4  // a > c -> -1 , otherwise 0
++        vcgt.u8 q1, q1, q5
++        vcgt.u8 q2, q2, q6
++        vcgt.u8 q3, q3, q7
++
++        vsub.s8 q0, q0, q12 // a = sign(c-a)
++        vsub.s8 q1, q1, q13
++        vsub.s8 q2, q2, q14
++        vsub.s8 q3, q3, q15
++
++        vcgt.u8 q12, q4, q8  // c > b -> -1 , otherwise 0
++        vcgt.u8 q13, q5, q9
++        vcgt.u8 q14, q6, q10
++        vcgt.u8 q15, q7, q11
++
++        vsub.s8 q0, q0, q12
++        vsub.s8 q1, q1, q13
++        vsub.s8 q2, q2, q14
++        vsub.s8 q3, q3, q15
++
++        vcgt.u8 q12, q8, q4  // c < b -> -1 , otherwise 0
++        vcgt.u8 q13, q9, q5
++        vcgt.u8 q14, q10, q6
++        vcgt.u8 q15, q11, q7
++
++        vadd.s8 q0, q0, q12  // a = sign(c-a) + sign(c-b)
++        vadd.s8 q1, q1, q13
++        vmov.u8 q12, #2
++        vadd.s8 q2, q2, q14
++        vadd.s8 q3, q3, q15
++
++        vadd.s8 q0, q0, q12
++        vadd.s8 q1, q1, q12
++        @ whilst vmov dn, rm, rn exists it is a vfp instruction
++        @ and causes a stall till neon pipe empty - so don't do that!
++        vmov    d26[0], r4
++        vmov    d26[1], r5
++        vmov    d27[0], r6
++        vmov    d27[1], r7
++        vadd.s8 q2, q2, q12
++        vuzp.8    q0, q1
++        vmov.u8 q15, #128
++        vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b)
++
++        vtbl.8  d0, {d26}, d0
++        vadd.s8 q12, q4, q15  // Add -128 so we can use saturating signed add
++
++        vtbl.8  d1, {d26}, d1
++        vadd.s8 q14, q5, q15
++
++        vtbl.8  d2, {d27}, d2
++        vuzp.8    q2, q3
++
++        vtbl.8  d3, {d27}, d3
++
++        vtbl.8  d4, {d26}, d4
++        vzip.8    q0, q1
++
++        vtbl.8  d5, {d26}, d5
++        vqadd.s8 q0, q0, q12
++        vqadd.s8 q1, q1, q14
++        vadd.s8 q12, q6, q15  // Add -128 so we can use saturating signed add
++
++        vtbl.8  d6, {d27}, d6
++        vadd.s8 q14, q7, q15  // Add -128 so we can use saturating signed add
++
++        vtbl.8  d7, {d27}, d7
++        vzip.8   q2, q3
++
++        vsub.s8 q0, q0, q15
++        vqadd.s8 q2, q2, q12
++        vqadd.s8 q3, q3, q14
++        vsub.s8 q1, q1, q15
++        vsub.s8 q2, q2, q15
++        vsub.s8 q3, q3, q15
 +
-+        diff32  q14, q15, q2, q3, q2, q3, q6, q7
-+        diff32  q2, q3, q12, q13, q10, q11, q6, q7
-+
-+        vadd.s8  q2, q14
-+        vadd.s8  q3, q15
-+        table64
 +.endm
 +
++function edge_w64_body
++        edgeidx64
++        vstm    r0, {q0-q3}          @ write the 64 output bytes left in q0-q3 to dst (r0)
++        add     r0, r0, r2           @ step dst by r2; no flag update, so the bne after each bl still sees the flags from the preceding subs
++        bx       lr
++endfunc
++
 +.macro init_edge_64
-+        push   {r4-r5}
-+        ldr    r12, [sp, #8] // height
-+        ldr    r5, [sp, #12] // sao_offset_val_table
-+        ldr    r4, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
++        push   {r4-r8,lr}
++        ldr    r12, [sp, #24] // height
++        ldr    r5,  [sp, #28] // sao_offset_val_table
++        ldrd   r4, r5, [r5]
++        mov    r6, r4
++        mov    r7, r5
 +.endm
 +
 +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
@@ -1896,11 +2765,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vext.8 q9, q5, q6, #1
 +        vext.8 q10, q6, q7, #1
 +        vext.8 q11, q7, q12, #1
-+        edge_w64_body
++        bl    edge_w64_body
 +        bne   1b
 +        vpop  {d8-d15}
-+        pop   {r4-r5}
-+        bx lr
++        pop   {r4-r8,pc}
 +endfunc
 +
 +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
@@ -1920,7 +2788,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vld1.8  {q8-q9}, [r1, :128]!
 +        vld1.8  {q10-q11}, [r1, :128], r3
 +        sub     r1, #32
-+        edge_w64_body
++        bl      edge_w64_body
 +        // copy c to a
 +        vmov.64 q0, q4
 +        vmov.64 q1, q5
@@ -1933,8 +2801,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vmov.64 q7, q11
 +        bne   1b
 +        vpop  {d8-d15}
-+        pop   {r4-r5}
-+        bx lr
++        pop   {r4-r8,pc}
 +endfunc
 +
 +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
@@ -1958,11 +2825,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vld1.8  {q8-q9}, [r1]!
 +        vld1.8  {q10-q11}, [r1]
 +        sub     r1, #33
-+        edge_w64_body
++        bl      edge_w64_body
 +        bne   1b
 +        vpop  {d8-d15}
-+        pop   {r4-r5}
-+        bx lr
++        pop   {r4-r8,pc}
 +endfunc
 +
 +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
@@ -1986,13 +2852,157 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vld1.8  {q8-q9}, [r1]!
 +        vld1.8  {q10-q11}, [r1]
 +        sub     r1, #31
-+        edge_w64_body
++        bl      edge_w64_body
 +        bne   1b
 +        vpop  {d8-d15}
-+        pop   {r4-r5}
-+        bx lr
++        pop   {r4-r8,pc}
++endfunc
++
++
++@ void ff_hevc_sao_edge_c_w64_neon_8(
++@   uint8_t *_dst,               r0
++@   uint8_t *_src,               r1
++@   ptrdiff_t stride_dst,        r2
++@   ptrdiff_t stride_src,        r3
++@   int height,                  sp[0]
++@   int16_t *sao_offset_table_u, sp[4]
++@   int16_t *sao_offset_table_v, sp[8]
++@   int eo);                     sp[12]
++
++function ff_hevc_sao_edge_c_w64_neon_8, export=1
++        push   {r4-r8,lr}     // 6 reg = 24
++        ldr    r5,  [sp, #28] // sao_offset_val_table_u
++        ldr    r7,  [sp, #32] // sao_offset_val_table_v
++
++        @ Load and rearrange offsets
++        @ Also "convert" from 16bit to 8bit
++        ldrb    r4, [r5, #2]
++        ldrb    r8, [r5, #4]
++        ldrb    r6, [r7, #2]
++        ldrb    r12, [r7, #4]
++        orr     r4, r4, r8, lsl #8
++        orr     r6, r6, r12, lsl #8
++        ldrb    r8, [r5, #6]
++        ldrb    r12, [r7, #6]
++        orr     r4, r4, r8, lsl #24
++        orr     r6, r6, r12, lsl #24
++        ldrb    r5, [r5, #8]
++        ldrb    r7, [r7, #8]
++
++        ldr     r12, [sp, #36] // e0
++        adr     r8, edge_c_tbl_w64
++        ldr     r8, [r8, r12, lsl #2]
++
++        ldr     r12, [sp, #24] // height
++        vpush   {d8-d15}
++        mov     pc, r8
++
++edge_c_tbl_w64:
++        .word   ff_hevc_sao_edge_c_eo0_w64_neon_8
++        .word   ff_hevc_sao_edge_c_eo1_w64_neon_8
++        .word   ff_hevc_sao_edge_c_eo2_w64_neon_8
++        .word   ff_hevc_sao_edge_c_eo3_w64_neon_8
++
++ff_hevc_sao_edge_c_eo0_w64_neon_8:
++        sub    r1, #8
++1:      subs    r12, #1
++        vld1.64  {d7}, [r1, :64]!
++        vld1.64  {q4-q5}, [r1, :128]! // load c
++        vld1.64  {q6-q7}, [r1, :128]!
++        vld1.64  {d24}, [r1, :64], r3
++        sub      r1, #72
++        // load a
++        vext.8 q0, q3, q4, #14
++        vext.8 q1, q4, q5, #14
++        vext.8 q2, q5, q6, #14
++        vext.8 q3, q6, q7, #14
++        // load b
++        vext.8 q8, q4, q5, #2
++        vext.8 q9, q5, q6, #2
++        vext.8 q10, q6, q7, #2
++        vext.8 q11, q7, q12, #2
++        bl    edge_w64_body
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r8,pc}
++
++ff_hevc_sao_edge_c_eo1_w64_neon_8:
++        sub     r1, r3
++        // load a
++        vldm    r1, {q0-q3}
++        add     r1, r3
++        // load c
++        vldm    r1, {q4-q7}
++        add     r1, r3
++1:      subs    r12, #1
++        // load b
++        vldm    r1, {q8-q11}
++        add     r1, r3
++        bl      edge_w64_body
++        // copy c to a
++        vmov.64 q0, q4
++        vmov.64 q1, q5
++        vmov.64 q2, q6
++        vmov.64 q3, q7
++        // copy b to c
++        vmov.64 q4, q8
++        vmov.64 q5, q9
++        vmov.64 q6, q10
++        vmov.64 q7, q11
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r8,pc}
++
++ff_hevc_sao_edge_c_eo2_w64_neon_8:
++1:      sub     r1, r3
++        // load a
++        // TODO: fix unaligned load
++        //       don't reload a like in eo1
++        sub     r1, #2
++        vld1.8  {q0-q1}, [r1]!
++        vld1.8  {q2-q3}, [r1], r3
++        sub     r1, #30
++        subs    r12, #1
++        // load c
++        vld1.8  {q4-q5}, [r1, :128]!
++        vld1.8  {q6-q7}, [r1, :128], r3
++        sub     r1, #32
++        // load b
++        add     r1, #2
++        vld1.8  {q8-q9}, [r1]!
++        vld1.8  {q10-q11}, [r1]
++        sub     r1, #34
++        bl      edge_w64_body
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r8,pc}
++
++ff_hevc_sao_edge_c_eo3_w64_neon_8:
++1:      sub     r1, r3
++        // load a
++        // TODO: fix unaligned load
++        //       don't reload a like in eo1
++        add     r1, #2
++        vld1.8  {q0-q1}, [r1]!
++        vld1.8  {q2-q3}, [r1], r3
++        sub     r1, #34
++        subs    r12, #1
++        // load c
++        vld1.8  {q4-q5}, [r1, :128]!
++        vld1.8  {q6-q7}, [r1, :128], r3
++        sub     r1, #32
++        // load b
++        sub     r1, #2
++        vld1.8  {q8-q9}, [r1]!
++        vld1.8  {q10-q11}, [r1]
++        sub     r1, #30
++        bl      edge_w64_body
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r8,pc}
 +endfunc
 +
++
 +.macro init_edge_32
 +        ldr     r12, [sp, #4] // sao_offset_val_table
 +        vld1.32 {d31}, [r12]
@@ -2109,7 +3119,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vext.8  q7, q11, q12, #8
 +        vext.8  q5, q10, q11, #7
 +        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
++        diff32 q0, q1, q10, q11,  q8, q9, q2, q3
 +        vadd.s8 q0, q12 //diff0 + diff1
 +        vadd.s8 q1, q13
 +        table32
@@ -2149,7 +3159,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        vext.8  q14, q12, q10, #7
 +
 +        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
++        diff32 q0, q1, q10, q11,  q8, q9, q2, q3
 +
 +        vadd.s8 q0, q12 //diff0 + diff1
 +        vadd.s8 q1, q13
@@ -2167,12 +3177,80 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevcdsp_sao_neon.S ffmpeg-3.2.4.patch/lib
 +        bx      lr
 +endfunc
 +
+diff -Naur ffmpeg-3.2.4/libavcodec/arm/hevc_misc_neon.S ffmpeg-3.2.4.patch/libavcodec/arm/hevc_misc_neon.S
+--- ffmpeg-3.2.4/libavcodec/arm/hevc_misc_neon.S	1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/arm/hevc_misc_neon.S	2017-05-28 20:42:45.717088591 +0200
+@@ -0,0 +1,62 @@
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ rpi_zap_coeff_vals_neon(
++@   uint16_t * buf,          [r0]
++@   unsigned int log_n_m2)   [r1]
++
++function rpi_zap_coeff_vals_neon, export=1
++        vmov.i64 q8, #0
++        adr     r12, zc_tab                  @ r12 = base of the per-size dispatch table below
++        vmov.i64 q9, #0
++        tst     r0, #63                      @ Z set iff buf is already 64-byte aligned; consumed by zc_lc2
++        vmov.i64 q10, #0
++        add     r0, #63
++        vmov.i64 q11, #0
++        and     r0, #~63                     @ r0 = buf rounded up to the next multiple of 64
++        ldr     pc, [r12, r1, lsl #2]        @ jump to zc_tab[log_n_m2]
++
++zc_tab:
++        .word   zc_lc2
++        .word   zc_lc3
++        .word   zc_lc4
++        .word   zc_lc5
++
++@ 4*4*2 = 32 bytes wanted, but the store is 64 bytes if buf was 64-byte aligned on entry and nothing otherwise
++zc_lc2:
++        it eq                                @ flags still come from the tst at function entry
++        vstmeq  r0, {q8-q11}                 @ NOTE(review): presumably the aligned call clears the following 32 bytes for the unaligned one - confirm with caller
++        bx      lr
++
++@ 16*16*2 = 512 = 64 * 8 (six stores here, then falls through into the two at zc_lc3)
++zc_lc4:
++        vstm    r0!, {q8-q11}
++        vstm    r0!, {q8-q11}
++        vstm    r0!, {q8-q11}
++        vstm    r0!, {q8-q11}
++        vstm    r0!, {q8-q11}
++        vstm    r0!, {q8-q11}
++@ 8*8*2 = 128 = 64 * 2
++zc_lc3:
++        vstm    r0!, {q8-q11}
++        vstm    r0,  {q8-q11}
++        bx      lr
++
++@ 32*32*2 = 2048 = 128 * 16 (4 loop passes of 4 stores, 128 bytes per store)
++zc_lc5:
++        vmov.i64 q12, #0
++        vmov.i64 q13, #0
++        vmov.i64 q14, #0
++        vmov.i64 q15, #0
++        mov     r2, #4                       @ loop counter: 4 passes
++1:
++        vstm    r0!, {q8-q15}
++        subs    r2, #1                       @ sets the flags tested by the bne below; the vstm in between leave them alone
++        vstm    r0!, {q8-q15}
++        vstm    r0!, {q8-q15}
++        vstm    r0!, {q8-q15}
++        bne     1b
++        bx      lr
++
++endfunc
++
 diff -Naur ffmpeg-3.2.4/libavcodec/arm/Makefile ffmpeg-3.2.4.patch/libavcodec/arm/Makefile
 --- ffmpeg-3.2.4/libavcodec/arm/Makefile	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/arm/Makefile	2017-03-22 22:42:34.814798487 +0100
-@@ -132,8 +132,10 @@
++++ ffmpeg-3.2.4.patch/libavcodec/arm/Makefile	2017-05-28 20:42:45.714088580 +0200
+@@ -131,9 +131,12 @@
+ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
  NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
  NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
++                                          arm/hevc_misc_neon.o          \
                                            arm/hevcdsp_deblock_neon.o    \
 +                                          arm/hevcdsp_epel_neon.o       \
                                            arm/hevcdsp_idct_neon.o       \
@@ -2184,7 +3262,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/arm/Makefile ffmpeg-3.2.4.patch/libavcodec/ar
                                            arm/rv40dsp_neon.o
 diff -Naur ffmpeg-3.2.4/libavcodec/avcodec.h ffmpeg-3.2.4.patch/libavcodec/avcodec.h
 --- ffmpeg-3.2.4/libavcodec/avcodec.h	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/avcodec.h	2017-03-22 22:43:54.521005055 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/avcodec.h	2017-05-28 20:43:41.666286716 +0200
 @@ -412,6 +412,8 @@
      AV_CODEC_ID_SHEERVIDEO,
      AV_CODEC_ID_YLC,
@@ -2227,7 +3305,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/avcodec.h ffmpeg-3.2.4.patch/libavcodec/avcod
  AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
 diff -Naur ffmpeg-3.2.4/libavcodec/cabac.h ffmpeg-3.2.4.patch/libavcodec/cabac.h
 --- ffmpeg-3.2.4/libavcodec/cabac.h	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/cabac.h	2017-03-22 22:42:34.824798513 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/cabac.h	2017-05-28 20:42:45.725088620 +0200
 @@ -43,7 +43,14 @@
  typedef struct CABACContext{
      int low;
@@ -2246,7 +3324,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/cabac.h ffmpeg-3.2.4.patch/libavcodec/cabac.h
      const uint8_t *bytestream_end;
 diff -Naur ffmpeg-3.2.4/libavcodec/codec_desc.c ffmpeg-3.2.4.patch/libavcodec/codec_desc.c
 --- ffmpeg-3.2.4/libavcodec/codec_desc.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/codec_desc.c	2017-03-22 22:42:34.825798516 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/codec_desc.c	2017-05-28 20:42:45.727088627 +0200
 @@ -1564,6 +1564,13 @@
          .props     = AV_CODEC_PROP_LOSSLESS,
          .mime_types= MT("image/png"),
@@ -2263,7 +3341,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/codec_desc.c ffmpeg-3.2.4.patch/libavcodec/co
      {
 diff -Naur ffmpeg-3.2.4/libavcodec/h264.h ffmpeg-3.2.4.patch/libavcodec/h264.h
 --- ffmpeg-3.2.4/libavcodec/h264.h	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/h264.h	2017-03-22 22:45:08.817196788 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/h264.h	2017-05-28 20:44:24.351436408 +0200
 @@ -41,7 +41,9 @@
      H264_NAL_END_STREAM      = 11,
      H264_NAL_FILLER_DATA     = 12,
@@ -2276,7 +3354,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/h264.h ffmpeg-3.2.4.patch/libavcodec/h264.h
  #endif /* AVCODEC_H264_H */
 diff -Naur ffmpeg-3.2.4/libavcodec/h264_parser.c ffmpeg-3.2.4.patch/libavcodec/h264_parser.c
 --- ffmpeg-3.2.4/libavcodec/h264_parser.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/h264_parser.c	2017-03-22 22:48:48.087757606 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/h264_parser.c	2017-05-28 20:45:07.870583994 +0200
 @@ -62,6 +62,8 @@
      int parse_last_mb;
      int64_t reference_dts;
@@ -2373,24 +3451,19 @@ diff -Naur ffmpeg-3.2.4/libavcodec/h264_parser.c ffmpeg-3.2.4.patch/libavcodec/h
  };
 diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 --- ffmpeg-3.2.4/libavcodec/hevc.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/hevc.c	2017-03-22 22:42:34.834798539 +0100
-@@ -41,8 +41,186 @@
++++ ffmpeg-3.2.4.patch/libavcodec/hevc.c	2017-05-28 21:05:03.074076070 +0200
+@@ -41,8 +41,196 @@
  #include "hevc.h"
  #include "profiles.h"
  
 +#ifdef RPI
 +  #include "rpi_qpu.h"
-+  #include "rpi_user_vcsm.h"
-+  // Move Inter prediction into separate pass
-+  #define RPI_INTER
-+
-+  #ifdef RPI_INTER_QPU
-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-+    #define RPI_MULTI_MAILBOX
-+  #endif
++  #include "rpi_shader.h"
++  #include "rpi_shader_cmd.h"
++  #include "rpi_zc.h"
 +
 +  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
++  #define RPI_CACHE_UNIF_MVS  1
 +
 +  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
 +  //#define RPI_SIMULATE_QPUS
@@ -2398,19 +3471,24 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +    #include "pthread.h"
 +  #endif
 +
-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
-+  static void rpi_execute_transform(HEVCContext *s);
-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
-+  static void rpi_execute_pred_cmds(HEVCContext *s);
-+  static void rpi_execute_inter_cmds(HEVCContext *s);
-+  static void rpi_begin(HEVCContext *s);
-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
++  static void worker_core(HEVCContext * const s);
 +
++  // We can pred any block height but annoyingly if we do then the TMU cache
++  // explodes and it goes even slower :-(
++  #if 0
++  #define Y_P_MAX_H     16
++  #define Y_B_MAX_H     16
++  #else
++  #define Y_P_MAX_H     64
++  #define Y_B_MAX_H     64
++  #endif
 +#endif
 +
 +// #define DISABLE_MC
 +
++#define DISABLE_CHROMA 0
++#define DEBUG_DECODE_N 0   // 0 = do all, n = frames idr onwards
++
 +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
 +
 +#ifndef av_mod_uintp2
@@ -2420,46 +3498,66 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +}
 +#   define av_mod_uintp2   av_mod_uintp2_c
 +#endif
++
++#define Y_B_ONLY 0
 +
  const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
  
 +
-+#ifdef RPI_INTER_QPU
++#if RPI_INTER
++
++#define MC_DUMMY_X (-32)
++#define MC_DUMMY_Y (-32)
 +
 +// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
 +// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
 +// For each block of 64*64 the smallest block size is 8x4
 +// We also need an extra command for the setup information
 +
-+#define RPI_CHROMA_COMMAND_WORDS 12
-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
++#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4))
 +// The QPU code for UV blocks only works up to a block width of 8
 +#define RPI_CHROMA_BLOCK_WIDTH 8
 +
-+#define RPI_LUMA_COMMAND_WORDS 10
-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+
 +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
 +
 +// TODO Chroma only needs 4 taps
 +
 +// Actual filter goes -ve, +ve, +ve, -ve using these values
-+static const uint32_t rpi_filter_coefs[8][1] = {
-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
-+        { ENCODE_COEFFS(  2,  58,  10,  2) },
-+        { ENCODE_COEFFS(  4,  54,  16,  2) },
-+        { ENCODE_COEFFS(  6,  46,  28,  4) },
-+        { ENCODE_COEFFS(  4,  36,  36,  4) },
-+        { ENCODE_COEFFS(  4,  28,  46,  6) },
-+        { ENCODE_COEFFS(  2,  16,  54,  4) },
-+        { ENCODE_COEFFS(  2,  10,  58,  2) }
++static const uint32_t rpi_filter_coefs[8] = {
++        ENCODE_COEFFS(  0,  64,   0,  0),
++        ENCODE_COEFFS(  2,  58,  10,  2),
++        ENCODE_COEFFS(  4,  54,  16,  2),
++        ENCODE_COEFFS(  6,  46,  28,  4),
++        ENCODE_COEFFS(  4,  36,  36,  4),
++        ENCODE_COEFFS(  4,  28,  46,  6),
++        ENCODE_COEFFS(  2,  16,  54,  4),
++        ENCODE_COEFFS(  2,  10,  58,  2)
 +};
 +
++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)))
++
 +#endif
 +
 +
 +#ifdef RPI_WORKER
 +
++typedef struct worker_global_env_s
++{
++    volatile int arm_load;
++    pthread_mutex_t lock;
++
++    unsigned int arm_y;
++    unsigned int arm_c;
++    unsigned int gpu_y;
++    unsigned int gpu_c;
++} worker_global_env_t;
++
++static worker_global_env_t worker_global_env =
++{
++    .lock = PTHREAD_MUTEX_INITIALIZER
++};
++
++
 +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
 +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
 +
@@ -2538,17 +3636,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +      break;
 +    }
 +    LOG_ENTER
-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+    rpi_launch_vpu_qpu(s);
-+    // Perform inter prediction
-+    rpi_execute_inter_cmds(s);
-+    // Wait for transform completion
-+    vpu_wait(s->vpu_id);
-+
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
++    worker_core(s);
 +
 +    worker_complete_job(s);
 +    LOG_EXIT
@@ -2561,7 +3649,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
  /**
   * NOTE: Each function hls_foo correspond to the function foo in the
   * specification (HLS stands for High Level Syntax).
-@@ -55,6 +233,32 @@
+@@ -55,6 +243,32 @@
  /* free everything allocated  by pic_arrays_init() */
  static void pic_arrays_free(HEVCContext *s)
  {
@@ -2594,36 +3682,40 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      av_freep(&s->sao);
      av_freep(&s->deblock);
  
-@@ -91,6 +295,87 @@
+@@ -91,6 +305,89 @@
      int ctb_count        = sps->ctb_width * sps->ctb_height;
      int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
  
 +#ifdef RPI
-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
++    const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
++    const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
++    const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
++    const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
 +    int job;
 +
 +    av_assert0(sps);
-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
++//    s->max_ctu_count = sps->ctb_width;
++//    printf("CTB with=%d\n", sps->ctb_width);
++//    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
++    s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width);
++    s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y;
++    s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV;
++
 +    for(job=0;job<RPI_MAX_JOBS;job++) {
-+      printf("Allocated %d\n",coefs_per_row);
-+      for(job=0;job<RPI_MAX_JOBS;job++) {
-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-+        if (!s->coeffs_buf_arm[job][0])
-+            goto fail;
-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-+        if (!s->coeffs_buf_arm[job][2])
-+            goto fail;
-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-+      }
++        for(job=0;job<RPI_MAX_JOBS;job++) {
++            gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
++            s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
++            if (!s->coeffs_buf_arm[job][0])
++                goto fail;
++
++            gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
++            s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
++            s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
++            if (!s->coeffs_buf_arm[job][2])
++                goto fail;
++            s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
++            s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
++        }
 +    }
 +#endif
 +#ifdef RPI_DEBLOCK_VPU
@@ -2670,8 +3762,6 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +
 +            dvq->uv_setup_arm = (void*)p_arm;
 +            dvq->uv_setup_vc = (void*)p_vc;
-+
-+            dvq->cmd_id = -1;
 +        }
 +
 +        s->dvq_n = 0;
@@ -2682,7 +3772,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      s->bs_width  = (width  >> 2) + 1;
      s->bs_height = (height >> 2) + 1;
  
-@@ -137,6 +422,29 @@
+@@ -137,6 +434,29 @@
      return AVERROR(ENOMEM);
  }
  
@@ -2712,7 +3802,52 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
  static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
  {
      int i = 0;
-@@ -677,6 +985,11 @@
+@@ -331,7 +651,7 @@
+ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
+ {
+     #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
+-    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
++    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts;
+     int ret, i;
+ 
+     pic_arrays_free(s);
+@@ -350,6 +670,12 @@
+     switch (sps->pix_fmt) {
+     case AV_PIX_FMT_YUV420P:
+     case AV_PIX_FMT_YUVJ420P:
++#if RPI_HEVC_SAND
++        // Currently geometry calc is stuffed for big sizes
++        if (sps->width < 2048 && sps->height <= 1088) {
++            *fmt++ = AV_PIX_FMT_SAND128;
++        }
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -383,6 +709,7 @@
+         ret = ff_thread_get_format(s->avctx, pix_fmts);
+         if (ret < 0)
+             goto fail;
++
+         s->avctx->pix_fmt = ret;
+     }
+     else {
+@@ -405,11 +732,12 @@
+         for(c_idx = 0; c_idx < c_count; c_idx++) {
+             int w = sps->width >> sps->hshift[c_idx];
+             int h = sps->height >> sps->vshift[c_idx];
++            // ******** Very very nasty allocation kludge for plaited Chroma
+             s->sao_pixel_buffer_h[c_idx] =
+-                av_malloc((w * 2 * sps->ctb_height) <<
++                av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) <<
+                           sps->pixel_shift);
+             s->sao_pixel_buffer_v[c_idx] =
+-                av_malloc((h * 2 * sps->ctb_width) <<
++                av_malloc((h * 2 * sps->ctb_width  * (1 + (c_idx == 1))) <<
+                           sps->pixel_shift);
+         }
+     }
+@@ -677,6 +1005,11 @@
                  (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
                  pred_weight_table(s, gb);
              }
@@ -2724,33 +3859,42 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
  
              sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
              if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
-@@ -934,6 +1247,25 @@
+@@ -934,6 +1267,34 @@
      return 0;
  }
  
 +#ifdef RPI
 +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
 +{
++    // U & V done on U call in the case of sliced frames
++    if (rpi_sliced_frame(s->frame) && c_idx > 1)
++        return;
++
 +    if (s->enable_rpi) {
 +        HEVCLocalContext *lc = s->HEVClc;
 +        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
 +        cmd->type = RPI_PRED_INTRA;
 +        cmd->size = log2_trafo_size;
-+        cmd->c_idx = c_idx;
-+        cmd->x = x0;
-+        cmd->y = y0;
 +        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
-+        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
-+    } else {
++        cmd->c_idx = c_idx;
++        cmd->i_pred.x = x0;
++        cmd->i_pred.y = y0;
++        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
++    }
++    else if (rpi_sliced_frame(s->frame) && c_idx != 0) {
++        s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx);
++    }
++    else {
 +        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
 +    }
++
 +}
 +#endif
 +
  static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                                int xBase, int yBase, int cb_xBase, int cb_yBase,
                                int log2_cb_size, int log2_trafo_size,
-@@ -946,8 +1278,11 @@
+@@ -946,8 +1307,11 @@
      if (lc->cu.pred_mode == MODE_INTRA) {
          int trafo_size = 1 << log2_trafo_size;
          ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
@@ -2763,7 +3907,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      }
  
      if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
-@@ -1033,7 +1368,11 @@
+@@ -1033,7 +1397,11 @@
              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
@@ -2775,7 +3919,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
                  }
                  if (cbf_cb[i])
                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-@@ -1062,7 +1401,11 @@
+@@ -1062,7 +1430,11 @@
              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
@@ -2787,7 +3931,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
                  }
                  if (cbf_cr[i])
                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-@@ -1091,7 +1434,11 @@
+@@ -1091,7 +1463,11 @@
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
                                                      trafo_size_h, trafo_size_v);
@@ -2799,7 +3943,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
                  }
                  if (cbf_cb[i])
                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-@@ -1101,7 +1448,11 @@
+@@ -1101,7 +1477,11 @@
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
                                                  trafo_size_h, trafo_size_v);
@@ -2811,7 +3955,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
                  }
                  if (cbf_cr[i])
                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-@@ -1113,26 +1464,46 @@
+@@ -1113,26 +1493,46 @@
              int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
              int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
              ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
@@ -2858,17 +4002,162 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
              }
          }
      }
-@@ -1335,6 +1706,93 @@
+@@ -1278,47 +1678,120 @@
+     return 0;
+ }
+ 
+-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
++
++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
+ {
+-    HEVCLocalContext *lc = s->HEVClc;
+     GetBitContext gb;
+-    int cb_size   = 1 << log2_cb_size;
+-    int stride0   = s->frame->linesize[0];
+-    uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
+-    int   stride1 = s->frame->linesize[1];
+-    uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+-    int   stride2 = s->frame->linesize[2];
+-    uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
+-
+-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
+-                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
+-                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
+-                          s->ps.sps->pcm.bit_depth_chroma;
+-    const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
+     int ret;
+ 
+-    if (!s->sh.disable_deblocking_filter_flag)
+-        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
+-
+     ret = init_get_bits(&gb, pcm, length);
+     if (ret < 0)
+         return ret;
+ 
+-    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
+-    if (s->ps.sps->chroma_format_idc) {
+-        s->hevcdsp.put_pcm(dst1, stride1,
++#ifdef RPI
++    if (rpi_sliced_frame(s->frame)) {
++        s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0),
++                           s->frame->linesize[0],
++                           cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
++
++        s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]),
++                           s->frame->linesize[1],
+                            cb_size >> s->ps.sps->hshift[1],
+                            cb_size >> s->ps.sps->vshift[1],
+                            &gb, s->ps.sps->pcm.bit_depth_chroma);
+-        s->hevcdsp.put_pcm(dst2, stride2,
+-                           cb_size >> s->ps.sps->hshift[2],
+-                           cb_size >> s->ps.sps->vshift[2],
+-                           &gb, s->ps.sps->pcm.bit_depth_chroma);
+     }
++    else
++#endif
++    {
++        const int stride0   = s->frame->linesize[0];
++        uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
++        const int   stride1 = s->frame->linesize[1];
++        uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++        const int   stride2 = s->frame->linesize[2];
++        uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
++
++        s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
++        if (s->ps.sps->chroma_format_idc) {
++            s->hevcdsp.put_pcm(dst1, stride1,
++                               cb_size >> s->ps.sps->hshift[1],
++                               cb_size >> s->ps.sps->vshift[1],
++                               &gb, s->ps.sps->pcm.bit_depth_chroma);
++            s->hevcdsp.put_pcm(dst2, stride2,
++                               cb_size >> s->ps.sps->hshift[2],
++                               cb_size >> s->ps.sps->vshift[2],
++                               &gb, s->ps.sps->pcm.bit_depth_chroma);
++        }
+ 
++    }
+     return 0;
+ }
+ 
++#ifdef RPI
++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)  // reserve n int16_t in buffer buf_no of the pass0 job
++{
++    int16_t * const coeffs = (buf_no != 3) ?  // buffer 3 is handed out downwards from its base pointer, the others upwards
++        s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] :
++        s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n;
++    s->num_coeffs[s->pass0_job][buf_no] += n;  // running total for this job/buffer; no capacity check is made here
++    return coeffs;  // start (lowest address) of the n-element region just reserved
++}
++#endif
++
++// Returns x * 2^(y*2), i.e. x * (1 << y) * (1 << y); no overflow check is made
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++    return x << (y * 2);
++}
++
++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size)
++{
++    // Handles one I_PCM block: skips its raw samples in the byte stream, then either
++    // queues an RPI_PRED_I_PCM command holding a copy of them or calls pcm_extract().
++    // Length in bits.  Chroma area is cut by hshift + vshift, matching the stock
++    // (cb >> hshift) * (cb >> vshift); vshift alone overcounts 4:2:2 payloads.
++    const unsigned int c_bits = xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size);
++    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++        (c_bits >> (s->ps.sps->hshift[1] + s->ps.sps->vshift[1])) +
++        (c_bits >> (s->ps.sps->hshift[2] + s->ps.sps->vshift[2]));
++    const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3);
++
++    if (!s->sh.disable_deblocking_filter_flag)
++        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
++
++#ifdef RPI
++    if (s->enable_rpi) {
++        // Stash the raw PCM bytes in coeff buffer 0 (blen = payload length in bytes)
++        const int blen = (length + 7) >> 3;
++        // Allocation is in int16_t units; bytes are rounded up to a multiple of 32 to
++        // avoid alignment confusion (1 byte/sample used vs 2 allowed, so it still fits)
++        int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1);
++        memcpy(coeffs, pcm, blen);
++
++        // Our coeff stash assumes that any partially allocated 64byte lump
++        // is zeroed so make that true.
++        {
++            uint8_t * const eopcm = (uint8_t *)coeffs + blen;
++            if ((-(intptr_t)eopcm & 63) != 0)
++                memset(eopcm, 0, -(intptr_t)eopcm & 63);
++        }
++
++        // Add command
++        {
++            HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
++            cmd->type = RPI_PRED_I_PCM;
++            cmd->size = log2_cb_size;
++            cmd->i_pcm.src = coeffs;
++            cmd->i_pcm.x = x0;
++            cmd->i_pcm.y = y0;
++            cmd->i_pcm.src_len = length;
++        }
++        return 0;
++    }
++#endif
++
++    return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size);
++}
++
+ /**
+  * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
+  *
+@@ -1335,6 +1808,91 @@
   * @param luma_offset additive offset applied to the luma prediction value
   */
  
-+#ifdef RPI_INTER
-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
++#if RPI_INTER
 +static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
 +                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
 +                        int block_w, int block_h, int luma_weight, int luma_offset)
 +{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_LUMA_UNI;
 +    cmd->dst = dst;
 +    cmd->dststride = dststride;
@@ -2885,9 +4174,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +
 +static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
 +                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1,
++                       const struct MvField * const current_mv)
 +{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_LUMA_BI;
 +    cmd->dst = dst;
 +    cmd->dststride = dststride;
@@ -2905,17 +4195,17 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +    cmd->ref_idx[1] = current_mv->ref_idx[1];
 +}
 +
-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride,
++                          int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset)
 +{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_CHROMA_UNI;
 +    cmd->dst = dst0;
 +    cmd->dststride = dststride;
 +    cmd->src = src0;
 +    cmd->srcstride = srcstride;
-+    cmd->mv = current_mv->mv[reflist];
++    cmd->mv = *mv;
 +    cmd->x_off = x_off;
 +    cmd->y_off = y_off;
 +    cmd->block_w = block_w;
@@ -2924,10 +4214,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +    cmd->offset = chroma_offset;
 +}
 +
-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
++                         int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx)
 +{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
 +    cmd->dst = dst0;
 +    cmd->dststride = dststride;
@@ -2945,14 +4235,12 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +    cmd->ref_idx[1] = current_mv->ref_idx[1];
 +}
 +
-+#else
-+#define RPI_REDIRECT(fn) fn
 +#endif
 +
  static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
                          AVFrame *ref, const Mv *mv, int x_off, int y_off,
                          int block_w, int block_h, int luma_weight, int luma_offset)
-@@ -1350,6 +1808,10 @@
+@@ -1350,6 +1908,10 @@
                             (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
      int idx              = ff_hevc_pel_weight[block_w];
  
@@ -2963,7 +4251,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      x_off += mv->x >> 2;
      y_off += mv->y >> 2;
      src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-@@ -1396,7 +1858,7 @@
+@@ -1396,7 +1958,7 @@
   * @param mv1 motion vector1 (relative to block position) to get pixel data from
   * @param current_mv current motion vector structure
   */
@@ -2972,7 +4260,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
                         AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
                         int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
  {
-@@ -1420,6 +1882,10 @@
+@@ -1420,6 +1982,10 @@
      uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
      uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
  
@@ -2983,7 +4271,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
          x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
          y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
-@@ -1505,6 +1971,10 @@
+@@ -1505,6 +2071,10 @@
      intptr_t _mx         = mx << (1 - hshift);
      intptr_t _my         = my << (1 - vshift);
  
@@ -2994,7 +4282,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      x_off += mv->x >> (2 + hshift);
      y_off += mv->y >> (2 + vshift);
      src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-@@ -1569,6 +2039,10 @@
+@@ -1569,6 +2139,10 @@
      int hshift = s->ps.sps->hshift[1];
      int vshift = s->ps.sps->vshift[1];
  
@@ -3005,13 +4293,422 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
      intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
      intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
-@@ -1696,14 +2170,14 @@
+@@ -1696,14 +2270,423 @@
      }
  }
  
 -static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
 -                                int nPbW, int nPbH,
 -                                int log2_cb_size, int partIdx, int idx)
++
++#if RPI_INTER
++
++static HEVCRpiLumaPred *  // returns the least-loaded of the QPU_N_GRP_Y entries at s->curr_pred_y
++rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val)
++{
++    HEVCRpiLumaPred * yp = s->curr_pred_y;  // best candidate so far; starts at entry 0
++    HEVCRpiLumaPred * ypt = yp + 1;
++    for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) {
++        if (ypt->load < yp->load)  // strict '<' keeps the earliest entry on ties
++            yp = ypt;
++    }
++
++//        yp->load += load_val;   (weighted form disabled, so load_val is currently unused)
++    ++yp->load;  // each command handed out counts as one unit of load
++    return yp;
++}
++
++static void
++rpi_pred_y(HEVCContext *const s, const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const Mv *const mv,
++           const int weight_mul,
++           const int weight_offset,
++           AVFrame *const src_frame)
++{   // Tiles the nPbW x nPbH luma block (<= 16 wide, <= Y_P_MAX_H high) and fills one qpu_mc_pred_y_t per tile
++    const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0);
++
++//    rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame,
++//                    mv, x0, y0, nPbW, nPbH,
++//                    weight_mul, weight_offset);
++
++    {
++        const unsigned int mx          = mv->x & 3;  // low two MV bits (the part dropped by >> 2 below)
++        const unsigned int my          = mv->y & 3;
++        const unsigned int my_mx       = (my << 8) | mx;
++        const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;  // same my/mx pair in both 16-bit halves
++        const int x1_m3 = x0 + (mv->x >> 2) - 3;  // whole-pel source position, less 3
++        const int y1_m3 = y0 + (mv->y >> 2) - 3;
++        const uint32_t src_vc_address_y = get_vc_address_y(src_frame);
++        uint32_t dst_addr = get_vc_address_y(s->frame) + y_off;
++        const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul);
++
++        // Potentially we could change the assembly code to support taller sizes in one go
++        for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16)
++        {   // NOTE(review): dst_addr advances by a literal 16 rows while start_y steps by Y_P_MAX_H - confirm they agree
++            const uint32_t src_yx_y = y1_m3 + start_y;
++            int start_x = 0;
++            const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H);
++
++#if 1
++            // As Y-pred operates on two independent 8-wide src blocks we can merge
++            // this pred with the previous one if the previous one is 8 pel wide,
++            // the same height as the current block, immediately to the left of our
++            // current dest block and mono-pred.
++
++            qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p;
++            if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr)
++            {
++                const int bw = FFMIN(nPbW, 8);
++                qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx;
++
++                last_y8_lx->next_src2_x = x1_m3;  // second source of the earlier command now reads this block
++                last_y8_lx->next_src2_y = src_yx_y;
++                last_y8_lx->next_src2_base = src_vc_address_y;
++                last_y8_p->p.w += bw;
++                last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21);
++                last_y8_p->p.wo2 = wo;
++
++                s->last_y8_p = NULL;  // a command is merged into at most once
++                s->last_y8_lx = NULL;
++                start_x = bw;  // the first bw columns are now covered by the merged command
++#if RPI_TSTATS
++                ++s->tstats.y_pred1_y8_merge;
++#endif
++            }
++#endif
++
++            for (; start_x < nPbW; start_x += 16)
++            {
++                const int bw = FFMIN(nPbW - start_x, 16);
++                HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7);
++                qpu_mc_pred_y_t *const cmd_lx = yp->last_lx;  // earlier command: its next_src* fields are filled for this tile
++                qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr;  // command slot being filled for this tile
++#if RPI_TSTATS
++                {
++                    HEVCRpiStats *const ts = &s->tstats;
++                    if (mx == 0 && my == 0)
++                        ++ts->y_pred1_x0y0;
++                    else if (mx == 0)
++                        ++ts->y_pred1_x0;
++                    else if (my == 0)
++                        ++ts->y_pred1_y0;
++                    else
++                        ++ts->y_pred1_xy;
++
++                    if (nPbW > 8)
++                        ++ts->y_pred1_wgt8;
++                    else
++                        ++ts->y_pred1_wle8;
++
++                    if (nPbH > 16)
++                        ++ts->y_pred1_hgt16;
++                    else
++                        ++ts->y_pred1_hle16;
++                }
++#endif
++                cmd_y[-1].next_fn = s->qpu_filter;  // function pointer is stored in the slot before cmd_y
++                cmd_lx->next_src1_x = x1_m3 + start_x;
++                cmd_lx->next_src1_y = src_yx_y;
++                cmd_lx->next_src1_base = src_vc_address_y;
++                if (bw <= 8)
++                {
++                    cmd_lx->next_src2_x = MC_DUMMY_X;  // second 8-wide source not needed: point it at the dummy frame
++                    cmd_lx->next_src2_y = MC_DUMMY_Y;
++                    cmd_lx->next_src2_base = s->qpu_dummy_frame;
++                }
++                else
++                {
++                    cmd_lx->next_src2_x = x1_m3 + start_x + 8;  // right-hand 8 columns of this tile
++                    cmd_lx->next_src2_y = src_yx_y;
++                    cmd_lx->next_src2_base = src_vc_address_y;
++                }
++                cmd_y->p.w = bw;
++                cmd_y->p.h = bh;
++                cmd_y->p.mymx21 = my2_mx2_my_mx;
++                cmd_y->p.wo1 = wo;
++                cmd_y->p.wo2 = wo;
++                cmd_y->p.dst_addr =  dst_addr + start_x;
++                yp->last_lx = cmd_y;
++                yp->qpu_mc_curr = cmd_y + 1;
++
++                if (bw == 8) {  // remember an exactly 8-wide command as a merge candidate for the next call
++                    s->last_y8_lx = cmd_lx;
++                    s->last_y8_p = cmd_y;
++                }
++            }
++        }
++    }
++}
++// Queue bi-predicted luma MC for one PU: the QPU-side replacement for luma_mc_bi().
++// The PU is cut into tiles of at most 8 x Y_B_MAX_H pixels; each tile fills one command
++// slot on the luma queue returned by rpi_nxt_pred_y() and is run by s->qpu_filter_b.
++// Source coords/base for a tile are stored in the queue's previous command (last_lx).
++static void
++rpi_pred_y_b(HEVCContext * const s,
++           const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const struct MvField *const mv_field,
++           AVFrame *const src_frame,
++           AVFrame *const src_frame2)
++{
++    const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0);
++    const Mv * const mv  = mv_field->mv + 0;
++    const Mv * const mv2 = mv_field->mv + 1;
++
++    {
++        const unsigned int mx          = mv->x & 3;   // low 2 bits: fractional part of L0 MV
++        const unsigned int my          = mv->y & 3;
++        const unsigned int my_mx = (my<<8) | mx;
++        const unsigned int mx2          = mv2->x & 3; // low 2 bits: fractional part of L1 MV
++        const unsigned int my2          = mv2->y & 3;
++        const unsigned int my2_mx2 = (my2<<8) | mx2;
++        const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; // L1 in high half, L0 in low half
++        const int x1 = x0 + (mv->x >> 2) - 3;  // integer MV part, backed off by 3 (presumably filter lead-in)
++        const int y1 = y0 + (mv->y >> 2) - 3;
++        const int x2 = x0 + (mv2->x >> 2) - 3;
++        const int y2 = y0 + (mv2->y >> 2) - 3;
++        const unsigned int ref_idx0 = mv_field->ref_idx[0];
++        const unsigned int ref_idx1 = mv_field->ref_idx[1];
++        const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] +
++                     s->sh.luma_offset_l1[ref_idx1] + 1;
++        const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
++        const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
++
++        uint32_t dst = get_vc_address_y(s->frame) + y_off;
++        const uint32_t src1_base = get_vc_address_y(src_frame);
++        const uint32_t src2_base = get_vc_address_y(src_frame2);
++
++        for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H)
++        {
++            const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H);
++
++            for (int start_x=0; start_x < nPbW; start_x += 8)
++            { // B blocks work 8 at a time
++                HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7);
++                qpu_mc_pred_y_t *const cmd_lx = yp->last_lx;
++                qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr;
++#if RPI_TSTATS
++              {
++                  HEVCRpiStats *const ts = &s->tstats;
++                  const unsigned int mmx = mx | mx2;
++                  const unsigned int mmy = my | my2;
++                  if (mmx == 0 && mmy == 0)
++                      ++ts->y_pred2_x0y0;
++                  else if (mmx == 0)
++                      ++ts->y_pred2_x0;
++                  else if (mmy == 0)
++                      ++ts->y_pred2_y0;
++                  else
++                      ++ts->y_pred2_xy;
++
++                  if (nPbH > 16)
++                      ++ts->y_pred2_hgt16;
++                  else
++                      ++ts->y_pred2_hle16;
++              }
++#endif
++              cmd_y[-1].next_fn = s->qpu_filter_b;
++              cmd_lx->next_src1_x = x1 + start_x;
++              cmd_lx->next_src1_y = y1 + start_y;
++              cmd_lx->next_src1_base = src1_base;
++              cmd_lx->next_src2_x = x2 + start_x;
++              cmd_lx->next_src2_y = y2 + start_y;
++              cmd_lx->next_src2_base = src2_base;
++              cmd_y->p.w = FFMIN(nPbW - start_x, 8);
++              cmd_y->p.h = bh;
++              cmd_y->p.mymx21 = my2_mx2_my_mx;
++              cmd_y->p.wo1 = wo1;
++              cmd_y->p.wo2 = wo2;
++              cmd_y->p.dst_addr =  dst + start_x;
++              yp->last_lx = cmd_y;
++              yp->qpu_mc_curr = cmd_y + 1;
++          }
++          dst += s->frame->linesize[0] * Y_B_MAX_H; // advance by the same row count as start_y
++        }
++    }
++}
++
++// Pick the chroma QPU queue with the smallest load (earliest wins ties) and charge it one unit
++static HEVCRpiChromaPred *
++rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val)
++{
++    HEVCRpiChromaPred * const first = s->curr_pred_c;
++    HEVCRpiChromaPred * best = first;
++    for (unsigned int n = 1; n != QPU_N_GRP_UV; ++n) {
++        if (first[n].load < best->load)
++            best = first + n;
++    }
++    // load_val is deliberately unused: weighting by it was noticeably better, but
++    // Q length problems are unresolved, so each request counts as a single unit
++    best->load += 1;
++    return best;
++}
++// Queue uni-predicted chroma MC for one PU (QPU-side replacement for chroma_mc_uni()).
++// U and V share a single command per tile: one source/dest address plus separate
++// wo_u / wo_v weight+offset words.  Tiles are at most RPI_CHROMA_BLOCK_WIDTH x 16.
++static void
++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const Mv * const mv,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  AVFrame * const src_frame)
++{
++    const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c);
++#if 0
++    av_assert0(s->frame->linesize[1] == s->frame->linesize[2]);
++
++    rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1],
++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
++                c_weights[0], c_offsets[0]);
++
++    rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2],
++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
++                c_weights[1], c_offsets[1]);
++#endif
++    {
++        const int hshift           = s->ps.sps->hshift[1];
++        const int vshift           = s->ps.sps->vshift[1];
++
++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical MV scales by vshift, as y_coeffs does
++        const uint32_t src_base_u = get_vc_address_u(src_frame);
++        const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
++        const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
++        const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]);
++        const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]);
++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;
++
++        for(int start_y=0;start_y < nPbH_c;start_y+=16)
++        {
++            const int bh = FFMIN(nPbH_c-start_y, 16);
++
++            for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
++            {
++                HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3);
++                qpu_mc_pred_c_t * const u = cp->qpu_mc_curr;
++                qpu_mc_pred_c_t * const last_l0 = cp->last_l0;
++                const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++                u[-1].next_fn  = s->qpu_filter_uv;
++                last_l0->next_src_x = x1_c + start_x;
++                last_l0->next_src_y = y1_c + start_y;
++                last_l0->next_src_base_c = src_base_u;
++                u[0].p.h = bh;
++                u[0].p.w = bw;
++                u[0].p.coeffs_x = x_coeffs;
++                u[0].p.coeffs_y = y_coeffs;
++                u[0].p.wo_u = wo_u;
++                u[0].p.wo_v = wo_v;
++                u[0].p.dst_addr_c = dst_base_u + start_x * 2;
++                cp->last_l0 = u;
++                cp->qpu_mc_curr = u + 1;
++            }
++
++            dst_base_u += s->frame->linesize[1] * 16;
++        }
++    }
++}
++// Queue bi-predicted chroma MC for one PU (QPU-side replacement for chroma_mc_bi()).
++// Each tile takes two consecutive command slots: u[0] (b0) carries the L0 filter
++// coefficients and weights, u[1] (b1) the L1 coefficients, combined offsets and the
++// destination.  L0/L1 source fetches are chained through cp->last_l0 / cp->last_l1.
++static void
++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const struct MvField * const mv_field,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  const int16_t * const c_weights2,
++  const int16_t * const c_offsets2,
++  AVFrame * const src_frame,
++  AVFrame * const src_frame2)
++{
++    const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c);
++#if 0
++    rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2,
++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0);
++
++    rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2,
++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1);
++#endif
++    {
++        const int hshift = s->ps.sps->hshift[1];
++        const int vshift = s->ps.sps->vshift[1];
++        const Mv * const mv = mv_field->mv + 0;
++        const Mv * const mv2 = mv_field->mv + 1;
++
++        const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
++        const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
++        const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++        const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical MV scales by vshift, as my does
++
++        const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
++        const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
++        const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++        const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
++        const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
++        const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1; // vertical MV scales by vshift, as my2 does
++
++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;
++
++        for (int start_y = 0; start_y < nPbH_c; start_y += 16) {
++          const unsigned int bh = FFMIN(nPbH_c-start_y, 16);
++          // We are allowed 3/4 powers of two as well as powers of 2
++          av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2);
++
++          for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) {
++              const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++              HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3);
++              qpu_mc_pred_c_t * const u = cp->qpu_mc_curr;
++              qpu_mc_pred_c_t * const last_l0 = cp->last_l0;
++              qpu_mc_pred_c_t * const last_l1 = cp->last_l1;
++
++              u[-1].next_fn = s->qpu_filter_uv_b0;
++              last_l0->next_src_x = x1_c + start_x;
++              last_l0->next_src_y = y1_c + start_y;
++              last_l0->next_src_base_c = get_vc_address_u(src_frame);
++
++              u[0].next_fn = 0;  // Ignored - 2 block cmd
++              u[0].next_src_x = x2_c + start_x;
++              u[0].next_src_y = y2_c + start_y;
++              u[0].next_src_base_c = get_vc_address_u(src_frame2);
++
++              u[0].b0.h = (bh<16 ? bh : 16);
++              u[0].b0.w = (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH);
++              u[0].b0.coeffs_x = coefs0_x;
++              u[0].b0.coeffs_y = coefs0_y;
++              u[0].b0.weight_u = c_weights[0]; // Weight L0 U
++              u[0].b0.weight_v = c_weights[1]; // Weight L0 V
++              u[0].b0.dummy0 = 0;  // Intermediate results are not written back in first pass of B filtering
++
++              last_l1->next_src_x = x2_c + start_x;
++              last_l1->next_src_y = y2_c + start_y;
++              last_l1->next_src_base_c = get_vc_address_u(src_frame2);
++
++              u[1].b1.dummy0 = 0;  // w,h inherited from b0
++              u[1].b1.coeffs_x = coefs1_x;
++              u[1].b1.coeffs_y = coefs1_y;
++              u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]);
++              u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]);
++              u[1].b1.dst_addr_c = dst_base_u + start_x * 2;
++
++              cp->last_l0 = u;
++              cp->last_l1 = u + 1;
++              cp->qpu_mc_curr = u + 2;
++          }
++
++          dst_base_u += s->frame->linesize[1] * 16;
++        }
++    }
++}
++#endif
++
++
++
 +static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
 +                                const int nPbW, const int nPbH,
 +                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
@@ -3024,7 +4721,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      int merge_idx = 0;
      struct MvField current_mv = {{{ 0 }}};
  
-@@ -1721,8 +2195,7 @@
+@@ -1721,8 +2704,7 @@
      int y_cb             = y0 >> log2_min_cb_size;
      int x_pu, y_pu;
      int i, j;
@@ -3034,315 +4731,112 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
  
      if (!skip_flag)
          lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
-@@ -1766,16 +2239,89 @@
+@@ -1766,12 +2748,29 @@
          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
  
 -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
-+#ifdef RPI_LUMA_QPU
++#if RPI_INTER
 +        if (s->enable_rpi) {
-+            const Mv * const mv    = &current_mv.mv[0];
-+            const unsigned int mx          = mv->x & 3;
-+            const unsigned int my          = mv->y & 3;
-+            const unsigned int my_mx       = (my<<8) | mx;
-+            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
-+            const int x1_m3 = x0 + (mv->x >> 2) - 3;
-+            const int y1_m3 = y0 + (mv->y >> 2) - 3;
-+            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
-+            uint32_t * y = s->curr_y_mvs;
-+
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
-+
-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  const int bw = nPbW-start_x;
-+                  const int bh = nPbH-start_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                }
-+            }
-+            s->curr_y_mvs = y;
++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
++              s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
++              ref0->frame);
 +        } else
 +#endif
 +        {
-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
                      &current_mv.mv[0], x0, y0, nPbW, nPbH,
                      s->sh.luma_weight_l0[current_mv.ref_idx[0]],
                      s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
 +        }
  
          if (s->ps.sps->chroma_format_idc) {
--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-+#ifdef RPI_INTER_QPU
-+          if (s->enable_rpi) {
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[0];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                uint32_t *u = s->curr_u_mvs;
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->curr_u_mvs = u;
++#if RPI_INTER
++            if (s->enable_rpi) {
++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
++                  s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++                  ref0->frame);
 +                return;
 +            }
 +#endif
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
                            0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
                            s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
-         }
-@@ -1785,17 +2331,89 @@
+@@ -1785,12 +2784,29 @@
          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
  
 -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
-+#ifdef RPI_LUMA_QPU
++#if RPI_INTER
 +        if (s->enable_rpi) {
-+            const int reflist = 1;
-+            const Mv *mv    = &current_mv.mv[reflist];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            uint32_t *y = s->curr_y_mvs;
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
-+                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                }
-+            }
-+            s->curr_y_mvs = y;
++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
++              s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
++              ref1->frame);
 +        } else
 +#endif
-+
 +        {
-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
                      &current_mv.mv[1], x0, y0, nPbW, nPbH,
                      s->sh.luma_weight_l1[current_mv.ref_idx[1]],
                      s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
 +        }
  
          if (s->ps.sps->chroma_format_idc) {
--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-+#ifdef RPI_INTER_QPU
++#if RPI_INTER
 +            if (s->enable_rpi) {
-+                const int reflist = 1;
-+                const int hshift           = s->ps.sps->hshift[1];
-+                const int vshift           = s->ps.sps->vshift[1];
-+                const Mv * const mv        = &current_mv.mv[reflist];
-+                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                const intptr_t _mx         = mx << (1 - hshift);
-+                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+
-+                const int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                const int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                uint32_t * u = s->curr_u_mvs;
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      const int bw = nPbW_c-start_x;
-+                      const int bh = nPbH_c-start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->curr_u_mvs = u;
++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
++                  s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++                  ref1->frame);
 +                return;
 +            }
 +#endif
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
                            1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
                            s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
- 
--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
-         }
-@@ -1805,15 +2423,118 @@
+@@ -1805,11 +2821,31 @@
          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
  
 -        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi && 0) {
-+            const Mv *mv    = &current_mv.mv[0];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            const Mv *mv2    = &current_mv.mv[1];
-+            int mx2          = mv2->x & 3;
-+            int my2          = mv2->y & 3;
-+            int my2_mx2 = (my2<<8) + mx2;
-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            int x2 = x0 + (mv2->x >> 2);
-+            int y2 = y0 + (mv2->y >> 2);
-+            uint32_t *y = s->curr_y_mvs;
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-+                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
-+                  *y++ = my2_mx2_my_mx;
-+
-+                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-+                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
-+                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
-+
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                }
-+            }
-+            s->curr_y_mvs = y;
++#if RPI_INTER
++        if (s->enable_rpi) {
++            rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
 +        } else
 +#endif
 +        {
-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
++            luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
                     ref1->frame, &current_mv.mv[1], &current_mv);
 +        }
  
          if (s->ps.sps->chroma_format_idc) {
--            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-+#ifdef RPI_INTER_QPU
++#if RPI_INTER
 +          if (s->enable_rpi) {
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[0];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                const Mv *mv2         = &current_mv.mv[1];
-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
-+                intptr_t _mx2         = mx2 << (1 - hshift);
-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
-+
-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
-+
-+
-+                uint32_t *u = s->curr_u_mvs;
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
-+                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
-+                      *u++ = 0;
-+
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx2][0];
-+                      *u++ = rpi_filter_coefs[_my2][0];
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->curr_u_mvs = u;
++              rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
++                           &current_mv,
++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
++                           s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
++                           s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++                           ref0->frame,
++                           ref1->frame);
 +                return;
 +            }
 +#endif
-+            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+             chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
                           x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
  
--            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-+            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
-         }
-     }
-@@ -2307,6 +3028,734 @@
+@@ -2084,7 +3120,9 @@
+                 intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
+                 ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
+                 if (s->ps.sps->pcm.loop_filter_disable_flag)
++                {
+                     set_deblocking_bypass(s, x0, y0, log2_cb_size);
++                }
+ 
+                 if (ret < 0)
+                     return ret;
+@@ -2307,6 +3345,529 @@
      lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
  }
  
@@ -3359,6 +4853,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +    s->num_dblk_cmds[job] = 0;
 +}
 +
++#if 0
 +static void rpi_execute_transform(HEVCContext *s)
 +{
 +    int i=2;
@@ -3374,7 +4869,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +        s->hevcdsp.idct[5-2](coeffs, 32);
 +    }*/
 +
-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
++    rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
 +    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
 +                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
 +                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
@@ -3385,12 +4880,16 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +    for(i=0;i<4;i++)
 +        s->num_coeffs[job][i] = 0;
 +}
++#endif
++
 +
-+static void rpi_execute_pred_cmds(HEVCContext *s)
++// I-pred, transform_and_add for all block types done here
++// All ARM
++static void rpi_execute_pred_cmds(HEVCContext * const s)
 +{
 +  int i;
 +  int job = s->pass1_job;
-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
++  const HEVCPredCmd *cmd = s->univ_pred_cmds[job];
 +#ifdef RPI_WORKER
 +  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
 +#else
@@ -3398,43 +4897,65 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +#endif
 +
 +  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
-+      if (cmd->type == RPI_PRED_INTRA) {
-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+      } else {
-+#ifdef RPI_PRECLEAR
-+          int trafo_size = 1 << cmd->size;
-+#endif
-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
++//      printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
++
++      switch (cmd->type)
++      {
++          case RPI_PRED_INTRA:
++              lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
++              lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
++              lc->na.cand_left         = (cmd->na >> 3) & 1;
++              lc->na.cand_up_left      = (cmd->na >> 2) & 1;
++              lc->na.cand_up           = (cmd->na >> 1) & 1;
++              lc->na.cand_up_right     = (cmd->na >> 0) & 1;
++              if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0)
++                  s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++              else
++                  s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++              break;
++
++          case RPI_PRED_ADD_RESIDUAL:
++              s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
 +#ifdef RPI_PRECLEAR
-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
++              memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache
 +#endif
++              break;
++          case RPI_PRED_ADD_RESIDUAL_U:
++              s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++              break;
++          case RPI_PRED_ADD_RESIDUAL_V:
++              s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++              break;
++
++          case RPI_PRED_I_PCM:
++              pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++              break;
++
++          default:
++              av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++              abort();
 +      }
 +  }
 +  s->num_pred_cmds[job] = 0;
 +}
 +
-+static void rpi_execute_inter_cmds(HEVCContext *s)
++// Do any inter-pred that we want to do in software
++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here
++// All ARM
++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only)
 +{
-+    int job = s->pass1_job;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-+    int n,cidx;
++    unsigned int cidx;
 +    AVFrame myref;
 +    AVFrame myref1;
 +    struct MvField mymv;
-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
-+        printf("Overflow inter_cmds\n");
-+        exit(-1);
-+    }
-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
++
++    for(; n>0 ; n--, cmd++) {
++        av_assert0(0);
++
 +        switch(cmd->cmd) {
 +        case RPI_CMD_LUMA_UNI:
++            if (b_only)
++                break;
 +            myref.data[0] = cmd->src;
 +            myref.linesize[0] = cmd->srcstride;
 +            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
@@ -3451,6 +4972,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +                       &myref1, &cmd->mv1, &mymv);
 +            break;
 +        case RPI_CMD_CHROMA_UNI:
++            if (b_only)
++                break;
 +            mymv.mv[0] = cmd->mv;
 +            chroma_mc_uni(s, cmd->dst,
 +                          cmd->dststride, cmd->src, cmd->srcstride, 0,
@@ -3472,618 +4995,385 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +            break;
 +        }
 +    }
-+    s->num_mv_cmds[job] = 0;
 +}
 +
-+static void rpi_do_all_passes(HEVCContext *s)
++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only)
 +{
-+    // Kick off QPUs and VPUs
-+    rpi_launch_vpu_qpu(s);
-+    // Perform luma inter prediction
-+    rpi_execute_inter_cmds(s);
-+    // Wait for transform completion
-+    vpu_wait(s->vpu_id);
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
-+    // Prepare next batch
-+    rpi_begin(s);
++    const int job = s->pass1_job;
++
++    if (!qpu_luma || luma_b_only)
++        do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma);
++    s->num_mv_cmds_y[job] = 0;
++    if (!qpu_chroma || chroma_b_only)
++        do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma);
++    s->num_mv_cmds_c[job] = 0;
 +}
 +
 +#endif
 +
 +#ifdef RPI
++// Set initial uniform job values & zero ctu_count
 +static void rpi_begin(HEVCContext *s)
 +{
++#if RPI_INTER
 +    int job = s->pass0_job;
 +    int i;
-+#ifdef RPI_INTER_QPU
-+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+
-+    for(i=0;i<8;i++) {
-+        s->u_mvs[job][i] = s->mvs_base[job][i];
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = pic_width;
-+        *s->u_mvs[job][i]++ = pic_height;
-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
-+        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+    }
-+    s->curr_u_mvs = s->u_mvs[job][0];
-+#endif
 +
-+#ifdef RPI_LUMA_QPU
-+    for(i=0;i<12;i++) {
-+        // This needs to have a generally similar structure to the
-+        // actual filter code as various pipelined bits need to land correctly
-+        // when inserted by the filter requests
-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
-+        *s->y_mvs[job][i]++ = 0; // y_x
-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
-+        *s->y_mvs[job][i]++ = 0; // y2_x2
-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
-+        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight demon + 6
-+        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
-+        *s->y_mvs[job][i]++ = 0; // Next kernel
++    const uint16_t pic_width_y        = s->ps.sps->width;
++    const uint16_t pic_height_y       = s->ps.sps->height;
++
++    const uint16_t pic_width_c        = s->ps.sps->width >> s->ps.sps->hshift[1];
++    const uint16_t pic_height_c       = s->ps.sps->height >> s->ps.sps->vshift[1];
++
++    for(i=0; i < QPU_N_UV;i++) {
++        HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i;
++        qpu_mc_pred_c_t * u = cp->qpu_mc_base;
++
++        // Chroma setup is a double block with L0 fetch
++        // and other stuff in the 1st block and L1 fetch
++        // in the 2nd along with a lot of dummy vars
++        // This could be packed a lot tighter but it would make
++        // L0, L1 management a lot harder
++
++        u->next_fn = 0;
++        u->next_src_x = 0;
++        u->next_src_y = 0;
++        u->next_src_base_c = 0;
++        u->s0.pic_cw = pic_width_c;
++        u->s0.pic_ch = pic_height_c;
++        u->s0.stride2 = rpi_sliced_frame_stride2(s->frame);
++        u->s0.stride1 = s->frame->linesize[1];
++        u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6;
++        u->s0.dummy0 = 0;
++        cp->last_l0 = u;
++        ++u;
++
++        u->next_fn = 0;
++        u->next_src_x = 0;
++        u->next_src_y = 0;
++        u->next_src_base_c = 0;
++        u->s1.dummy0 = 0;
++        u->s1.dummy1 = 0;
++        u->s1.dummy2 = 0;
++        u->s1.dummy3 = 0;
++        u->s1.dummy4 = 0;
++        u->s1.dummy5 = 0;
++        cp->last_l1 = u;
++        ++u;
++
++        cp->load = 0;
++        cp->qpu_mc_curr = u;
++    }
++    s->curr_pred_c = NULL;
++
++    for(i=0;i < QPU_N_Y;i++) {
++        HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i;
++        qpu_mc_pred_y_t * y = yp->qpu_mc_base;
++
++        y->next_src1_x = 0;
++        y->next_src1_y = 0;
++        y->next_src1_base = 0;
++        y->next_src2_x = 0;
++        y->next_src2_y = 0;
++        y->next_src2_base = 0;
++        y->s.pic_h = pic_height_y;
++        y->s.pic_w = pic_width_y;
++        y->s.stride2 = rpi_sliced_frame_stride2(s->frame);
++        y->s.stride1 = s->frame->linesize[0];
++        y->s.wdenom = s->sh.luma_log2_weight_denom + 6;
++        y->s.dummy0 = 0;
++        y->next_fn = 0;
++        yp->last_lx = y;
++        ++y;
++
++        yp->load = 0;
++        yp->qpu_mc_curr = y;
 +    }
-+    s->curr_y_mvs = s->y_mvs[job][0];
++    s->curr_pred_y = NULL;
++    s->last_y8_p = NULL;
++    s->last_y8_lx = NULL;
 +#endif
 +    s->ctu_count = 0;
 +}
 +#endif
 +
-+#ifdef RPI_SIMULATE_QPUS
 +
-+static int32_t clipx(int x,int FRAME_WIDTH)
++#if RPI_INTER
++static unsigned int mc_terminate_y(HEVCContext * const s, const int job)
 +{
-+	if (x<=0) return 0;
-+	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
-+	return x;
-+}
++    unsigned int i;
++    const uint32_t exit_fn = qpu_fn(mc_exit);
++    const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12);
++    unsigned int tc = 0;
++    HEVCRpiJob * const jb = s->jobs + job;
++
++    // Add final commands to Q
++    for(i = 0; i != QPU_N_Y; ++i) {
++        HEVCRpiLumaPred * const yp = jb->luma_mvs + i;
++        qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx;
++
++        // We will always have had L0 if we have L1 so only test L0
++        if (px != yp->qpu_mc_base)
++            tc = 1;
++
++        yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
++
++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++        px->next_src1_x = MC_DUMMY_X;
++        px->next_src1_y = MC_DUMMY_Y;
++        px->next_src1_base = s->qpu_dummy_frame;
++        px->next_src2_x = MC_DUMMY_X;
++        px->next_src2_y = MC_DUMMY_Y;
++        px->next_src2_base = s->qpu_dummy_frame;
++
++        yp->last_lx = NULL;
++    }
 +
-+static int32_t clipy(int y,int FRAME_HEIGHT)
-+{
-+	if (y<=0) return 0;
-+	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
-+	return y;
++    return tc;
 +}
 +
-+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
-+{
-+   int32_t vsum = 0;
-+   int x, y;
++#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c
++#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n)
 +
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job)
++{
++    unsigned int i;
++    const uint32_t exit_fn = qpu_fn(mc_exit_c);
++    const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV));
++    unsigned int tc = 0;
++    HEVCRpiJob * const jb = s->jobs + job;
++
++    // Add final commands to Q
++    for(i = 0; i != QPU_N_UV; ++i) {
++        HEVCRpiChromaPred * const cp = jb->chroma_mvs + i;
++        qpu_mc_pred_c_t *const p0 = cp->last_l0;
++        qpu_mc_pred_c_t *const p1 = cp->last_l1;
++
++        // We will always have had L0 if we have L1 so only test L0
++        if (p0 != cp->qpu_mc_base)
++            tc = 1;
++
++        cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
++
++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++        p0->next_src_x = MC_DUMMY_X;
++        p0->next_src_y = MC_DUMMY_Y;
++        p0->next_src_base_c = s->qpu_dummy_frame;
++        p1->next_src_x = MC_DUMMY_X;
++        p1->next_src_y = MC_DUMMY_Y;
++        p1->next_src_base_c = s->qpu_dummy_frame;;
++
++        cp->last_l0 = NULL;
++        cp->last_l1 = NULL;
++    }
 +
-+      for (x = 0; x < 8; x++)
-+         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
++    return tc;
++}
++#endif
 +
-+      vsum += lumaFilter[my][y]*hsum;
-+   }
-+   vsum >>= 6;
-+   vsum = (((vsum*weight)+round)>>denom)+offset;
++#ifdef RPI
 +
-+   return av_clip_uint8( vsum );
-+}*/
 +
-+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
++static void flush_frame(HEVCContext *s,AVFrame *frame)
 +{
-+  int32_t vsum = 0;
-+  int x, y;
-+  int chromaFilterH[4];
-+  int chromaFilterV[4];
-+  int i;
-+  int offset_after = offset_weight>>16;
-+  int weight = (offset_weight<<16)>>16;
-+  for(i=0;i<4;i++) {
-+    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
-+    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
-+  }
-+
-+   for (y = 0; y < 4; y++) {
-+      int32_t hsum = 0;
++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++  rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++  rpi_cache_flush_finish(rfe);
++}
 +
-+      for (x = 0; x < 4; x++)
-+         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
 +
-+      vsum += chromaFilterV[y]*hsum;
-+   }
-+   vsum >>= 6;
-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
++// Core execution tasks
++static void worker_core(HEVCContext * const s)
++{
++    worker_global_env_t * const wg = &worker_global_env;
++    int arm_cost = 0;
++//    vpu_qpu_wait_h sync_c;
++    vpu_qpu_wait_h sync_y;
++    int qpu_luma = 0;
++    int qpu_chroma = 0;
++    int gpu_load;
++    int arm_load;
++    static const int arm_const_cost = 2;
++
++//    static int z = 0;
++
++    const int job = s->pass1_job;
++    unsigned int flush_start = 0;
++    unsigned int flush_count = 0;
++
++    const vpu_qpu_job_h vqj = vpu_qpu_job_new();
++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++
++    if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) {
++        vpu_qpu_job_add_vpu(vqj,
++            vpu_get_fn(),
++            vpu_get_constants(),
++            s->coeffs_buf_vc[job][2],
++            s->num_coeffs[job][2] >> 8,
++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
++            s->num_coeffs[job][3] >> 10,
++            0);
 +
-+   return vsum;
-+}
++        rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++    }
 +
-+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
 +
-+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-+{
-+  int32_t vsum = 0;
-+  int x, y;
-+  int i;
-+  int offset_after = offset_weight>>16;
-+  int weight = (offset_weight<<16)>>16;
++#if RPI_INTER
++    pthread_mutex_lock(&wg->lock);
 +
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
++//    ++z;
++    gpu_load = vpu_qpu_current_load();
++    arm_load = avpriv_atomic_int_get(&wg->arm_load);
++#if 0 // Y_B_ONLY
++    qpu_luma =  gpu_load + 2 < arm_load;
++    qpu_chroma = gpu_load < arm_load + 8;
++#elif 0
++    qpu_luma =  gpu_load < arm_load + 2;
++    qpu_chroma = gpu_load < arm_load + 8;
++#else
++    qpu_chroma = 1;
++    qpu_luma = 1;
++#endif
 +
-+      for (x = 0; x < 8; x++)
-+         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
++    arm_cost = !qpu_chroma * 2 + !qpu_luma * 3;
++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost);
 +
-+      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
-+   }
-+   vsum >>= 6;
-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
++    wg->gpu_c += qpu_chroma;
++    wg->gpu_y += qpu_luma;
++    wg->arm_c += !qpu_chroma;
++    wg->arm_y += !qpu_luma;
 +
-+   return vsum;
-+}
 +
-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
-+{
-+  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
-+  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
-+  int pitch = frame->linesize[cIdx];
-+  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
-+    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
-+  if (p>=base && p<base+pitch*pic_height) {
-+    return frame->data[cIdx] + (p-base);
-+  }
-+  return NULL;
-+}
++//    if ((z & 511) == 0) {
++//        printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d    \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y);
++//    }
 +
-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
-+{
-+  SliceHeader *sh   = &s->sh;
-+  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
-+  int i;
-+  if (arm) return arm;
-+  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
-+  {
-+    for(i=0;i<sh->nb_refs[L0];i++) {
-+      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
-+      if (arm) return arm;
-+    }
-+  }
-+  if (sh->slice_type == B_SLICE) {
-+    for(i=0;i<sh->nb_refs[L1];i++) {
-+      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
-+      if (arm) return arm;
-+    }
-+  }
-+  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
-+  exit(-1);
-+  return NULL;
-+}
 +
-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-+{
-+  uint32_t next_kernel;
-+  uint32_t x0;
-+  uint32_t y0;
-+  uint8_t *ref_u_base;
-+  uint8_t *ref_v_base;
-+  uint32_t frame_width = p[5];
-+  uint32_t frame_height = p[6];
-+  uint32_t pitch = p[7];
-+  uint32_t dst_pitch = p[8];
-+  int32_t offset_before = p[9];
-+  int32_t denom = p[10];
-+  uint32_t vpm_id = p[11];
-+  uint32_t tmp_u_dst[256];
-+  uint32_t tmp_v_dst[256];
-+  while(1) {
-+    p += 12;
-+    next_kernel = p[0-12];
-+    x0 = p[1-12];
-+    y0 = p[2-12];
-+    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
-+      int x,y;
-+      uint32_t width_height = p[5];
-+      uint32_t hcoeffs = p[6];
-+      uint32_t vcoeffs = p[7];
-+      uint32_t offset_weight_u = p[8];
-+      uint32_t offset_weight_v = p[9];
-+      uint8_t *this_u_dst;
-+      uint8_t *this_v_dst;
-+      uint32_t width = width_height >> 16;
-+      uint32_t height = (width_height << 16) >> 16;
-+      ref_u_base = compute_arm_addr(s,p[3-12],1);
-+      ref_v_base = compute_arm_addr(s,p[4-12],2);
-+      if (next_kernel!=s->mc_filter_uv_b0)
-+      {
-+        this_u_dst = compute_arm_addr(s,p[10],1);
-+        this_v_dst = compute_arm_addr(s,p[11],2);
-+      }
-+      for (y=0; y<height; ++y) {
-+        for (x=0; x<width; ++x) {
-+          if (next_kernel==s->mc_filter_uv) {
-+            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
-+            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+          } else if (next_kernel==s->mc_filter_uv_b0) {
-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+            tmp_u_dst[x+y*16] = refa;
-+            tmp_v_dst[x+y*16] = refb;
-+          } else {
-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+          }
++    {
++        int (*d)[2] = s->dblk_cmds[job];
++        unsigned int high=(*d)[1];
++        int n;
++
++        flush_start = high;
++        for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
++            unsigned int y = (*d)[1];
++            flush_start = FFMIN(flush_start, y);
++            high=FFMAX(high,y);
 +        }
-+      }
-+    } else {
-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+      break;
++        // Avoid flushing past end of frame
++        flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start;
 +    }
-+  }
-+}
-+
-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
-+{
-+  uint32_t next_kernel;
-+  int y_x,y2_x2;
-+  int x0;
-+  int y0;
-+  int x2;
-+  int y2;
-+  uint32_t *p0 = p;
-+  uint8_t *ref_y_base;
-+  uint8_t *ref_y2_base;
-+  uint32_t frame_width_height = p[4];
-+  uint32_t frame_width = frame_width_height>>16;
-+  uint32_t frame_height = (frame_width_height<<16)>>16;
-+  uint32_t pitch = p[5];
-+  uint32_t dst_pitch = p[6];
-+  int offset_shift = p[7];
-+  int32_t offset_before = offset_shift>>16;
-+  int32_t denom = (offset_shift<<16)>>16;
-+  while(1) {
-+    p += 9;
-+    next_kernel = p[8-9];
-+    y_x = p[0-9];
-+    x0 = (y_x<<16)>>16;
-+    y0 = y_x>>16;
-+    y2_x2 = p[2-9];
-+    x2 = (y2_x2<<16)>>16;
-+    y2 = y2_x2>>16;
-+
-+    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
-+      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+      int x,y;
-+      uint32_t width_height = p[4];
-+      uint32_t my2_mx2_my_mx = p[5];
-+      uint32_t offset_weight = p[6];
-+      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-+      uint32_t width = width_height >> 16;
-+      uint32_t height = (width_height << 16) >> 16;
-+      uint8_t *dst_base = s->frame->data[0];
-+      ref_y_base = compute_arm_addr(s,p[1-9],0);
-+      ref_y2_base = compute_arm_addr(s,p[3-9],0);
-+      for (y=0; y<height; ++y) {
-+        for (x=0; x<width; ++x) {
-+          if (next_kernel==s->mc_filter) {
-+            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+            refa = av_clip_uint8(refa);
-+            this_dst[x+y*dst_pitch] = refa;
-+          }
-+          else {
-+            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-+            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+          }
-+        }
-+      }
-+    } else {
-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+      break;
-+    }
-+  }
-+}
 +
-+static void rpi_simulate_inter_qpu(HEVCContext *s)
-+{
-+  // First run the transform as normal
-+  int i;
-+  rpi_execute_transform(s);
-+  for(i=0;i<8;i++)
-+  {
-+    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
-+  }
-+  for(i=0;i<12;i++)
-+  {
-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-+  }
-+}
++#if !DISABLE_CHROMA
++    if (qpu_chroma && mc_terminate_uv(s, job) != 0)
++    {
++        HEVCRpiJob * const jb = s->jobs + job;
++        const uint32_t code = qpu_fn(mc_setup_c);
++        uint32_t * p;
++        unsigned int i;
++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
 +
-+#endif
++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
++            *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm);
++            *p++ = code;
++        }
 +
-+#ifdef RPI_INTER_QPU
++        vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv);
 +
-+static void rpi_launch_vpu_qpu(HEVCContext *s)
-+{
-+    int k;
-+    int job = s->pass1_job;
-+    int i;
-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
-+#ifdef RPI_LUMA_QPU
-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
++#if RPI_CACHE_UNIF_MVS
++        rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
 +#endif
-+    if (s->sh.slice_type == I_SLICE) {
-+#ifdef RPI_MULTI_MAILBOX
-+      rpi_execute_transform(s);
-+      return;
-+#endif
-+    }
-+    for(k=0;k<8;k++) {
-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
-+    }
-+
-+    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+#ifdef RPI_LUMA_QPU
-+    for(k=0;k<12;k++) {
-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
-+        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++          flush_start, flush_count, s->ps.sps->vshift[1], 0, 1);
 +    }
-+    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+#endif
-+
-+#ifdef RPI_SIMULATE_QPUS
-+    rpi_simulate_inter_qpu(s);
-+    return;
 +#endif
 +
-+#ifdef RPI_MULTI_MAILBOX
-+#ifdef RPI_CACHE_UNIF_MVS
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
-+#else
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
-+#endif
++// We can take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++//    vpu_qpu_job_add_sync_this(vqj, &sync_c);
 +
-+#if 1
++    if (qpu_luma && mc_terminate_y(s, job) != 0)
 +    {
-+        unsigned int i;
++        HEVCRpiJob * const jb = s->jobs + job;
++        const uint32_t code = qpu_fn(mc_setup);
 +        uint32_t * p;
-+        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
-+        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
++        unsigned int i;
 +        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
 +
-+        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
-+            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
-+            *p++ = code;
-+        }
-+
-+        code = qpu_get_fn(QPU_MC_SETUP);
 +        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
-+            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
++            *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm);
 +            *p++ = code;
 +        }
 +
-+        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
-+            vpu_get_constants(),
-+            s->coeffs_buf_vc[job][2],
-+            s->num_coeffs[job][2] >> 8,
-+            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
-+            s->num_coeffs[job][3] >> 10,
-+            0,
-+            // QPU job 1
-+            QPU_N_UV,
-+            mail_uv,
-+            // QPU job 2
-+            QPU_N_Y,
-+            mail_y
-+            );
-+    }
++        vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y);
 +
-+#else
-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+#ifdef RPI_LUMA_QPU
-+                                   qpu_get_fn(QPU_MC_SETUP),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
-+#else
-+                                   0,
-+                                   0,0,0,0,
-+                                   0,0,0,0,
-+                                   0,0,0,0
-+#endif
-+                                 );
-+#endif
-+    for(i=0;i<4;i++)
-+        s->num_coeffs[job][i] = 0;
-+#else
-+#error Code rotted here
-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
-+      );
++#if RPI_CACHE_UNIF_MVS
++        rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
 +#endif
++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++          flush_start, flush_count, s->ps.sps->vshift[1], 1, 0);
++    }
 +
++    pthread_mutex_unlock(&wg->lock);
 +
-+}
-+#else
-+
-+#ifdef RPI
-+static void rpi_launch_vpu_qpu(HEVCContext *s)
-+{
-+  rpi_execute_transform(s);
-+}
 +#endif
 +
-+#endif
++    vpu_qpu_job_add_sync_this(vqj, &sync_y);
 +
-+#ifdef RPI
++    // Having accumulated some commands - do them
++    rpi_cache_flush_finish(rfe);
++    vpu_qpu_job_finish(vqj);
 +
-+#ifndef RPI_FAST_CACHEFLUSH
-+#error RPI_FAST_CACHEFLUSH is broken
-+static void flush_buffer(AVBufferRef *bref) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+    gpu_cache_flush(p);
-+}
++    memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job]));  //???? Surely we haven't done the smaller
++
++#if Y_B_ONLY
++    if (qpu_luma)
++        vpu_qpu_wait(&sync_y);
 +#endif
++    // Perform inter prediction
++    rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0);
 +
-+static void flush_frame(HEVCContext *s,AVFrame *frame)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
-+    int n = s->ps.sps->height;
-+    int curr_y = 0;
-+    int curr_uv = 0;
-+    int n_uv = n >> s->ps.sps->vshift[1];
-+    int sz,base;
-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+    base = s->frame->linesize[1] * curr_uv;
-+    iocache.s[0].handle = p.vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int)(p.arm) + base;
-+    iocache.s[0].size  = sz;
-+    p = get_gpu_mem_ptr_v(s->frame);
-+    iocache.s[1].handle = p.vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int)(p.arm) + base;
-+    iocache.s[1].size  = sz;
-+    p = get_gpu_mem_ptr_y(s->frame);
-+    sz = s->frame->linesize[0] * (n-curr_y);
-+    base = s->frame->linesize[0] * curr_y;
-+    iocache.s[2].handle = p.vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int)(p.arm) + base;
-+    iocache.s[2].size  = sz;
-+    vcsm_clean_invalid( &iocache );
++    // Wait for transform completion
++
++    // Perform intra prediction and residual reconstruction
++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost);
++#if Y_B_ONLY
++    if (!qpu_luma)
++        vpu_qpu_wait(&sync_y);
 +#else
-+    flush_buffer(frame->buf[0]);
-+    flush_buffer(frame->buf[1]);
-+    flush_buffer(frame->buf[2]);
++    vpu_qpu_wait(&sync_y);
 +#endif
++    rpi_execute_pred_cmds(s);
++
++    // Perform deblocking for CTBs in this row
++    rpi_execute_dblk_cmds(s);
++
++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost);
 +}
 +
-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
++static void rpi_do_all_passes(HEVCContext *s)
 +{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    int n;
-+    int curr_y;
-+    int curr_uv;
-+    int n_uv;
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
-+    int sz,base;
-+    int (*d)[2] = s->dblk_cmds[job];
-+    int low=(*d)[1];
-+    int high=(*d)[1];
-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
-+        int y = (*d)[1];
-+        low=FFMIN(low,y);
-+        high=FFMAX(high,y);
-+    }
-+    curr_y = low;
-+    n = high+(1 << s->ps.sps->log2_ctb_size);
-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
-+    n_uv = n >> s->ps.sps->vshift[1];
-+
-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+    base = s->frame->linesize[1] * curr_uv;
-+    iocache.s[0].handle = p.vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int)(p.arm) + base;
-+    iocache.s[0].size  = sz;
-+    p = get_gpu_mem_ptr_v(s->frame);
-+    iocache.s[1].handle = p.vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int)(p.arm) + base;
-+    iocache.s[1].size  = sz;
-+    p = get_gpu_mem_ptr_y(s->frame);
-+    sz = s->frame->linesize[0] * (n-curr_y);
-+    base = s->frame->linesize[0] * curr_y;
-+    iocache.s[2].handle = p.vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int)(p.arm) + base;
-+    iocache.s[2].size  = sz;
-+
-+    iocache.s[3].handle = p0->vcsm_handle;
-+    iocache.s[3].cmd = 3; // clean+invalidate
-+    iocache.s[3].addr = (int) p0->arm;
-+    iocache.s[3].size  = p0->numbytes;
-+    if (p1) {
-+      iocache.s[4].handle = p1->vcsm_handle;
-+      iocache.s[4].cmd = 3; // clean+invalidate
-+      iocache.s[4].addr = (int) p1->arm;
-+      iocache.s[4].size  = p1->numbytes;
-+    }
-+    if (p2) {
-+      iocache.s[5].handle = p2->vcsm_handle;
-+      iocache.s[5].cmd = 3; // clean+invalidate
-+      iocache.s[5].addr = (int) p2->arm;
-+      iocache.s[5].size  = p2->numbytes;
-+    }
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    flush_buffer(frame->buf[0]);
-+    flush_buffer(frame->buf[1]);
-+    flush_buffer(frame->buf[2]);
-+    gpu_cache_flush3(p0, p1, p2);
-+#endif
++    // Do the various passes - common with the worker code
++    worker_core(s);
++    // Prepare next batch
++    rpi_begin(s);
 +}
 +
++
++
 +#endif
 +
  static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
  {
      HEVCContext *s  = avctxt->priv_data;
-@@ -2316,6 +3765,17 @@
+@@ -2316,6 +3877,18 @@
      int y_ctb       = 0;
      int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
  
 +#ifdef RPI
-+    s->enable_rpi = s->ps.sps->bit_depth == 8
-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
++    s->enable_rpi = s->ps.sps->bit_depth == 8 &&
++        s->frame->format == AV_PIX_FMT_SAND128 &&
++        !s->ps.pps->cross_component_prediction_enabled_flag;
 +
 +    if (!s->enable_rpi) {
 +      if (s->ps.pps->cross_component_prediction_enabled_flag)
@@ -4095,7 +5385,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
          av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
          return AVERROR_INVALIDDATA;
-@@ -2329,6 +3789,14 @@
+@@ -2329,6 +3902,14 @@
          }
      }
  
@@ -4110,26 +5400,25 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
          int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
  
-@@ -2344,7 +3812,57 @@
+@@ -2336,6 +3917,7 @@
+         y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
+         hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
+ 
++
+         ff_hevc_cabac_init(s, ctb_addr_ts);
+ 
+         hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
+@@ -2344,7 +3926,52 @@
          s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
          s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
  
-+#ifdef RPI_INTER_QPU
-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
-+#endif
-+#ifdef RPI_LUMA_QPU
-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
++#if RPI_INTER
++        s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV;
++        s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y;
 +#endif
 +
          more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
 +
-+#ifdef RPI_INTER_QPU
-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
-+#endif
-+#ifdef RPI_LUMA_QPU
-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
-+#endif
-+
 +#ifdef RPI
 +        if (s->enable_rpi) {
 +          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
@@ -4139,14 +5428,18 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
 +          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
 +          s->ctu_count++;
-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
 +
 +          if ( s->ctu_count >= s->max_ctu_count ) {
 +#ifdef RPI_WORKER
-+            if (s->used_for_ref) {
++            if (s->used_for_ref)
++            {
++//              printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
++
++//                worker_wait(s);
 +              // Split work load onto separate threads so we make as rapid progress as possible with this frame
 +              // Pass on this job to worker thread
 +              worker_submit_job(s);
++
 +              // Make sure we have space to prepare the next job
 +              worker_pass0_ready(s);
 +
@@ -4168,7 +5461,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
          if (more_data < 0) {
              s->tab_slice_address[ctb_addr_rs] = -1;
              return more_data;
-@@ -2353,9 +3871,29 @@
+@@ -2353,9 +3980,42 @@
  
          ctb_addr_ts++;
          ff_hevc_save_states(s, ctb_addr_ts);
@@ -4193,12 +5486,25 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +        rpi_do_all_passes(s);
 +    }
 +
++#if RPI_TSTATS
++    {
++        HEVCRpiStats *const ts = &s->tstats;
++
++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
++        memset(ts, 0, sizeof(*ts));
++    }
++#endif
++
 +#endif
 +
      if (x_ctb + ctb_size >= s->ps.sps->width &&
          y_ctb + ctb_size >= s->ps.sps->height)
          ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
-@@ -2390,6 +3928,11 @@
+@@ -2390,6 +4050,11 @@
      s = s1->sList[self_id];
      lc = s->HEVClc;
  
@@ -4210,16 +5516,32 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      if(ctb_row) {
          ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
  
-@@ -2770,6 +4313,16 @@
+@@ -2770,6 +4435,32 @@
          if (ret < 0)
              return ret;
  
-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
++        // The definition of _N unit types is "non-reference for other frames
++        // with the same temporal_id" so they may/will be ref frames for pics
++        // with a higher temporal_id.
++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++            !(s->nal_unit_type == NAL_TRAIL_N ||
 +                        s->nal_unit_type == NAL_TSA_N   ||
 +                        s->nal_unit_type == NAL_STSA_N  ||
 +                        s->nal_unit_type == NAL_RADL_N  ||
 +                        s->nal_unit_type == NAL_RASL_N);
 +
++#if DEBUG_DECODE_N
++        {
++            static int z = 0;
++            if (IS_IDR(s)) {
++                z = 1;
++            }
++            if (z != 0 && z++ > DEBUG_DECODE_N) {
++                s->is_decoded = 0;
++                break;
++            }
++        }
++#endif
 +        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
 +            s->is_decoded = 0;
 +            break;
@@ -4227,27 +5549,30 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
          if (s->max_ra == INT_MAX) {
              if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
                  s->max_ra = s->poc;
-@@ -2894,9 +4447,17 @@
+@@ -2893,10 +4584,19 @@
+         }
      }
  
- fail:
+-fail:
 -    if (s->ref && s->threads_type == FF_THREAD_FRAME)
++fail:  // Also success path
 +    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-+#ifdef RPI_INTER_QPU
-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
++#if RPI_INTER
++        rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height);
 +#endif
          ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
 -
-+    } else if (s->ref) {
-+#ifdef RPI_INTER_QPU
++    }
++#if RPI_INTER
++    else if (s->ref && s->enable_rpi) {
 +      // When running single threaded we need to flush the whole frame
 +      flush_frame(s,s->frame);
-+#endif
 +    }
++#endif
      return ret;
  }
  
-@@ -3067,6 +4628,41 @@
+@@ -3067,6 +4767,41 @@
      return AVERROR(ENOMEM);
  }
  
@@ -4289,7 +5614,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
  static av_cold int hevc_decode_free(AVCodecContext *avctx)
  {
      HEVCContext       *s = avctx->priv_data;
-@@ -3078,6 +4674,32 @@
+@@ -3078,6 +4813,29 @@
  
      av_freep(&s->cabac_state);
  
@@ -4300,29 +5625,26 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
 +#endif
 +
 +    for(i=0;i<RPI_MAX_JOBS;i++) {
-+      av_freep(&s->unif_mv_cmds[i]);
-+      av_freep(&s->univ_pred_cmds[i]);
 +
-+#ifdef RPI_INTER_QPU
-+      if (s->unif_mvs[i]) {
-+        gpu_free( &s->unif_mvs_ptr[i] );
-+        s->unif_mvs[i] = 0;
-+      }
-+#endif
-+#ifdef RPI_LUMA_QPU
-+      if (s->y_unif_mvs[i]) {
-+        gpu_free( &s->y_unif_mvs_ptr[i] );
-+        s->y_unif_mvs[i] = 0;
-+      }
++        av_freep(&s->unif_mv_cmds_y[i]);
++        av_freep(&s->unif_mv_cmds_c[i]);
++        av_freep(&s->univ_pred_cmds[i]);
++
++#if RPI_INTER
++        gpu_free(&s->jobs[i].chroma_mvs_gptr);
++        gpu_free(&s->jobs[i].luma_mvs_gptr);
 +#endif
 +    }
 +
++    vpu_qpu_term();
++
++    av_rpi_zc_uninit(avctx);
 +#endif
 +
      for (i = 0; i < 3; i++) {
          av_freep(&s->sao_pixel_buffer_h[i]);
          av_freep(&s->sao_pixel_buffer_v[i]);
-@@ -3119,10 +4741,23 @@
+@@ -3119,10 +4877,25 @@
      return 0;
  }
  
@@ -4342,75 +5664,76 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
  {
      HEVCContext *s = avctx->priv_data;
      int i;
-+    int job;
++#ifdef RPI
++    unsigned int job;
++#endif
  
      s->avctx = avctx;
  
-@@ -3132,6 +4767,78 @@
+@@ -3132,6 +4905,77 @@
      s->HEVClcList[0] = s->HEVClc;
      s->sList[0] = s;
  
 +#ifdef RPI
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+        if (!s->unif_mv_cmds[job])
++    // Whilst FFmpegs init fn is only called once the close fn is called as
++    // many times as we have threads (init_thread_copy is called for the
++    // threads).  So to match init & term put the init here where it will be
++    // called by both init & copy
++    av_rpi_zc_init(avctx);
++
++    if (vpu_qpu_init() != 0)
++        goto fail;
++
++    for(job = 0; job < RPI_MAX_JOBS; job++) {
++        s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y);
++        if (!s->unif_mv_cmds_y[job])
++            goto fail;
++        s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C);
++        if (!s->unif_mv_cmds_c[job])
 +            goto fail;
 +        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
 +        if (!s->univ_pred_cmds[job])
 +            goto fail;
 +    }
 +
-+#ifdef RPI_INTER_QPU
++#if RPI_INTER
 +    // We divide the image into blocks 256 wide and 64 high
 +    // We support up to 2048 widths
 +    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
 +    // Also add space for the startup command for each stream.
 +
-+    {
-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+        uint32_t *p;
-+		for(job=0;job<RPI_MAX_JOBS;job++) {
-+#ifdef RPI_CACHE_UNIF_MVS
-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
++    for (job = 0; job < RPI_MAX_JOBS; job++) {
++        HEVCRpiJob * const jb = s->jobs + job;
++#if RPI_CACHE_UNIF_MVS
++        gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr);
++        gpu_malloc_cached(QPU_N_Y  * Y_COMMANDS_PER_QPU  * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr);
 +#else
-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
++        gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr);
++        gpu_malloc_uncached(QPU_N_Y  * Y_COMMANDS_PER_QPU  * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr);
 +#endif
-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
 +
-+          // Set up initial locations for uniform streams
-+          p = s->unif_mvs[job];
-+          for(i = 0; i < 8; i++) {
-+            s->mvs_base[job][i] = p;
-+            p += uv_commands_per_qpu;
-+          }
++        {
++            qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm;
++            for(i = 0; i < QPU_N_UV; i++) {
++                jb->chroma_mvs[i].qpu_mc_base = p;
++                jb->chroma_mvs[i].qpu_mc_curr = p;
++                p += UV_COMMANDS_PER_QPU;
++            }
 +        }
-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-+    }
-+
-+#endif
-+#ifdef RPI_LUMA_QPU
-+    for(job=0;job<RPI_MAX_JOBS;job++)
-+    {
-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-+        uint32_t *p;
-+#ifdef RPI_CACHE_UNIF_MVS
-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+#else
-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+#endif
-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
-+
-+        // Set up initial locations for uniform streams
-+        p = s->y_unif_mvs[job];
-+        for(i = 0; i < 12; i++) {
-+            s->y_mvs_base[job][i] = p;
-+            p += y_commands_per_qpu;
++        {
++            qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm;
++            for(i = 0; i < QPU_N_Y; i++) {
++                jb->luma_mvs[i].qpu_mc_base = p;
++                jb->luma_mvs[i].qpu_mc_curr = p;
++                p += Y_COMMANDS_PER_QPU;
++            }
 +        }
 +    }
-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
++    s->qpu_filter_uv = qpu_fn(mc_filter_uv);
++    s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0);
++    s->qpu_dummy_frame = qpu_fn(mc_setup_c);  // Use our code as a dummy frame
++    s->qpu_filter = qpu_fn(mc_filter);
++    s->qpu_filter_b = qpu_fn(mc_filter_b);
 +#endif
 +    //gpu_malloc_uncached(2048*64,&s->dummy);
 +
@@ -4425,10 +5748,32 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.c ffmpeg-3.2.4.patch/libavcodec/hevc.c
      s->cabac_state = av_malloc(HEVC_CONTEXTS);
      if (!s->cabac_state)
          goto fail;
+@@ -3346,9 +5190,9 @@
+     }
+ 
+     if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+-            s->threads_type = FF_THREAD_FRAME;
+-        else
+-            s->threads_type = FF_THREAD_SLICE;
++        s->threads_type = FF_THREAD_FRAME;
++    else
++        s->threads_type = FF_THREAD_SLICE;
+ 
+     return 0;
+ }
+@@ -3407,6 +5251,8 @@
+     .update_thread_context = hevc_update_thread_context,
+     .init_thread_copy      = hevc_init_thread_copy,
+     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++//                             0,
++//                             AV_CODEC_CAP_FRAME_THREADS,
+                              AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
+     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
+ };
 diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/hevc_cabac.c
 --- ffmpeg-3.2.4/libavcodec/hevc_cabac.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/hevc_cabac.c	2017-03-22 22:42:34.838798549 +0100
-@@ -21,14 +21,72 @@
++++ ffmpeg-3.2.4.patch/libavcodec/hevc_cabac.c	2017-05-28 20:42:45.738088666 +0200
+@@ -21,14 +21,76 @@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
@@ -4441,6 +5786,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
  #include "hevc.h"
 +#include "cabac_functions.h"
 +
++#ifdef RPI
++#include "rpi_zc.h"
++#endif
++
 +// BY22 is probably faster than simple bypass if the processor has
 +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
 +// x86 has fast int divide
@@ -4502,7 +5851,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
  /**
   * number of bin by SyntaxElement.
   */
-@@ -445,6 +503,211 @@
+@@ -445,6 +507,211 @@
      { 28, 36, 43, 49, 54, 58, 61, 63, },
  };
  
@@ -4714,7 +6063,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
  void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
  {
      if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-@@ -863,19 +1126,19 @@
+@@ -863,19 +1130,19 @@
      return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
  }
  
@@ -4740,7 +6089,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
  }
  
  int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
-@@ -891,14 +1154,14 @@
+@@ -891,14 +1158,14 @@
      return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
  }
  
@@ -4757,7 +6106,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
          ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
          ctx_shift = (log2_size + 1) >> 2;
      } else {
-@@ -929,22 +1192,16 @@
+@@ -929,22 +1196,16 @@
      return value;
  }
  
@@ -4783,7 +6132,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
  {
      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
  }
-@@ -966,90 +1223,366 @@
+@@ -966,90 +1227,378 @@
      return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
  }
  
@@ -4965,8 +6314,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
 +    }
 +
 +    return rv;
- }
- 
++}
++
 +// extended_precision_processing_flag must be false given we are
 +// putting the result into a 16-bit array
 +// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
@@ -5079,7 +6428,46 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
 +    return i;
 +}
 +
-+
++#ifdef RPI
++static void rpi_add_residual(HEVCContext * const s,
++    const unsigned int log2_trafo_size, const unsigned int c_idx,
++    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
++{
++    const AVFrame * const frame = s->frame;
++    unsigned int stride = frame->linesize[c_idx];
++    unsigned int x = x0 >> s->ps.sps->hshift[c_idx];
++    unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
++    const int is_sliced = rpi_sliced_frame(frame);
++    uint8_t * dst = !is_sliced ?
++            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++        c_idx == 0 ?
++            rpi_sliced_frame_pos_y(frame, x, y) :
++            rpi_sliced_frame_pos_c(frame, x, y);
++
++//    if (c_idx != 0) {
++//        return;
++//    }
++    if (s->enable_rpi) {
++        HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
++        cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
++        cmd->size = log2_trafo_size;
++        cmd->c_idx = c_idx;
++        cmd->ta.buf = coeffs;
++        cmd->ta.dst = dst;
++        cmd->ta.stride = stride;
++    }
++    else if (!is_sliced || c_idx == 0) {
++        s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
++    }
++    else if (c_idx == 1) {
++        s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
++    }
++    else {
++        s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
++    }
+ }
++#endif
+ 
  void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                                  int log2_trafo_size, enum ScanType scan_idx,
                                  int c_idx)
@@ -5108,17 +6496,20 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
 +    const uint8_t *scan_x_cg, *scan_y_cg;
 +    const xy_off_t * scan_xy_off;
  
++#ifndef RPI
      ptrdiff_t stride = s->frame->linesize[c_idx];
      int hshift = s->ps.sps->hshift[c_idx];
      int vshift = s->ps.sps->vshift[c_idx];
-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+-    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
++    uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
                                            ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+-    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
++#endif
 +#ifdef RPI
-+    //***** transform_skip_flag decoded later!
-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
++    int use_vpu;
 +#endif
-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
--    uint8_t significant_coeff_group_flag[8][8] = {{0}};
++    int16_t *coeffs;
 +    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
      int explicit_rdpcm_flag = 0;
      int explicit_rdpcm_dir_flag;
@@ -5133,38 +6524,11 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
      int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
                                           lc->tu.intra_pred_mode_c;
  
+-    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
 +    int prev_sig = 0;
 +    const int c_idx_nz = (c_idx != 0);
 +
 +    int may_hide_sign;
-+
-+#ifdef RPI
-+    if (s->enable_rpi) {
-+        int n = trafo_size * trafo_size;
-+        if (use_vpu) {
-+            // We support size 4 and size 5.
-+            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
-+            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
-+            // num_coeffs is indexed by log2_trafo_size-2
-+            if (log2_trafo_size == 4)
-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-+            else
-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-+        } else {
-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-+            s->num_coeffs[s->pass0_job][0] += n;
-+        }
-+    }
-+    // We now do the memset after transform_add while we know the data is cached.
-+    #ifdef RPI_PRECLEAR
-+    #else
-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+    #endif
-+#else
-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+#endif
-+
 +
  
      // Derive QP for dequant
@@ -5174,7 +6538,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
          static const uint8_t rem6[51 + 4 * 6 + 1] = {
              0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
              3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-@@ -1065,9 +1598,19 @@
+@@ -1065,9 +1614,19 @@
          };
          int qp_y = lc->qp_y;
  
@@ -5195,7 +6559,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
          }
  
          if (c_idx == 0) {
-@@ -1100,39 +1643,73 @@
+@@ -1100,39 +1659,76 @@
              qp += s->ps.sps->qp_bd_offset;
          }
  
@@ -5266,6 +6630,9 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
 +        may_hide_sign = 0;
      }
  
++
++
++
      if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
 -        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
 -        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
@@ -5283,7 +6650,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
                                             &last_significant_coeff_x, &last_significant_coeff_y);
  
      if (last_significant_coeff_x > 3) {
-@@ -1160,119 +1737,113 @@
+@@ -1160,119 +1756,134 @@
          int last_x_c = last_significant_coeff_x & 3;
          int last_y_c = last_significant_coeff_y & 3;
  
@@ -5368,13 +6735,33 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
 -            significant_coeff_group_flag[x_cg][y_cg] =
 -            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
 -             (x_cg == 0 && y_cg == 0));
--        }
++    {
++        const unsigned int ccount = 1 << (log2_trafo_size * 2);
++#ifdef RPI
++        use_vpu = 0;
++        if (s->enable_rpi) {
++            use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4;
++            coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
++#if HAVE_NEON
++            rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
++#else
++            memset(coeffs, 0, ccount * sizeof(int16_t));
++#endif
++        }
++        else
++#endif
++        {
++            coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
++            memset(coeffs, 0, ccount * sizeof(int16_t));
+         }
++    }
+ 
+-        last_scan_pos = num_coeff - offset - 1;
 +    i = num_last_subset;
 +    do {
 +        int implicit_non_zero_coeff = 0;
 +        int n_end;
- 
--        last_scan_pos = num_coeff - offset - 1;
++
 +        uint8_t significant_coeff_flag_idx[16];
 +        unsigned int nb_significant_coeff_flag = 0;
  
@@ -5461,7 +6848,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
                          if (log2_trafo_size == 3) {
                              scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
                          } else {
-@@ -1286,34 +1857,30 @@
+@@ -1286,34 +1897,30 @@
                      }
                  }
              }
@@ -5510,7 +6897,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
                      significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
                      nb_significant_coeff_flag++;
                  }
-@@ -1323,141 +1890,185 @@
+@@ -1323,141 +1930,185 @@
              }
          }
  
@@ -5618,7 +7005,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
                  }
 -            }
 -            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
- 
+-
 -            if (lc->cu.cu_transquant_bypass_flag ||
 -                (lc->cu.pred_mode ==  MODE_INTRA  &&
 -                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
@@ -5627,7 +7014,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
 -                sign_hidden = 0;
 -            else
 -                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
--
+ 
 -            if (first_greater1_coeff_idx != -1) {
 -                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
 -            }
@@ -5816,7 +7203,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
  
      if (lc->cu.cu_transquant_bypass_flag) {
          if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-@@ -1467,7 +2078,7 @@
+@@ -1467,7 +2118,7 @@
              s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
          }
      } else {
@@ -5825,7 +7212,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
              int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
                        log2_trafo_size == 2 &&
                        lc->cu.pred_mode == MODE_INTRA;
-@@ -1475,7 +2086,6 @@
+@@ -1475,7 +2126,6 @@
                  for (i = 0; i < 8; i++)
                      FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
              }
@@ -5833,7 +7220,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
              s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
  
              if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-@@ -1486,8 +2096,26 @@
+@@ -1486,8 +2136,26 @@
                  s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
              }
          } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
@@ -5861,7 +7248,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
              if (max_xy == 0)
                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-@@ -1501,6 +2129,7 @@
+@@ -1501,6 +2169,7 @@
                      col_limit = FFMIN(24, col_limit);
                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
              }
@@ -5869,27 +7256,21 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_cabac.c ffmpeg-3.2.4.patch/libavcodec/he
          }
      }
      if (lc->tu.cross_pf) {
-@@ -1510,6 +2139,17 @@
+@@ -1510,7 +2179,11 @@
              coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
          }
      }
 +#ifdef RPI
-+    if (s->enable_rpi) {
-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-+        cmd->type = RPI_PRED_TRANSFORM_ADD;
-+        cmd->size = log2_trafo_size;
-+        cmd->buf = coeffs;
-+        cmd->dst = dst;
-+        cmd->stride = stride;
-+        return;
-+    }
-+#endif
++    rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs);
++#else
      s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
++#endif
  }
  
+ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
 diff -Naur ffmpeg-3.2.4/libavcodec/hevcdsp.c ffmpeg-3.2.4.patch/libavcodec/hevcdsp.c
 --- ffmpeg-3.2.4/libavcodec/hevcdsp.c	2016-06-27 01:54:29.000000000 +0200
-+++ ffmpeg-3.2.4.patch/libavcodec/hevcdsp.c	2017-03-22 22:42:34.841798557 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/hevcdsp.c	2017-05-28 20:42:45.742088680 +0200
 @@ -123,6 +123,120 @@
  #include "hevcdsp_template.c"
  #undef BIT_DEPTH
@@ -6011,7 +7392,68 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevcdsp.c ffmpeg-3.2.4.patch/libavcodec/hevcd
  void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
  {
  #undef FUNC
-@@ -257,6 +371,8 @@
+@@ -193,6 +307,16 @@
+     PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
+     PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
+ 
++#ifndef RPI
++#define SLICED_LOOP_FILTERS(depth)
++#else
++#define SLICED_LOOP_FILTERS(depth)\
++    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);    \
++    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
++#endif
++
++
+ #define HEVC_DSP(depth)                                                     \
+     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
+     hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
+@@ -200,6 +324,15 @@
+     hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
+     hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
+     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
++    hevcdsp->put_pcm_c              = FUNC(put_pcm_c, depth);                 \
++    hevcdsp->add_residual_u[0]      = FUNC(add_residual4x4_u, depth);         \
++    hevcdsp->add_residual_u[1]      = FUNC(add_residual8x8_u, depth);         \
++    hevcdsp->add_residual_u[2]      = FUNC(add_residual16x16_u, depth);       \
++    hevcdsp->add_residual_u[3]      = FUNC(add_residual32x32_u, depth);       \
++    hevcdsp->add_residual_v[0]      = FUNC(add_residual4x4_v, depth);         \
++    hevcdsp->add_residual_v[1]      = FUNC(add_residual8x8_v, depth);         \
++    hevcdsp->add_residual_v[2]      = FUNC(add_residual16x16_v, depth);       \
++    hevcdsp->add_residual_v[3]      = FUNC(add_residual32x32_v, depth);       \
+     hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
+     hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+     hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
+@@ -225,6 +358,19 @@
+     hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
+     hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
+                                                                                \
++    hevcdsp->sao_band_filter_c[0] =                                            \
++    hevcdsp->sao_band_filter_c[1] =                                            \
++    hevcdsp->sao_band_filter_c[2] =                                            \
++    hevcdsp->sao_band_filter_c[3] =                                            \
++    hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth);            \
++    hevcdsp->sao_edge_filter_c[0] =                                            \
++    hevcdsp->sao_edge_filter_c[1] =                                            \
++    hevcdsp->sao_edge_filter_c[2] =                                            \
++    hevcdsp->sao_edge_filter_c[3] =                                            \
++    hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth);            \
++    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth);        \
++    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth);        \
++                                                                               \
+     QPEL_FUNCS(depth);                                                         \
+     QPEL_UNI_FUNCS(depth);                                                     \
+     QPEL_BI_FUNCS(depth);                                                      \
+@@ -232,6 +378,7 @@
+     EPEL_UNI_FUNCS(depth);                                                     \
+     EPEL_BI_FUNCS(depth);                                                      \
+                                                                                \
++    SLICED_LOOP_FILTERS(depth);                                                \
+     hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
+     hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
+     hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
+@@ -257,6 +404,8 @@
          break;
      }
  
@@ -6022,8 +7464,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevcdsp.c ffmpeg-3.2.4.patch/libavcodec/hevcd
      if (ARCH_ARM)
 diff -Naur ffmpeg-3.2.4/libavcodec/hevcdsp.h ffmpeg-3.2.4.patch/libavcodec/hevcdsp.h
 --- ffmpeg-3.2.4/libavcodec/hevcdsp.h	2016-06-27 01:54:29.000000000 +0200
-+++ ffmpeg-3.2.4.patch/libavcodec/hevcdsp.h	2017-03-22 22:42:34.841798557 +0100
-@@ -42,6 +42,17 @@
++++ ffmpeg-3.2.4.patch/libavcodec/hevcdsp.h	2017-05-28 20:42:45.743088684 +0200
+@@ -42,11 +42,26 @@
      uint8_t type_idx[3];    ///< sao_type_idx
  } SAOParams;
  
@@ -6041,56 +7483,1030 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevcdsp.h ffmpeg-3.2.4.patch/libavcodec/hevcd
  typedef struct HEVCDSPContext {
      void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
                      struct GetBitContext *gb, int pcm_bit_depth);
-@@ -120,6 +131,9 @@
++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                    struct GetBitContext *gb, int pcm_bit_depth);
+ 
+-    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
++    void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++    void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++    void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+ 
+     void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
+ 
+@@ -60,14 +75,23 @@
+ 
+     void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++    void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                               const int16_t *sao_offset_val_u, int sao_left_class_u,
++                               const int16_t *sao_offset_val_v, int sao_left_class_v,
++                               int width, int height);
+ 
+     /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+     void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+                                int16_t *sao_offset_val, int sao_eo_class, int width, int height);
++    void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
+ 
+     void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                                 struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+                                 uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+ 
+     void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                     int height, intptr_t mx, intptr_t my, int width);
+@@ -120,6 +144,22 @@
      void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
                                          int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q);
++#ifdef RPI
++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++                                 unsigned int _stride, unsigned int beta, const int32_t tc[2],
++                                 const uint8_t no_p[2], const uint8_t no_q[2],
++                                 uint8_t * _pix_l);
++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
++                                 unsigned int no_f);
++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                                 uint8_t * src_l,
++                                 unsigned int no_f);
++
++#endif
++
 +    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
 +                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
 +                                               MvField *curr, MvField *neigh, uint8_t *bs);
  } HEVCDSPContext;
  
  void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
-diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/hevc_filter.c
---- ffmpeg-3.2.4/libavcodec/hevc_filter.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/hevc_filter.c	2017-03-22 22:42:34.840798554 +0100
-@@ -22,6 +22,12 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+//#define DISABLE_SAO
-+//#define DISABLE_DEBLOCK
-+//#define DISABLE_STRENGTHS
-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
-+//#define DISABLE_DEBLOCK_NONREF
-+
- #include "libavutil/common.h"
- #include "libavutil/internal.h"
- 
-@@ -30,6 +36,11 @@
- 
+diff -Naur ffmpeg-3.2.4/libavcodec/hevcdsp_template.c ffmpeg-3.2.4.patch/libavcodec/hevcdsp_template.c
+--- ffmpeg-3.2.4/libavcodec/hevcdsp_template.c	2016-06-27 01:54:29.000000000 +0200
++++ ffmpeg-3.2.4.patch/libavcodec/hevcdsp_template.c	2017-05-28 20:42:45.744088687 +0200
+@@ -26,6 +26,9 @@
  #include "bit_depth_template.c"
+ #include "hevcdsp.h"
  
 +#ifdef RPI
-+#include "rpi_user_vcsm.h"
-+#include "rpi_qpu.h"
++#include "rpi_zc.h"
 +#endif
-+
- #define LUMA 0
- #define CB 1
- #define CR 2
-@@ -272,6 +283,10 @@
-     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
+ 
+ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+                           GetBitContext *gb, int pcm_bit_depth)
+@@ -42,6 +45,29 @@
+     }
+ }
+ 
++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                          GetBitContext *gb, int pcm_bit_depth)
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
++        dst += stride;
++    }
++
++    dst = (pixel *)_dst + 1;
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
++        dst += stride;
++    }
++}
++
++
+ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
+                                                      ptrdiff_t stride, int size)
+ {
+@@ -59,6 +85,23 @@
+     }
+ }
+ 
++static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res,
++                                                ptrdiff_t stride, int size)
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < size; y++) {
++        for (x = 0; x < size * 2; x += 2) {
++            dst[x] = av_clip_pixel(dst[x] + *res);
++            res++;
++        }
++        dst += stride;
++    }
++}
++
+ static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride)
+ {
+@@ -83,6 +126,58 @@
+     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+ }
+ 
++// -- U -- (plaited)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst, res, stride, 32);
++}
++
++// -- V -- (plaited)
++
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst + 1, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst + 1, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst + 1, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_uv)(_dst + 1, res, stride, 32);
++}
++
+ 
+ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
+ {
+@@ -367,7 +462,6 @@
+     int x, y;
+     pixel *dst = (pixel *)_dst;
+     pixel *src = (pixel *)_src;
+-    int16_t *sao_offset_val = sao->offset_val[c_idx];
+     int sao_eo_class    = sao->eo_class[c_idx];
+     int init_x = 0, width = _width, height = _height;
+ 
+@@ -376,33 +470,29 @@
+ 
+     if (sao_eo_class != SAO_EO_VERT) {
+         if (borders[0]) {
+-            int offset_val = sao_offset_val[0];
+             for (y = 0; y < height; y++) {
+-                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
++                dst[y * stride_dst] = src[y * stride_src];
+             }
+             init_x = 1;
+         }
+         if (borders[2]) {
+-            int offset_val = sao_offset_val[0];
+             int offset     = width - 1;
+             for (x = 0; x < height; x++) {
+-                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
+             }
+             width--;
+         }
+     }
+     if (sao_eo_class != SAO_EO_HORIZ) {
+         if (borders[1]) {
+-            int offset_val = sao_offset_val[0];
+             for (x = init_x; x < width; x++)
+-                dst[x] = av_clip_pixel(src[x] + offset_val);
++                dst[x] = src[x];
+         }
+         if (borders[3]) {
+-            int offset_val   = sao_offset_val[0];
+-            int y_stride_dst = stride_dst * (height - 1);
+-            int y_stride_src = stride_src * (height - 1);
++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++            ptrdiff_t y_stride_src = stride_src * (height - 1);
+             for (x = init_x; x < width; x++)
+-                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
++                dst[x + y_stride_dst] = src[x + y_stride_src];
+             height--;
+         }
+     }
+@@ -417,7 +507,6 @@
+     int x, y;
+     pixel *dst = (pixel *)_dst;
+     pixel *src = (pixel *)_src;
+-    int16_t *sao_offset_val = sao->offset_val[c_idx];
+     int sao_eo_class    = sao->eo_class[c_idx];
+     int init_x = 0, init_y = 0, width = _width, height = _height;
+ 
+@@ -426,34 +515,30 @@
+ 
+     if (sao_eo_class != SAO_EO_VERT) {
+         if (borders[0]) {
+-            int offset_val = sao_offset_val[0];
+             for (y = 0; y < height; y++) {
+-                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
++                dst[y * stride_dst] = src[y * stride_src];
+             }
+             init_x = 1;
+         }
+         if (borders[2]) {
+-            int offset_val = sao_offset_val[0];
+             int offset     = width - 1;
+             for (x = 0; x < height; x++) {
+-                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
+             }
+             width--;
+         }
+     }
+     if (sao_eo_class != SAO_EO_HORIZ) {
+         if (borders[1]) {
+-            int offset_val = sao_offset_val[0];
+             for (x = init_x; x < width; x++)
+-                dst[x] = av_clip_pixel(src[x] + offset_val);
++                dst[x] = src[x];
+             init_y = 1;
+         }
+         if (borders[3]) {
+-            int offset_val   = sao_offset_val[0];
+-            int y_stride_dst = stride_dst * (height - 1);
+-            int y_stride_src = stride_src * (height - 1);
++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++            ptrdiff_t y_stride_src = stride_src * (height - 1);
+             for (x = init_x; x < width; x++)
+-                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
++                dst[x + y_stride_dst] = src[x + y_stride_src];
+             height--;
+         }
+     }
+@@ -494,6 +579,127 @@
+     }
+ }
+ 
++
++// --- Plaited chroma versions
++
++#if BIT_DEPTH != 8
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height)
++{
++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
++    abort();                                                                        \
++}
++#else
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height)
++{
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int offset_table_u[32] = { 0 };
++    int offset_table_v[32] = { 0 };
++    int k, y, x;
++    int shift  = BIT_DEPTH - 5;
++
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++    width *= 2;
++
++    for (k = 0; k < 4; k++)
++    {
++        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
++        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
++    }
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x += 2)
++        {
++            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]);
++            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]);
++        }
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++#endif
++
++#if BIT_DEPTH != 8
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++                                  int eo, int width, int height) {
++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
++    abort();                                                                        \
++}
++#else
++
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++                                  int eo, int width, int height) {
++
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int a_stride, b_stride;
++    int x, y;
++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
++    stride_dst /= sizeof(pixel);
++    width *= 2;
++
++    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
++    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x += 2) {
++            int diff0u = CMP(src[x], src[x + a_stride]);
++            int diff1u = CMP(src[x], src[x + b_stride]);
++            int offset_valu        = edge_idx[2 + diff0u + diff1u];
++            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
++            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
++            int offset_valv        = edge_idx[2 + diff0v + diff1v];
++            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
++            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
++        }
++        src += stride_src;
++        dst += stride_dst;
++    }
++}
++#endif
++
++#if BIT_DEPTH != 8
++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src,
++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
++                                    int *borders, int _width, int _height,
++                                    int c_idx, uint8_t *vert_edge,
++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
++    abort();                                                                        \
++}
++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src,
++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
++                                    int *borders, int _width, int _height,
++                                    int c_idx, uint8_t *vert_edge,
++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
++    abort();                                                                        \
++}
++#else
++// Any old 2 byte 'normal' restore will work for these
++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10
++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10
++#endif
++
++
+ #undef CMP
+ 
+ ////////////////////////////////////////////////////////////////////////////////
+@@ -1694,3 +1900,217 @@
+ #undef TQ1
+ #undef TQ2
+ #undef TQ3
++
++#ifdef RPI
++
++// line zero
++#define P3 pix_l[0 * xstride]
++#define P2 pix_l[1 * xstride]
++#define P1 pix_l[2 * xstride]
++#define P0 pix_l[3 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++#define Q2 pix_r[2 * xstride]
++#define Q3 pix_r[3 * xstride]
++
++// line three. used only for deblocking decision
++#define TP3 pix_l[0 * xstride + 3 * ystride]
++#define TP2 pix_l[1 * xstride + 3 * ystride]
++#define TP1 pix_l[2 * xstride + 3 * ystride]
++#define TP0 pix_l[3 * xstride + 3 * ystride]
++#define TQ0 pix_r[0 * xstride + 3 * ystride]
++#define TQ1 pix_r[1 * xstride + 3 * ystride]
++#define TQ2 pix_r[2 * xstride + 3 * ystride]
++#define TQ3 pix_r[3 * xstride + 3 * ystride]
++
++// Luma deblock across a vertical edge. Identical to hevc_loop_filter_luma except that
++// the P side is addressed through _pix_l and the Q side through _pix_r.
++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++                                 unsigned int _stride, unsigned int beta, const int32_t _tc[2],
++                                 const uint8_t _no_p[2], const uint8_t _no_q[2],
++                                 uint8_t * _pix_l)
++{
++    int d, j;
++    pixel *pix_l        = (pixel *)_pix_l;
++    pixel *pix_r        = (pixel *)_pix_r;
++    const ptrdiff_t xstride = 1; // samples across the edge are adjacent in memory
++    const ptrdiff_t ystride = _stride / sizeof(pixel); // _stride is in bytes; rows are stepped in pixels
++
++    beta <<= BIT_DEPTH - 8; // scale the threshold from 8-bit units to BIT_DEPTH
++
++    for (j = 0; j < 2; j++) { // two segments of 4 rows, each with its own tc / no_p / no_q
++        const int dp0  = abs(P2  - 2 * P1  + P0); // |second difference| on the P side
++        const int dq0  = abs(Q2  - 2 * Q1  + Q0); // |second difference| on the Q side
++        const int dp3  = abs(TP2 - 2 * TP1 + TP0); // same measure 3 rows down (TP*/TQ* macros)
++        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
++        const int d0   = dp0 + dq0;
++        const int d3   = dp3 + dq3;
++        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
++        const int no_p = _no_p[j];
++        const int no_q = _no_q[j];
++
++        if (d0 + d3 >= beta) { // activity at or above beta: leave this segment unfiltered
++            pix_l += 4 * ystride;
++            pix_r += 4 * ystride;
++            continue;
++        } else {
++            const int beta_3 = beta >> 3;
++            const int beta_2 = beta >> 2;
++            const int tc25   = ((tc * 5 + 1) >> 1); // tc * 2.5, rounded
++
++            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
++                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
++                // strong filtering
++                const int tc2 = tc << 1; // each sample may move by at most +/- 2 * tc
++                for (d = 0; d < 4; d++) {
++                    const int p3 = P3; // snapshot all 8 samples before any of them is overwritten
++                    const int p2 = P2;
++                    const int p1 = P1;
++                    const int p0 = P0;
++                    const int q0 = Q0;
++                    const int q1 = Q1;
++                    const int q2 = Q2;
++                    const int q3 = Q3;
++                    if (!no_p) { // a non-zero no_p leaves the P side untouched
++                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++                    }
++                    if (!no_q) { // a non-zero no_q leaves the Q side untouched
++                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++                    }
++                    pix_l += ystride;
++                    pix_r += ystride;
++                }
++            } else { // normal filtering
++                int nd_p = 1; // 2 means P1 is adjusted as well as P0
++                int nd_q = 1; // 2 means Q1 is adjusted as well as Q0
++                const int tc_2 = tc >> 1;
++                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++                    nd_p = 2;
++                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++                    nd_q = 2;
++
++                for (d = 0; d < 4; d++) {
++                    const int p2 = P2;
++                    const int p1 = P1;
++                    const int p0 = P0;
++                    const int q0 = Q0;
++                    const int q1 = Q1;
++                    const int q2 = Q2;
++                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++                    if (abs(delta0) < 10 * tc) { // rows with a larger step are left unmodified
++                        delta0 = av_clip(delta0, -tc, tc);
++                        if (!no_p)
++                            P0 = av_clip_pixel(p0 + delta0);
++                        if (!no_q)
++                            Q0 = av_clip_pixel(q0 - delta0);
++                        if (!no_p && nd_p > 1) {
++                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++                            P1 = av_clip_pixel(p1 + deltap1);
++                        }
++                        if (!no_q && nd_q > 1) {
++                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++                            Q1 = av_clip_pixel(q1 + deltaq1);
++                        }
++                    }
++                    pix_l += ystride;
++                    pix_r += ystride;
++                }
++            }
++        }
++    }
++}
++
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#define P1 pix_l[0 * xstride]
++#define P0 pix_l[1 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++// Chroma deblock with P samples read via _pix_l and Q samples via _pix_r; only P0 and Q0 are modified.
++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
++                                          ptrdiff_t _ystride, const int32_t *_tc,
++                                          const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
++{
++    int d, j, no_p, no_q;
++    pixel *pix_l        = (pixel *)_pix_l;
++    pixel *pix_r        = (pixel *)_pix_r;
++    ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides converted to pixel units
++    ptrdiff_t ystride = _ystride / sizeof(pixel);
++
++    for (j = 0; j < 2; j++) { // two segments of 4 lines; _tc, _no_p and _no_q are indexed per segment
++        const int tc = _tc[j] << (BIT_DEPTH - 8);
++        if (tc <= 0) { // nothing to do for this segment: step over its 4 lines
++            pix_l += 4 * ystride;
++            pix_r += 4 * ystride;
++            continue;
++        }
++        no_p = _no_p[j];
++        no_q = _no_q[j];
++
++        for (d = 0; d < 4; d++) {
++            int delta0;
++            const int p1 = P1; // P1/P0 come from pix_l, Q0/Q1 from pix_r (see the macros above)
++            const int p0 = P0;
++            const int q0 = Q0;
++            const int q1 = Q1;
++            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++            if (!no_p) // a non-zero no_p leaves the P side untouched
++                P0 = av_clip_pixel(p0 + delta0);
++            if (!no_q) // a non-zero no_q leaves the Q side untouched
++                Q0 = av_clip_pixel(q0 - delta0);
++            pix_l += ystride;
++            pix_r += ystride;
++        }
++    }
++}
++// Unpacks tc4 / no_f and runs hevc_loop_filter_chroma twice, 2-pixel step along the edge (presumably interleaved U then V - confirm).
++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
++                                 unsigned int no_f)
++{
++    uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1 of no_f; tested only for non-zero
++    uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3 of no_f
++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four packed 8-bit tc values, low byte first
++    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); // first component: tc[0], tc[1]
++    FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); // one pixel further on: tc[2], tc[3]
++}
++// Unpacks tc4 / no_f and runs hevc_loop_filter_uv2 twice with a 2-pixel step across the edge (presumably interleaved U then V - confirm).
++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                                 uint8_t * src_l,
++                                 unsigned int no_f)
++{
++    uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1 of no_f: non-zero leaves the P side of segment 0 / 1 unmodified
++    uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3 of no_f: same for the Q side
++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four packed 8-bit tc values, low byte first
++    FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); // first component: tc[0], tc[1]
++    FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); // one pixel further on: tc[2], tc[3]
++}
++
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++
++
++#endif
++
+diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/hevc_filter.c
+--- ffmpeg-3.2.4/libavcodec/hevc_filter.c	2017-02-10 14:25:26.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/hevc_filter.c	2017-05-28 20:42:45.741088677 +0200
+@@ -22,6 +22,12 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++//#define DISABLE_SAO
++//#define DISABLE_DEBLOCK
++//#define DISABLE_STRENGTHS
++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
++//#define DISABLE_DEBLOCK_NONREF
++
+ #include "libavutil/common.h"
+ #include "libavutil/internal.h"
+ 
+@@ -30,6 +36,11 @@
+ 
+ #include "bit_depth_template.c"
+ 
++#ifdef RPI
++#include "rpi_qpu.h"
++#include "rpi_zc.h"
++#endif
++
+ #define LUMA 0
+ #define CB 1
+ #define CR 2
+@@ -138,6 +149,15 @@
+     return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
+ }
+ 
++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx)
++{ // Returns the shift that callers apply to sample positions/widths in plane c_idx to get byte offsets.
++#ifdef RPI // planes other than 0 of a sliced frame always use shift 1 (presumably interleaved U/V - confirm)
++    return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift;
++#else
++    return s->ps.sps->pixel_shift;
++#endif
++}
++
+ static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
+                      intptr_t stride_dst, intptr_t stride_src)
+ {
+@@ -192,7 +212,7 @@
+                            int stride_src, int x, int y, int width, int height,
+                            int c_idx, int x_ctb, int y_ctb)
+ {
+-    int sh = s->ps.sps->pixel_shift;
++    const unsigned int sh = pixel_shift(s, c_idx);
+     int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+     int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
+ 
+@@ -223,13 +243,14 @@
+         int y_min        = ((y0         ) >> s->ps.sps->log2_min_pu_size);
+         int x_max        = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
+         int y_max        = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
+-        int len          = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
++        const unsigned int sh = pixel_shift(s, c_idx);
++        int len          = (min_pu_size >> hshift) << sh;
+         for (y = y_min; y < y_max; y++) {
+             for (x = x_min; x < x_max; x++) {
+                 if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
+                     int n;
+-                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+-                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
++                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
++                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
+                     for (n = 0; n < (min_pu_size >> vshift); n++) {
+                         memcpy(src, dst, len);
+                         src += stride_src;
+@@ -245,7 +266,7 @@
+ 
+ static void sao_filter_CTB(HEVCContext *s, int x, int y)
+ {
+-    static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
++    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
+     HEVCLocalContext *lc = s->HEVClc;
+     int c_idx;
+     int edges[4];  // 0 left 1 top 2 right 3 bottom
+@@ -266,12 +287,22 @@
+     uint8_t right_tile_edge  = 0;
+     uint8_t up_tile_edge     = 0;
+     uint8_t bottom_tile_edge = 0;
++#ifdef RPI
++    const int sliced = rpi_sliced_frame(s->frame);
++    const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1);
++#else
++    const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1);
++#endif
+ 
+     edges[0]   = x_ctb == 0;
+     edges[1]   = y_ctb == 0;
+     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
      edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
  
 +#ifdef DISABLE_SAO
 +    return;
 +#endif
-+
-     if (restore) {
-         if (!edges[0]) {
-             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-@@ -495,6 +510,15 @@
++
+     if (restore) {
+         if (!edges[0]) {
+             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+@@ -303,7 +334,7 @@
+         }
+     }
+ 
+-    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
++    for (c_idx = 0; c_idx < plane_count; c_idx++) {
+         int x0       = x >> s->ps.sps->hshift[c_idx];
+         int y0       = y >> s->ps.sps->vshift[c_idx];
+         int stride_src = s->frame->linesize[c_idx];
+@@ -312,28 +343,82 @@
+         int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> s->ps.sps->hshift[c_idx]) - x0);
+         int height   = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
+         int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
+-        uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+-        int stride_dst;
++        ptrdiff_t stride_dst;
+         uint8_t *dst;
+ 
++#ifdef RPI
++        const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift;
++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
++        uint8_t * const src = !sliced ?
++                &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] :
++            c_idx == 0 ?
++                rpi_sliced_frame_pos_y(s->frame, x0, y0) :
++                rpi_sliced_frame_pos_c(s->frame, x0, y0);
++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
++            !sliced ? src - (1 << sh) :
++            c_idx == 0 ?
++                rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) :
++                rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0);
++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
++            !sliced ? src + (width << sh) :
++            c_idx == 0 ?
++                rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) :
++                rpi_sliced_frame_pos_c(s->frame, x0 + width, y0);
++
++
++        if (sliced && c_idx > 1) {
++            break;
++        }
++#else
++        const unsigned int sh = s->ps.sps->pixel_shift;
++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
++        uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh);
++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : src + (width << sh);
++#endif
++
+         switch (sao->type_idx[c_idx]) {
+         case SAO_BAND:
+             copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                            x_ctb, y_ctb);
+             if (s->ps.pps->transquant_bypass_enable_flag ||
+                 (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
+-            dst = lc->edge_emu_buffer;
+-            stride_dst = 2*MAX_PB_SIZE;
+-            copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
+-            s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+-                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+-                                            width, height);
+-            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+-                               x, y, width, height, c_idx);
++                dst = lc->edge_emu_buffer;
++                stride_dst = 2*MAX_PB_SIZE;
++                copy_CTB(dst, src, width << sh, height, stride_dst, stride_src);
++#ifdef RPI
++                if (sliced && c_idx != 0)
++                {
++                    s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
++                                                    sao->offset_val[1], sao->band_position[1],
++                                                    sao->offset_val[2], sao->band_position[2],
++                                                    width, height);
++                }
++                else
++#endif
++                {
++                    s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
++                                                    width, height);
++                }
++                restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++                                   x, y, width, height, c_idx);
+             } else {
+-            s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+-                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+-                                            width, height);
++#ifdef RPI
++                if (sliced && c_idx != 0)
++                {
++                    s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
++                                                    sao->offset_val[1], sao->band_position[1],
++                                                    sao->offset_val[2], sao->band_position[2],
++                                                    width, height);
++                }
++                else
++#endif
++                {
++                    s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
++                                                    width, height);
++                }
+             }
+             sao->type_idx[c_idx] = SAO_APPLIED;
+             break;
+@@ -341,108 +426,117 @@
+         {
+             int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+             int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
+-            int left_edge = edges[0];
+             int top_edge = edges[1];
+-            int right_edge = edges[2];
+             int bottom_edge = edges[3];
+-            int sh = s->ps.sps->pixel_shift;
+-            int left_pixels, right_pixels;
+ 
+             stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
+             dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
+ 
+             if (!top_edge) {
+-                int left = 1 - left_edge;
+-                int right = 1 - right_edge;
+-                const uint8_t *src1[2];
+                 uint8_t *dst1;
+-                int src_idx, pos;
++                int src_idx;
++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
++
++                dst1 = dst - stride_dst;
+ 
+-                dst1 = dst - stride_dst - (left << sh);
+-                src1[0] = src - stride_src - (left << sh);
+-                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
+-                pos = 0;
+-                if (left) {
++                if (src_l != NULL) {
+                     src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+                                SAO_APPLIED);
+-                    copy_pixel(dst1, src1[src_idx], sh);
+-                    pos += (1 << sh);
++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
+                 }
++
+                 src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+                            SAO_APPLIED);
+-                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+-                if (right) {
+-                    pos += width << sh;
++                memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
++
++                if (src_r != NULL) {
+                     src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+                                SAO_APPLIED);
+-                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
+                 }
+             }
+             if (!bottom_edge) {
+-                int left = 1 - left_edge;
+-                int right = 1 - right_edge;
+-                const uint8_t *src1[2];
+-                uint8_t *dst1;
+-                int src_idx, pos;
++                uint8_t * const dst1 = dst + height * stride_dst;
++                int src_idx;
++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
++                const unsigned int hoff = height * stride_src;
+ 
+-                dst1 = dst + height * stride_dst - (left << sh);
+-                src1[0] = src + height * stride_src - (left << sh);
+-                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+-                pos = 0;
+-                if (left) {
++                if (src_l != NULL) {
+                     src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+                                SAO_APPLIED);
+-                    copy_pixel(dst1, src1[src_idx], sh);
+-                    pos += (1 << sh);
++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
+                 }
++
+                 src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+                            SAO_APPLIED);
+-                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+-                if (right) {
+-                    pos += width << sh;
++                memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
++
++                if (src_r != NULL) {
+                     src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+                                SAO_APPLIED);
+-                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
+                 }
+             }
+-            left_pixels = 0;
+-            if (!left_edge) {
++            if (src_l != NULL) {
+                 if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                     copy_vert(dst - (1 << sh),
+                               s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                               sh, height, stride_dst, 1 << sh);
+                 } else {
+-                    left_pixels = 1;
++                    copy_vert(dst - (1 << sh),
++                              src_l,
++                              sh, height, stride_dst, stride_src);
+                 }
+             }
+-            right_pixels = 0;
+-            if (!right_edge) {
++            if (src_r != NULL) {
+                 if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                     copy_vert(dst + (width << sh),
+                               s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                               sh, height, stride_dst, 1 << sh);
+                 } else {
+-                    right_pixels = 1;
++                    copy_vert(dst + (width << sh),
++                              src_r,
++                              sh, height, stride_dst, stride_src);
+                 }
+             }
+ 
+-            copy_CTB(dst - (left_pixels << sh),
+-                     src - (left_pixels << sh),
+-                     (width + left_pixels + right_pixels) << sh,
++            copy_CTB(dst,
++                     src,
++                     width << sh,
+                      height, stride_dst, stride_src);
+ 
+             copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                            x_ctb, y_ctb);
+-            s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+-                                            sao->eo_class[c_idx], width, height);
+-            s->hevcdsp.sao_edge_restore[restore](src, dst,
+-                                                stride_src, stride_dst,
+-                                                sao,
+-                                                edges, width,
+-                                                height, c_idx,
+-                                                vert_edge,
+-                                                horiz_edge,
+-                                                diag_edge);
++#ifdef RPI
++            if (sliced && c_idx != 0)
++            {
++                // Class always the same for both U & V (which is just as well :-))
++                s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
++                                                sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
++                                                width, height);
++                s->hevcdsp.sao_edge_restore_c[restore](src, dst,
++                                                    stride_src, stride_dst,
++                                                    sao,
++                                                    edges, width,
++                                                    height, c_idx,
++                                                    vert_edge,
++                                                    horiz_edge,
++                                                    diag_edge);
++            }
++            else
++#endif
++            {
++                s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
++                                                sao->eo_class[c_idx], width, height);
++                s->hevcdsp.sao_edge_restore[restore](src, dst,
++                                                    stride_src, stride_dst,
++                                                    sao,
++                                                    edges, width,
++                                                    height, c_idx,
++                                                    vert_edge,
++                                                    horiz_edge,
++                                                    diag_edge);
++            }
+             restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                                x, y, width, height, c_idx);
+             sao->type_idx[c_idx] = SAO_APPLIED;
+@@ -452,6 +546,7 @@
+     }
+ }
+ 
++// Returns 2 or 0.
+ static int get_pcm(HEVCContext *s, int x, int y)
+ {
+     int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
+@@ -478,7 +573,7 @@
+     uint8_t *src;
+     int x, y;
+     int chroma, beta;
+-    int32_t c_tc[2], tc[2];
++    int32_t c_tc[4], tc[2];
+     uint8_t no_p[2] = { 0 };
+     uint8_t no_q[2] = { 0 };
+ 
+@@ -495,6 +590,15 @@
                  s->ps.sps->pcm.loop_filter_disable_flag) ||
                 s->ps.pps->transquant_bypass_enable_flag;
  
@@ -6106,27 +8522,81 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
      if (x0) {
          left_tc_offset   = s->deblock[ctb - 1].tc_offset;
          left_beta_offset = s->deblock[ctb - 1].beta_offset;
-@@ -538,6 +562,19 @@
-                                                          s->frame->linesize[LUMA],
-                                                          beta, tc, no_p, no_q);
-                 } else
+@@ -528,19 +632,51 @@
+ 
+                 tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
+                 tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
+-                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+                 if (pcmf) {
+                     no_p[0] = get_pcm(s, x - 1, y);
+                     no_p[1] = get_pcm(s, x - 1, y + 4);
+                     no_q[0] = get_pcm(s, x, y);
+                     no_q[1] = get_pcm(s, x, y + 4);
+-                    s->hevcdsp.hevc_v_loop_filter_luma_c(src,
+-                                                         s->frame->linesize[LUMA],
+-                                                         beta, tc, no_p, no_q);
+-                } else
+-                    s->hevcdsp.hevc_v_loop_filter_luma(src,
+-                                                       s->frame->linesize[LUMA],
+-                                                       beta, tc, no_p, no_q);
++                }
++#ifdef RPI
++                if (rpi_sliced_frame(s->frame)) {
++
++                    // This copes properly with no_p/no_q
++                    s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y),
++                                                     s->frame->linesize[LUMA],
++                                                     beta, tc, no_p, no_q,
++                                                     rpi_sliced_frame_pos_y(s->frame, x - 4, y));
++                }
++                else
++#endif
++                {
++                    src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++                    if (pcmf) {
++                        // Standard DSP code is broken if no_p / no_q is set
++                        s->hevcdsp.hevc_v_loop_filter_luma_c(src,
++                                                           s->frame->linesize[LUMA],
++                                                           beta, tc, no_p, no_q);
++                    }
++                    else
 +#ifdef RPI_DEBLOCK_VPU
-+                if (s->enable_rpi_deblock) {
-+                    uint8_t (*setup)[2][2][4];
-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
-+                    int a = ((y>>3) & 1) << 1;
-+                    int b = (x>>3) & 1;
-+                    setup = s->dvq->y_setup_arm[num16];
-+                    setup[0][b][0][a] = beta;
-+                    setup[0][b][0][a + 1] = beta;
-+                    setup[0][b][1][a] = tc[0];
-+                    setup[0][b][1][a + 1] = tc[1];
-+                } else
++                    if (s->enable_rpi_deblock) {
++                        uint8_t (*setup)[2][2][4];
++                        int num16 = (y>>4)*s->setup_width + (x>>4);
++                        int a = ((y>>3) & 1) << 1;
++                        int b = (x>>3) & 1;
++                        setup = s->dvq->y_setup_arm[num16];
++                        setup[0][b][0][a] = beta;
++                        setup[0][b][0][a + 1] = beta;
++                        setup[0][b][1][a] = tc[0];
++                        setup[0][b][1][a + 1] = tc[1];
++                    } else
 +#endif
-                     s->hevcdsp.hevc_v_loop_filter_luma(src,
-                                                        s->frame->linesize[LUMA],
-                                                        beta, tc, no_p, no_q);
-@@ -570,6 +607,19 @@
++                    {
++                        s->hevcdsp.hevc_v_loop_filter_luma(src,
++                                                           s->frame->linesize[LUMA],
++                                                           beta, tc, no_p, no_q);
++                    }
++                }
+             }
+         }
+ 
+@@ -560,7 +696,12 @@
+                 beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
+                 tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
+                 tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
+-                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++                src =
++#ifdef RPI
++                    rpi_sliced_frame(s->frame) ?
++                        rpi_sliced_frame_pos_y(s->frame, x, y) :
++#endif
++                        &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+                 if (pcmf) {
+                     no_p[0] = get_pcm(s, x, y - 1);
+                     no_p[1] = get_pcm(s, x + 4, y - 1);
+@@ -570,6 +711,19 @@
                                                           s->frame->linesize[LUMA],
                                                           beta, tc, no_p, no_q);
                  } else
@@ -6146,7 +8616,113 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
                      s->hevcdsp.hevc_h_loop_filter_luma(src,
                                                         s->frame->linesize[LUMA],
                                                         beta, tc, no_p, no_q);
-@@ -604,9 +654,23 @@
+@@ -578,6 +732,91 @@
+     }
+ 
+     if (s->ps.sps->chroma_format_idc) {
++#ifdef RPI
++        if (rpi_sliced_frame(s->frame)) {
++            const int v = 2;
++            const int h = 2;
++
++            // vertical filtering chroma
++            for (y = y0; y < y_end; y += 8 * v) {
++                for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) {
++                    const int bs0 = s->vertical_bs[(x +  y          * s->bs_width) >> 2];
++                    const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2];
++
++                    if ((bs0 == 2) || (bs1 == 2)) {
++                        const int qp0 = (get_qPy(s, x - 1, y)         + get_qPy(s, x, y)         + 1) >> 1;
++                        const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1;
++                        unsigned int no_f = 0;
++
++                        // tc_offset here should be set to cur_tc_offset I think
++                        const uint32_t tc4 =
++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) |
++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
++
++                        if (tc4 == 0)
++                            continue;
++
++                        if (pcmf) {
++                            no_f =
++                                (get_pcm(s, x - 1, y) ? 1 : 0) |
++                                (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) |
++                                (get_pcm(s, x, y) ? 4 : 0) |
++                                (get_pcm(s, x, y + 4 * v) ? 8 : 0);
++                            if (no_f == 0xf)
++                                continue;
++                        }
++
++                        s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1),
++                                                       s->frame->linesize[1],
++                                                       tc4,
++                                                       rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++                                                       no_f);
++                    }
++                }
++
++                if (y == 0)
++                    continue;
++
++                // horizontal filtering chroma
++                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
++                x_end2 = x_end;
++                if (x_end != s->ps.sps->width)
++                    x_end2 = x_end - 8 * h;
++
++                for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) {
++                    const int bs0 = s->horizontal_bs[( x          + y * s->bs_width) >> 2];
++                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
++                    if ((bs0 == 2) || (bs1 == 2)) {
++                        const int qp0 = bs0 == 2 ? (get_qPy(s, x,         y - 1) + get_qPy(s, x,         y) + 1) >> 1 : 0;
++                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0;
++                        const uint32_t tc4 =
++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) |
++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
++                        unsigned int no_f = 0;
++
++                        if (tc4 == 0)
++                            continue;
++
++                        if (pcmf) {
++                            no_f =
++                                (get_pcm(s, x,         y - 1) ? 1 : 0) |
++                                (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) |
++                                (get_pcm(s, x,         y)     ? 4 : 0) |
++                                (get_pcm(s, x + 4 * h, y)     ? 8 : 0);
++
++                            if (no_f == 0xf)
++                                continue;
++                        }
++
++                        s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1),
++                                                             s->frame->linesize[1],
++                                                             tc4, no_f);
++                    }
++                }
++            }
++        }
++        else
++#endif
+         for (chroma = 1; chroma <= 2; chroma++) {
+             int h = 1 << s->ps.sps->hshift[chroma];
+             int v = 1 << s->ps.sps->vshift[chroma];
+@@ -594,7 +833,12 @@
+ 
+                         c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
+                         c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
+-                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
++                        src =
++#ifdef RPI
++                            rpi_sliced_frame(s->frame) ?
++                                rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
++#endif
++                                &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+                         if (pcmf) {
+                             no_p[0] = get_pcm(s, x - 1, y);
+                             no_p[1] = get_pcm(s, x - 1, y + (4 * v));
+@@ -604,9 +848,23 @@
                                                                     s->frame->linesize[chroma],
                                                                     c_tc, no_p, no_q);
                          } else
@@ -6170,7 +8746,21 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
                      }
                  }
  
-@@ -637,6 +701,19 @@
+@@ -627,7 +885,12 @@
+ 
+                         c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
+                         c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
+-                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++                        src =
++#ifdef RPI
++                            rpi_sliced_frame(s->frame) ?
++                                rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
++#endif
++                                &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+                         if (pcmf) {
+                             no_p[0] = get_pcm(s, x,           y - 1);
+                             no_p[1] = get_pcm(s, x + (4 * h), y - 1);
+@@ -637,6 +900,19 @@
                                                                     s->frame->linesize[chroma],
                                                                     c_tc, no_p, no_q);
                          } else
@@ -6190,7 +8780,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
                              s->hevcdsp.hevc_h_loop_filter_chroma(src,
                                                                   s->frame->linesize[chroma],
                                                                   c_tc, no_p, no_q);
-@@ -647,69 +724,6 @@
+@@ -647,69 +923,6 @@
      }
  }
  
@@ -6260,7 +8850,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
  
  void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                             int log2_trafo_size)
-@@ -720,10 +734,21 @@
+@@ -720,10 +933,22 @@
      int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
      int min_pu_width     = s->ps.sps->min_pu_width;
      int min_tu_width     = s->ps.sps->min_tb_width;
@@ -6270,8 +8860,9 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
 -    int i, j, bs;
 +    int i, j;
 +    RefPicList *rpl      = s->ref->refPicList;
-+    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
-+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
++    const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size);
++    const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2);  // Dup
++    const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep
 +    int y_pu             = y0 >> log2_min_pu_size;
 +    int x_pu             = x0 >> log2_min_pu_size;
 +    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
@@ -6285,7 +8876,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
  
      boundary_upper = y0 > 0 && !(y0 & 7);
      if (boundary_upper &&
-@@ -735,34 +760,56 @@
+@@ -735,34 +960,56 @@
            (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
          boundary_upper = 0;
  
@@ -6362,7 +8953,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
      boundary_left = x0 > 0 && !(x0 & 7);
      if (boundary_left &&
          ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-@@ -773,77 +820,252 @@
+@@ -773,77 +1020,160 @@
            (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
          boundary_left = 0;
  
@@ -6424,11 +9015,6 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
  
 -    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
 -        RefPicList *rpl = s->ref->refPicList;
--
--        // bs for TU internal horizontal PU boundaries
--        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
--            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
--            int yq_pu = (y0 + j)     >> log2_min_pu_size;
 +    if (!is_intra) {
 +        for (i = inc; i < trafo_in_min_pus; i += inc) {
 +            MvField *left;
@@ -6445,159 +9031,56 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
 +    }
 +}
  
--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
--                int x_pu = (x0 + i) >> log2_min_pu_size;
--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+-        // bs for TU internal horizontal PU boundaries
+-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
 +#undef LUMA
 +#undef CB
 +#undef CR
  
--                bs = boundary_strength(s, curr, top, rpl);
--                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
--            }
--        }
-+#if !defined(RPI_FAST_CACHEFLUSH)
-+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
-+static void flush_buffer_y(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
-+    gpu_cache_flush(&p);
-+}
-+
-+static void flush_buffer_u(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
-+    gpu_cache_flush(&p);
-+}
-+
-+static void flush_buffer_v(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
-+    gpu_cache_flush(&p);
-+}
-+#endif
-+#endif
- 
--        // bs for TU internal vertical PU boundaries
--        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
--            int y_pu = (y0 + j) >> log2_min_pu_size;
- 
--            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
--                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
--                int xq_pu = (x0 + i)     >> log2_min_pu_size;
--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+-                int x_pu = (x0 + i) >> log2_min_pu_size;
+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
 +#ifdef RPI_DEBLOCK_VPU
-+#error Not fixed yet
- 
--                bs = boundary_strength(s, curr, left, rpl);
--                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
--            }
 +// ff_hevc_flush_buffer_lines
 +// flushes and invalidates all pixel rows in [start,end-1]
 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
 +{
-+#ifdef RPI_FAST_CACHEFLUSH
-+        struct vcsm_user_clean_invalid_s iocache = {};
-+        int curr_y = start;
-+        int n = end;
-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+        int n_uv = n >> s->ps.sps->vshift[1];
-+        int sz,base;
-+        GPU_MEM_PTR_T p;
-+        if (curr_uv < 0) curr_uv = 0;
-+        if (n_uv<=curr_uv) { return; }
-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+        base = s->frame->linesize[1] * curr_uv;
-+        if (flush_chroma) {
-+          p = get_gpu_mem_ptr_u(s->frame);
-+          iocache.s[0].handle = p.vcsm_handle;
-+          iocache.s[0].cmd = 3; // clean+invalidate
-+          iocache.s[0].addr = (int)p.arm + base;
-+          iocache.s[0].size  = sz;
-+          p = get_gpu_mem_ptr_v(s->frame);
-+          iocache.s[1].handle = p.vcsm_handle;
-+          iocache.s[1].cmd = 3; // clean+invalidate
-+          iocache.s[1].addr = (int)p.arm + base;
-+          iocache.s[1].size  = sz;
-+        }
-+        if (flush_luma) {
-+          p = get_gpu_mem_ptr_y(s->frame);
-+          sz = s->frame->linesize[0] * (n-curr_y);
-+          base = s->frame->linesize[0] * curr_y;
-+          iocache.s[2].handle = p.vcsm_handle;
-+          iocache.s[2].cmd = 3; // clean+invalidate
-+          iocache.s[2].addr = (int)p.arm + base;
-+          iocache.s[2].size  = sz;
-+        }
-+        vcsm_clean_invalid( &iocache );
-+#else
-+        if (flush_chroma) {
-+          flush_buffer_u(s->frame);
-+          flush_buffer_v(s->frame);
-+        }
-+        if (flush_luma) {
-+          flush_buffer_y(s->frame);
-         }
-+#endif
++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++    rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++      start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma);
++    rpi_cache_flush_finish(rfe);
 +}
 +#endif
+ 
+-                bs = boundary_strength(s, curr, top, rpl);
+-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+-            }
++#if RPI_INTER
 +
-+#ifdef RPI_INTER_QPU
-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
++// Flush some lines of a reference frame
++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
 +{
 +    if (s->enable_rpi && s->used_for_ref) {
-+      // TODO make this use ff_hevc_flush_buffer_lines
-+#ifdef RPI_FAST_CACHEFLUSH
-+        struct vcsm_user_clean_invalid_s iocache = {};
-+        int curr_y = ((int *)f->progress->data)[0];
-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+        int n_uv = n >> s->ps.sps->vshift[1];
-+        int sz,base;
-+        GPU_MEM_PTR_T p;
-+        if (curr_uv < 0) curr_uv = 0;
-+        if (n_uv<=curr_uv) { return; }
-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+        base = s->frame->linesize[1] * curr_uv;
-+        p = get_gpu_mem_ptr_u(s->frame);
-+        iocache.s[0].handle = p.vcsm_handle;
-+        iocache.s[0].cmd = 3; // clean+invalidate
-+        iocache.s[0].addr = (int)p.arm + base;
-+        iocache.s[0].size  = sz;
-+        p = get_gpu_mem_ptr_v(s->frame);
-+        iocache.s[1].handle = p.vcsm_handle;
-+        iocache.s[1].cmd = 3; // clean+invalidate
-+        iocache.s[1].addr = (int)p.arm + base;
-+        iocache.s[1].size  = sz;
-+
-+#ifdef RPI_LUMA_QPU
-+        p = get_gpu_mem_ptr_y(s->frame);
-+        sz = s->frame->linesize[0] * (n-curr_y);
-+        base = s->frame->linesize[0] * curr_y;
-+        iocache.s[2].handle = p.vcsm_handle;
-+        iocache.s[2].cmd = 3; // clean+invalidate
-+        iocache.s[2].addr = (int)p.arm + base;
-+        iocache.s[2].size  = sz;
-+#endif
-+        vcsm_clean_invalid( &iocache );
-+#else
-+        flush_buffer_u(s->frame);
-+        flush_buffer_v(s->frame);
-+#ifdef RPI_LUMA_QPU
-+        flush_buffer_y(s->frame);
-+#endif
-+
-+#endif
-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-     }
- }
++        const int d0 = ((int *)f->progress->data)[0];
++        const unsigned int curr_y = d0 == -1 ? 0 : d0;  // At start of time progress is -1
++
++        if (curr_y < (unsigned int)f->f->height) {
++            rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++            rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++              curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1);
++            rpi_cache_flush_finish(rfe);
+         }
++    }
++}
 +#endif
  
--#undef LUMA
--#undef CB
--#undef CR
+-        // bs for TU internal vertical PU boundaries
+-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+-            int y_pu = (y0 + j) >> log2_min_pu_size;
 +#ifdef RPI_DEBLOCK_VPU
-+#error XXX
 +/* rpi_deblock deblocks an entire row of ctbs using the VPU */
 +static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
 +{
@@ -6626,18 +9109,34 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
 +  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
 +  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
 +  s->dvq->vpu_cmds_arm[2][5] = 4;
-+  // Call VPU
-+  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
 +
++  // Call VPU
++  {
++      const vpu_qpu_job_h vqj = vpu_qpu_job_new();
++      vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5);  // 5 means to do all the commands
++      vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
++      vpu_qpu_job_finish(vqj);
++  }
+ 
+-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
+-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
+-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
 +  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
 +  s->dvq = s->dvq_ents + s->dvq_n;
-+
-+  if (s->dvq->cmd_id != -1) {
-+      vpu_wait(s->dvq->cmd_id);
-+      s->dvq->cmd_id = -1;
-+  }
-+}
-+
+ 
+-                bs = boundary_strength(s, curr, left, rpl);
+-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+-            }
+-        }
+-    }
++  vpu_qpu_wait(&s->dvq->cmd_id);
+ }
+ 
+-#undef LUMA
+-#undef CB
+-#undef CR
 +#endif
  
  void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
@@ -6664,14 +9163,14 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
      if (s->ps.sps->sao_enabled) {
          int y_end = y >= s->ps.sps->height - ctb_size;
          if (y && x)
-@@ -852,16 +1074,46 @@
+@@ -852,16 +1182,46 @@
              sao_filter_CTB(s, x - ctb_size, y);
          if (y && x_end) {
              sao_filter_CTB(s, x, y - ctb_size);
 -            if (s->threads_type & FF_THREAD_FRAME )
-+            if (s->threads_type & FF_THREAD_FRAME ) {
-+#ifdef RPI_INTER_QPU
-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
++            if (s->threads_type == FF_THREAD_FRAME ) {
++#if RPI_INTER
++                rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
 +#endif
                  ff_thread_report_progress(&s->ref->tf, y, 0);
 +            }
@@ -6679,15 +9178,15 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
          if (x_end && y_end) {
              sao_filter_CTB(s, x , y);
 -            if (s->threads_type & FF_THREAD_FRAME )
-+            if (s->threads_type & FF_THREAD_FRAME ) {
-+#ifdef RPI_INTER_QPU
-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
++            if (s->threads_type == FF_THREAD_FRAME ) {
++#if RPI_INTER
++                rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
 +#endif
                  ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
 +            }
          }
 -    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
++    } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
 +        //int newh = y + ctb_size - 4;
 +        //int currh = s->ref->tf.progress->data[0];
 +        //if (((y + ctb_size)&63)==0)
@@ -6698,14 +9197,14 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
 +            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
 +          }
 +        } else {
-+#ifdef RPI_INTER_QPU
-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
++#if RPI_INTER
++          rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
 +#endif
 +          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
 +        }
 +#else
-+#ifdef RPI_INTER_QPU
-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
++#if RPI_INTER
++        rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
 +        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
 +#endif
          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
@@ -6716,7 +9215,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_filter.c ffmpeg-3.2.4.patch/libavcodec/h
  void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
 diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 --- ffmpeg-3.2.4/libavcodec/hevc.h	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/hevc.h	2017-03-22 22:42:34.835798541 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/hevc.h	2017-05-28 20:42:45.736088659 +0200
 @@ -23,6 +23,9 @@
  #ifndef AVCODEC_HEVC_H
  #define AVCODEC_HEVC_H
@@ -6727,37 +9226,53 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
  #include "libavutil/buffer.h"
  #include "libavutil/md5.h"
  
-@@ -37,6 +40,29 @@
+@@ -37,6 +40,45 @@
  #include "thread.h"
  #include "videodsp.h"
  
 +// define RPI to split the CABAC/prediction/transform into separate stages
-+#ifdef RPI
++#ifndef RPI
 +
-+  #include "rpi_qpu.h"
-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
-+  #define RPI_INTER_QPU
++  #define RPI_INTER          0
++  #define RPI_TSTATS         0
++  #define RPI_HEVC_SAND      0
 +
-+  #ifdef RPI_INTER_QPU
-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-+    #define RPI_LUMA_QPU
-+  #endif
++#else
++
++  #include "rpi_qpu.h"
++  #define RPI_INTER          1          // 0 use ARM for UV inter-pred, 1 use QPU
 +
-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
-+  #define RPI_MAX_JOBS 2
 +  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
 +  #define RPI_WORKER
++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
++  // This has no effect unless RPI_WORKER is defined
++  // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as
++  // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one
++  // free for the foreground to fill in.
++  #define RPI_MAX_JOBS 2
++
 +  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
++  // As it stands there is something mildly broken in VPU deblock - looks mostly OK
++  // but reliably fails some conformance tests (e.g. DBLK_A/B/C_)
++  // With VPU luma & chroma pred it is much the same speed to deblock on the ARM
 +//  #define RPI_DEBLOCK_VPU
 +
-+#endif
++  #define RPI_VPU_DEBLOCK_CACHED 1
 +
-+#define RPI_VPU_DEBLOCK_CACHED 1
++  #if HAVE_NEON
++  #define RPI_HEVC_SAND      1
++  #else
++  // Sand bust on Pi1 currently - reasons unknown
++  #define RPI_HEVC_SAND      0
++  #endif
++
++  #define RPI_TSTATS 0
++#endif
 +
  #define MAX_DPB_SIZE 16 // A.4.1
  #define MAX_REFS 16
  
-@@ -669,17 +695,6 @@
+@@ -669,17 +711,6 @@
      uint8_t cu_transquant_bypass_flag;
  } CodingUnit;
  
@@ -6775,7 +9290,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
  typedef struct NeighbourAvailable {
      int cand_bottom_left;
      int cand_left;
-@@ -756,7 +771,17 @@
+@@ -756,7 +787,17 @@
      uint8_t flags;
  } HEVCFrame;
  
@@ -6793,7 +9308,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
      uint8_t cabac_state[HEVC_CONTEXTS];
  
      uint8_t stat_coeff[4];
-@@ -771,7 +796,6 @@
+@@ -771,7 +812,6 @@
  
      int qPy_pred;
  
@@ -6801,7 +9316,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
  
      uint8_t ctb_left_flag;
      uint8_t ctb_up_flag;
-@@ -788,7 +812,6 @@
+@@ -788,7 +828,6 @@
      int ct_depth;
      CodingUnit cu;
      PredictionUnit pu;
@@ -6809,7 +9324,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
  
  #define BOUNDARY_LEFT_SLICE     (1 << 0)
  #define BOUNDARY_LEFT_TILE      (1 << 1)
-@@ -799,6 +822,80 @@
+@@ -799,6 +838,147 @@
      int boundary_flags;
  } HEVCLocalContext;
  
@@ -6821,13 +9336,15 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 +// This is a distance of 1536 pixels across the screen
 +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
 +// but allocate more memory and increase the latency before data in the next frame can be processed
-+#define RPI_NUM_CHUNKS 1
++#define RPI_NUM_CHUNKS 4
++#define RPI_CHUNK_SIZE 12
 +
 +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE)
 +
 +// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
++#define RPI_MAX_MV_CMDS_Y   (2*16*1*(RPI_MAX_WIDTH/4))
++#define RPI_MAX_MV_CMDS_C   (2*16*2*(RPI_MAX_WIDTH/4))
 +// Each block can have an intra prediction and a transform_add command
 +#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
 +// Worst case is 16x16 CTUs
@@ -6844,53 +9361,118 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 +
 +// Command for inter prediction
 +typedef struct HEVCMvCmd {
-+    int cmd;
-+    uint8_t *dst;
-+    ptrdiff_t dststride;
++    uint8_t cmd;
++    uint8_t block_w;
++    uint8_t block_h;
++    int8_t ref_idx[2];
++    uint16_t dststride;
++    uint16_t srcstride;
++    uint16_t srcstride1;
++    int16_t weight;
++    int16_t offset;
++    int16_t x_off;
++    int16_t y_off;
 +    uint8_t *src;
-+    ptrdiff_t srcstride;
-+    Mv mv;
-+    int x_off;
-+    int y_off;
-+    int block_w;
-+    int block_h;
-+    int weight;
-+    int offset;
 +    uint8_t *src1;
-+    ptrdiff_t srcstride1;
++    uint8_t *dst;
++    Mv mv;
 +    Mv mv1;
-+    int8_t ref_idx[2];
 +} HEVCMvCmd;
 +
 +
 +// Command for intra prediction and transform_add of predictions to coefficients
-+#define RPI_PRED_TRANSFORM_ADD 0
-+#define RPI_PRED_INTRA 1
++enum rpi_pred_cmd_e
++{
++    RPI_PRED_ADD_RESIDUAL,
++    RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
++    RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
++    RPI_PRED_INTRA,
++    RPI_PRED_I_PCM,
++    RPI_PRED_CMD_MAX
++};
++
 +typedef struct HEVCPredCmd {
-+    uint8_t size;
 +    uint8_t type;
-+    uint8_t na;
-+    uint8_t c_idx;
-+    union {
-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t x;   // RPI_PRED_INTRA
-+    };
-+    union {
-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t y;   // RPI_PRED_INTRA
-+    };
++    uint8_t size;  // log2 "size" used by all variants
++    uint8_t na;    // i_pred - but left here as they pack well
++    uint8_t c_idx; // i_pred
 +    union {
-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t stride;         // RPI_PRED_INTRA
++        struct {  // TRANSFORM_ADD
++            uint8_t * dst;
++            const int16_t * buf;
++            uint32_t stride;
++        } ta;
++        struct {  // INTRA
++            uint16_t x;
++            uint16_t y;
++            enum IntraPredMode mode;
++        } i_pred;
++        struct {  // I_PCM
++            uint16_t x;
++            uint16_t y;
++            const void * src;
++            uint32_t src_len;
++        } i_pcm;
 +    };
 +} HEVCPredCmd;
 +
 +#endif
++
++#ifdef RPI
++
++struct qpu_mc_pred_c_s;
++struct qpu_mc_pred_y_s;
++
++typedef struct HEVCRpiLumaPred
++{
++    struct qpu_mc_pred_y_s *qpu_mc_base;
++    struct qpu_mc_pred_y_s *qpu_mc_curr;
++    struct qpu_mc_pred_y_s *last_lx;
++    unsigned int load;
++} HEVCRpiLumaPred;
++
++typedef struct HEVCRpiChromaPred
++{
++    struct qpu_mc_pred_c_s *qpu_mc_base;
++    struct qpu_mc_pred_c_s *qpu_mc_curr;
++    struct qpu_mc_pred_c_s *last_l0;
++    struct qpu_mc_pred_c_s *last_l1;
++    unsigned int load;
++} HEVCRpiChromaPred;
++
++typedef struct HEVCRpiJob {
++    GPU_MEM_PTR_T chroma_mvs_gptr;
++    GPU_MEM_PTR_T luma_mvs_gptr;
++    HEVCRpiChromaPred chroma_mvs[QPU_N_UV];
++    HEVCRpiLumaPred luma_mvs[QPU_N_Y];
++} HEVCRpiJob;
++
++#if RPI_TSTATS
++typedef struct HEVCRpiStats {
++    int y_pred1_y8_merge;
++    int y_pred1_xy;
++    int y_pred1_x0;
++    int y_pred1_y0;
++    int y_pred1_x0y0;
++    int y_pred1_wle8;
++    int y_pred1_wgt8;
++    int y_pred1_hle16;
++    int y_pred1_hgt16;
++    int y_pred2_xy;
++    int y_pred2_x0;
++    int y_pred2_y0;
++    int y_pred2_x0y0;
++    int y_pred2_hle16;
++    int y_pred2_hgt16;
++} HEVCRpiStats;
++#endif
++
++#endif
 +
  typedef struct HEVCContext {
      const AVClass *c;  // needed by private avoptions
      AVCodecContext *avctx;
-@@ -807,13 +904,107 @@
+@@ -807,13 +987,103 @@
  
      HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
      HEVCLocalContext    *HEVClc;
@@ -6908,7 +9490,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 +
 +#ifdef RPI
 +    int enable_rpi;
-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
++    HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS];
++    HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS];
 +    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
 +    int buf_width;
 +    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
@@ -6917,7 +9500,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 +    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
 +    int num_coeffs[RPI_MAX_JOBS][4];
 +    int num_xfm_cmds[RPI_MAX_JOBS];
-+    int num_mv_cmds[RPI_MAX_JOBS];
++    int num_mv_cmds_y[RPI_MAX_JOBS];
++    int num_mv_cmds_c[RPI_MAX_JOBS];
 +    int num_pred_cmds[RPI_MAX_JOBS];
 +    int num_dblk_cmds[RPI_MAX_JOBS];
 +    int vpu_id;
@@ -6927,29 +9511,23 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 +    int max_ctu_count; // Number of CTUs when we trigger a round of processing
 +    int ctu_per_y_chan; // Number of CTUs per luma QPU
 +    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
-+#ifdef RPI_INTER_QPU
-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+
-+    // _base pointers are to the start of the row
-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
-+    // these pointers are to the next free space
-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
-+    // Function pointers
-+    uint32_t mc_filter_uv;
-+    uint32_t mc_filter_uv_b0;
-+    uint32_t mc_filter_uv_b;
++
++    HEVCRpiJob jobs[RPI_MAX_JOBS];
++#if RPI_TSTATS
++    HEVCRpiStats tstats;
 +#endif
-+#ifdef RPI_LUMA_QPU
-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
++#if RPI_INTER
++    HEVCRpiChromaPred * curr_pred_c;
++    HEVCRpiLumaPred * curr_pred_y;
++    struct qpu_mc_pred_y_s * last_y8_p;
++    struct qpu_mc_pred_y_s * last_y8_lx;
++
 +    // Function pointers
-+    uint32_t mc_filter;
-+    uint32_t mc_filter_b;
++    uint32_t qpu_filter_uv;
++    uint32_t qpu_filter_uv_b0;
++    uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory
++    uint32_t qpu_filter;
++    uint32_t qpu_filter_b;
 +#endif
 +
 +#ifdef RPI_WORKER
@@ -6986,7 +9564,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
 +        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
 +        int vpu_cmds_vc;
 +
-+        int cmd_id;
++        vpu_qpu_wait_h cmd_id;
 +    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
 +
 +    struct dblk_vpu_q_s * dvq;
@@ -6999,7 +9577,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
      uint8_t *cabac_state;
  
      /** 1 if the independent slice segment header was successfully parsed */
-@@ -931,6 +1122,9 @@
+@@ -931,6 +1201,9 @@
      uint32_t max_mastering_luminance;
      uint32_t min_mastering_luminance;
  
@@ -7009,30 +9587,195 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc.h ffmpeg-3.2.4.patch/libavcodec/hevc.h
  } HEVCContext;
  
  int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-@@ -1057,6 +1251,10 @@
+@@ -1057,6 +1330,10 @@
                                   int log2_trafo_size, enum ScanType scan_idx,
                                   int c_idx);
  
-+#ifdef RPI_INTER_QPU
-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
++#if RPI_INTER
++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n);
++#endif
++
+ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
+ 
+ 
+@@ -1081,4 +1358,15 @@
+ extern const uint8_t ff_hevc_diag_scan8x8_x[64];
+ extern const uint8_t ff_hevc_diag_scan8x8_y[64];
+ 
++#ifdef RPI
++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
++
++// arm/hevc_misc_neon.S
++// Neon coeff zap fn
++#if HAVE_NEON
++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
++#endif
++
++#endif
++
+ #endif /* AVCODEC_HEVC_H */
+diff -Naur ffmpeg-3.2.4/libavcodec/hevcpred.c ffmpeg-3.2.4.patch/libavcodec/hevcpred.c
+--- ffmpeg-3.2.4/libavcodec/hevcpred.c	2016-06-27 01:54:29.000000000 +0200
++++ ffmpeg-3.2.4.patch/libavcodec/hevcpred.c	2017-05-28 20:42:45.744088687 +0200
+@@ -24,6 +24,7 @@
+ 
+ #include "hevcpred.h"
+ 
++#define PRED_C 0
+ #define BIT_DEPTH 8
+ #include "hevcpred_template.c"
+ #undef BIT_DEPTH
+@@ -39,13 +40,37 @@
+ #define BIT_DEPTH 12
+ #include "hevcpred_template.c"
+ #undef BIT_DEPTH
++#undef PRED_C
++
++#ifdef RPI
++#define PRED_C 1
++#define BIT_DEPTH 8
++#include "hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++#endif
+ 
+ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
+ {
+ #undef FUNC
+ #define FUNC(a, depth) a ## _ ## depth
+ 
+-#define HEVC_PRED(depth)                                \
++#undef FUNCC
++#define FUNCC(a, depth) a ## _ ## depth ## _c
++
++#define HEVC_PRED_Y(depth)                                \
+     hpc->intra_pred[0]   = FUNC(intra_pred_2, depth);   \
+     hpc->intra_pred[1]   = FUNC(intra_pred_3, depth);   \
+     hpc->intra_pred[2]   = FUNC(intra_pred_4, depth);   \
+@@ -60,6 +85,30 @@
+     hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
+     hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
+ 
++#define HEVC_PRED_C(depth) /* fill the *_c members of hpc with the FUNCC(name, depth) functions */ \
++    hpc->intra_pred_c[0]   = FUNCC(intra_pred_2, depth);   \
++    hpc->intra_pred_c[1]   = FUNCC(intra_pred_3, depth);   \
++    hpc->intra_pred_c[2]   = FUNCC(intra_pred_4, depth);   \
++    hpc->intra_pred_c[3]   = FUNCC(intra_pred_5, depth);   \
++    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);  \
++    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);  \
++    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);  \
++    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);  \
++    hpc->pred_dc_c         = FUNCC(pred_dc, depth);        \
++    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
++    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
++    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
++    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
++
++#ifdef RPI
++#define HEVC_PRED(depth) \
++    HEVC_PRED_Y(depth); \
++    HEVC_PRED_C(depth);
++#else
++#define HEVC_PRED(depth) \
++    HEVC_PRED_Y(depth);
++#endif
++
+     switch (bit_depth) {
+     case 9:
+         HEVC_PRED(9);
+diff -Naur ffmpeg-3.2.4/libavcodec/hevcpred.h ffmpeg-3.2.4.patch/libavcodec/hevcpred.h
+--- ffmpeg-3.2.4/libavcodec/hevcpred.h	2016-06-27 01:54:29.000000000 +0200
++++ ffmpeg-3.2.4.patch/libavcodec/hevcpred.h	2017-05-28 20:42:45.745088691 +0200
+@@ -38,6 +38,17 @@
+     void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+                             const uint8_t *left, ptrdiff_t stride,
+                             int c_idx, int mode);
++#ifdef RPI
++    void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
++
++    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
++                           const uint8_t *left, ptrdiff_t stride);
++    void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
++                    ptrdiff_t stride, int log2_size, int c_idx);
++    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int c_idx, int mode);
++#endif
+ } HEVCPredContext;
+ 
+ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+diff -Naur ffmpeg-3.2.4/libavcodec/hevcpred_template.c ffmpeg-3.2.4.patch/libavcodec/hevcpred_template.c
+--- ffmpeg-3.2.4/libavcodec/hevcpred_template.c	2016-06-27 01:54:29.000000000 +0200
++++ ffmpeg-3.2.4.patch/libavcodec/hevcpred_template.c	2017-05-28 20:42:45.746088694 +0200
+@@ -20,13 +20,55 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++//#define DISABLE_INTRA
++
+ #include "libavutil/pixdesc.h"
+ 
+ #include "bit_depth_template.c"
+ #include "hevcpred.h"
+ 
++#ifdef RPI
++#include "rpi_zc.h"
++#endif
++
++#define DUMP_PRED 0
++
+ #define POS(x, y) src[(x) + stride * (y)]
+ 
++#if PRED_C
++
++typedef uint8_t (* c8_dst_ptr_t)[2];
++typedef const uint8_t (* c8_src_ptr_t)[2];
++
++#if BIT_DEPTH == 8
++#undef BIT_DEPTH
++#define BIT_DEPTH 16
++#include "bit_depth_template.c"
++#undef FUNC
++#define FUNC(a) FUNC3(a, 8, _c)
++#else
++#undef FUNC
++#define FUNC FUNCC
++#endif
++
++#endif
++
++#if DUMP_PRED
++#ifndef DEBUG_ONCE
++#define DEBUG_ONCE
++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
++{   // Debug aid: print a size x size grid to stdout, reading every second byte of each row
++    for (unsigned int y = 0; y != size; y++, data += stride * 2) {  // next row is stride * 2 bytes further on
++        for (unsigned int x = 0; x != size; x++) {
++            printf("%4d", data[x * 2]);  // skip the interleaved neighbour byte
++        }
++        printf("\n");
++    }
++    printf("\n");  // blank line between dumps
++}
++#endif
 +#endif
 +
- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
- 
- 
-diff -Naur ffmpeg-3.2.4/libavcodec/hevcpred_template.c ffmpeg-3.2.4.patch/libavcodec/hevcpred_template.c
---- ffmpeg-3.2.4/libavcodec/hevcpred_template.c	2016-06-27 01:54:29.000000000 +0200
-+++ ffmpeg-3.2.4.patch/libavcodec/hevcpred_template.c	2017-03-22 22:42:34.842798559 +0100
-@@ -20,6 +20,8 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+//#define DISABLE_INTRA
-+
- #include "libavutil/pixdesc.h"
- 
- #include "bit_depth_template.c"
-@@ -69,8 +71,11 @@
+ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
+                                               int log2_size, int c_idx)
+ {
+@@ -69,8 +111,11 @@
                  AV_WN4P(&ptr[i], a);                                           \
              else                                                               \
                  a = PIXEL_SPLAT_X4(ptr[i + 3])
@@ -7045,21 +9788,416 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevcpred_template.c ffmpeg-3.2.4.patch/libavc
      int i;
      int hshift = s->ps.sps->hshift[c_idx];
      int vshift = s->ps.sps->vshift[c_idx];
-@@ -114,6 +119,10 @@
+@@ -79,15 +124,23 @@
+     int size_in_tbs_h  = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+     int size_in_luma_v = size << vshift;
+     int size_in_tbs_v  = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
+-    int x = x0 >> hshift;
+-    int y = y0 >> vshift;
++    const int x = x0 >> hshift;
++    const int y = y0 >> vshift;
+     int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+     int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+ 
+     int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
+ 
+-    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
++    const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
++#if defined(RPI)
++    pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ?
++            (pixel*)s->frame->data[c_idx] + x + y * stride :
++        c_idx == 0 ?
++            (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) :
++            (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y);
++#else
+     pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride;
++#endif
+ 
+     int min_pu_width = s->ps.sps->min_pu_width;
+ 
+@@ -95,14 +148,20 @@
+                               lc->tu.intra_pred_mode;
+     pixel4 a;
+     pixel  left_array[2 * MAX_TB_SIZE + 1];
++#if !PRED_C
+     pixel  filtered_left_array[2 * MAX_TB_SIZE + 1];
++#endif
+     pixel  top_array[2 * MAX_TB_SIZE + 1];
++#if !PRED_C
+     pixel  filtered_top_array[2 * MAX_TB_SIZE + 1];
++#endif
+ 
+     pixel  *left          = left_array + 1;
+     pixel  *top           = top_array  + 1;
++#if !PRED_C
+     pixel  *filtered_left = filtered_left_array + 1;
+     pixel  *filtered_top  = filtered_top_array  + 1;
++#endif
+     int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
+     int cand_left        = lc->na.cand_left;
+     int cand_up_left     = lc->na.cand_up_left;
+@@ -114,6 +173,26 @@
      int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
                             (x0 + size_in_luma_h)) >> hshift;
  
++    pixel * src_l = src - 1;
++    pixel * src_u = src - stride;
++    pixel * src_ur = src_u + size;
++
 +#ifdef DISABLE_INTRA
 +    return;
 +#endif
++
++#if defined(RPI)
++    if (s->frame->format == AV_PIX_FMT_SAND128) {
++        const AVFrame * const frame = s->frame;
++        const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
++        const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride;
++        if ((x & mask) == 0)
++            src_l -= stripe_adj;
++        if (((x + size) & mask) == 0)
++            src_ur += stripe_adj;
++    }
++#endif
 +
      if (s->ps.pps->constrained_intra_pred_flag == 1) {
          int size_in_luma_pu_v = PU(size_in_luma_v);
          int size_in_luma_pu_h = PU(size_in_luma_h);
+@@ -163,23 +242,24 @@
+         top[-1] = 128;
+     }
+     if (cand_up_left) {
+-        left[-1] = POS(-1, -1);
++        left[-1] = src_l[-stride];
+         top[-1]  = left[-1];
+     }
+     if (cand_up)
+-        memcpy(top, src - stride, size * sizeof(pixel));
++        // Always good - even with sand
++        memcpy(top, src_u, size * sizeof(pixel));
+     if (cand_up_right) {
+-        memcpy(top + size, src - stride + size, size * sizeof(pixel));
+-        EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
++        memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
++        EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
+                size - top_right_size);
+     }
+     if (cand_left)
+         for (i = 0; i < size; i++)
+-            left[i] = POS(-1, i);
++            left[i] = src_l[stride * i];
+     if (cand_bottom_left) {
+         for (i = size; i < size + bottom_left_size; i++)
+-            left[i] = POS(-1, i);
+-        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
++            left[i] = src_l[stride * i];
++        EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
+                size - bottom_left_size);
+     }
+ 
+@@ -268,7 +348,11 @@
+             cand_up_left = 1;
+             cand_left    = 1;
+         } else { // No samples available
++#if PRED_C && BIT_DEPTH == 16
++            left[-1] = 0x8080;
++#else
+             left[-1] = (1 << (BIT_DEPTH - 1));
++#endif
+             EXTEND(top,  left[-1], 2 * size);
+             EXTEND(left, left[-1], 2 * size);
+         }
+@@ -287,6 +371,9 @@
+     top[-1] = left[-1];
+ 
+     // Filtering process
++    // Sand128 can only apply to chroma_format_idc == 1 so we don't need to
++    // worry about chroma smoothing for that case
++#if !PRED_C
+     if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0  || s->ps.sps->chroma_format_idc == 3)) {
+         if (mode != INTRA_DC && size != 4){
+             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+@@ -342,13 +429,46 @@
+                                            mode);
+         break;
+     }
++#else
++    switch (mode) {
++    case INTRA_PLANAR:
++        s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++                                          (uint8_t *)left, stride);
++        break;
++    case INTRA_DC:
++        s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
++                       (uint8_t *)left, stride, log2_size, c_idx);
++        break;
++    default:
++        s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++                                           (uint8_t *)left, stride, c_idx,
++                                           mode);
++        break;
++    }
++
++#if DUMP_PRED
++    printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
++    dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
++    printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
++    dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
++#endif
++#endif
+ }
+ 
++#if !PRED_C || BIT_DEPTH == 16
+ #define INTRA_PRED(size)                                                            \
+ static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx)    \
+ {                                                                                   \
+     FUNC(intra_pred)(s, x0, y0, size, c_idx);                                       \
+ }
++#else
++#define INTRA_PRED(size) /* stub variant: logs the function name at PANIC level, then aborts */ \
++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx)    \
++{                                                                                   \
++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
++    abort();                                                                        \
++}
++#endif
+ 
+ INTRA_PRED(2)
+ INTRA_PRED(3)
+@@ -357,6 +477,7 @@
+ 
+ #undef INTRA_PRED
+ 
++#if !PRED_C
+ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
+                                   const uint8_t *_left, ptrdiff_t stride,
+                                   int trafo_size)
+@@ -371,13 +492,46 @@
+             POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size]  +
+                          (size - 1 - y) * top[x]  + (y + 1) * left[size] + size) >> (trafo_size + 1);
+ }
++#else
++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
++                                  const uint8_t * _left, ptrdiff_t stride,
++                                  int trafo_size)
++{   // Planar prediction over 2-byte pairs: the same formula is applied independently to byte [0] and byte [1]
++    int x, y;
++    int size = 1 << trafo_size;  // block edge length in pairs
++    c8_dst_ptr_t src = (c8_dst_ptr_t)_src;
++    const c8_src_ptr_t top = (c8_src_ptr_t)_top;
++    const c8_src_ptr_t left = (c8_src_ptr_t)_left;
++
++    for (y = 0; y < size; y++, src += stride)  // src points at pairs, so stride is counted in pairs
++    {
++        for (x = 0; x < size; x++)
++        {
++            src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0]  +
++                         (size - 1 - y) * top[x][0]  + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
++            src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1]  +
++                         (size - 1 - y) * top[x][1]  + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
++        }
++    }
++}
++#endif
+ 
++#if !PRED_C || BIT_DEPTH == 16
+ #define PRED_PLANAR(size)\
+ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
+                                        const uint8_t *left, ptrdiff_t stride)   \
+ {                                                                               \
+     FUNC(pred_planar)(src, top, left, stride, size + 2);                        \
+ }
++#else
++#define PRED_PLANAR(size) /* stub variant: logs the function name at PANIC level, then aborts */ \
++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
++                                       const uint8_t *left, ptrdiff_t stride)   \
++{                                                                               \
++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                          \
++    abort();                                                                    \
++}
++#endif
+ 
+ PRED_PLANAR(0)
+ PRED_PLANAR(1)
+@@ -386,6 +540,7 @@
+ 
+ #undef PRED_PLANAR
+ 
++#if !PRED_C
+ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+                           const uint8_t *_left,
+                           ptrdiff_t stride, int log2_size, int c_idx)
+@@ -416,7 +571,53 @@
+             POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
+     }
+ }
++#else
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                          const uint8_t *_left,
++                          ptrdiff_t stride, int log2_size, int c_idx)
++{   // DC prediction over 2-byte pairs: bytes [0] and [1] are averaged and filled separately; c_idx is not read here
++    unsigned int i, j;
++    const unsigned int size = (1 << log2_size);
++    c8_dst_ptr_t src = (c8_dst_ptr_t)_src;
++    const c8_src_ptr_t top = (c8_src_ptr_t)_top;
++    const c8_src_ptr_t left = (c8_src_ptr_t)_left;
++    unsigned int dc0 = size;  // seeded with size: the rounding term for the >> (log2_size + 1) below
++    unsigned int dc1 = size;
++
++    for (i = 0; i < size; i++)  // sum size samples from left and size samples from top
++    {
++        dc0 += left[i][0] + top[i][0];
++        dc1 += left[i][1] + top[i][1];
++    }
++
++    dc0 >>= log2_size + 1;  // divide by 2 * size
++    dc1 >>= log2_size + 1;
++
++    for (i = 0; i < size; i++, src += stride)  // src points at pairs, so stride is counted in pairs
++    {
++        for (j = 0; j < size; ++j)
++        {
++            src[j][0] = dc0;
++            src[j][1] = dc1;
++
++        }
++    }
++}
++#endif
+ 
++#ifndef ANGLE_CONSTS  // guard so the tables below are defined only once per translation unit
++#define ANGLE_CONSTS
++static const int intra_pred_angle[] = {  // read as intra_pred_angle[mode - 2]
++     32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++    -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
++};
++static const int inv_angle[] = {  // read as inv_angle[mode - 11]
++    -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++    -630, -910, -1638, -4096
++};
++#endif
++
++#if !PRED_C
+ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+                                                 const uint8_t *_top,
+                                                 const uint8_t *_left,
+@@ -428,15 +629,6 @@
+     const pixel *top  = (const pixel *)_top;
+     const pixel *left = (const pixel *)_left;
+ 
+-    static const int intra_pred_angle[] = {
+-         32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+-        -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
+-    };
+-    static const int inv_angle[] = {
+-        -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+-        -630, -910, -1638, -4096
+-    };
+-
+     int angle = intra_pred_angle[mode - 2];
+     pixel ref_array[3 * MAX_TB_SIZE + 4];
+     pixel *ref_tmp = ref_array + size;
+@@ -509,6 +701,83 @@
+         }
+     }
+ }
++#else
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++                                                const uint8_t *_top,
++                                                const uint8_t *_left,
++                                                ptrdiff_t stride, int c_idx,
++                                                int mode, int size)
++{   // Angular prediction over 2-byte pairs: bytes [0] and [1] are interpolated independently; c_idx is not read here
++    int x, y;
++    c8_dst_ptr_t src  = (c8_dst_ptr_t)_src;
++    c8_src_ptr_t top  = (c8_src_ptr_t)_top;
++    c8_src_ptr_t left = (c8_src_ptr_t)_left;
++
++    const int angle = intra_pred_angle[mode - 2];
++    uint8_t ref_array[3 * MAX_TB_SIZE + 4][2];
++    c8_dst_ptr_t ref_tmp = ref_array + size;  // leaves size entries of room for negative indices
++    c8_src_ptr_t ref;
++    const int last = (size * angle) >> 5;  // lowest ref index that will be needed
++
++    if (mode >= 18) {  // reference is the top row; output is produced row by row
++        ref = top - 1;
++        if (angle < 0 && last < -1) {  // negative indices needed: build ref in ref_tmp from top plus samples of left
++            memcpy(ref_tmp, top - 1, (size + 1) * 2);
++            for (x = last; x <= -1; x++)
++            {
++                ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++                ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++            }
++            ref = (c8_src_ptr_t)ref_tmp;
++        }
++
++        for (y = 0; y < size; y++, src += stride) {
++            const int idx  = ((y + 1) * angle) >> 5;  // whole-sample offset into ref
++            const int fact = ((y + 1) * angle) & 31;  // 1/32 weight given to the second sample
++            if (fact) {
++                for (x = 0; x < size; ++x) {
++                    src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
++                                       fact  * ref[x + idx + 2][0] + 16) >> 5;
++                    src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
++                                       fact  * ref[x + idx + 2][1] + 16) >> 5;
++                }
++            } else {
++                memcpy(src, ref + idx + 1, size * 2);  // no fractional part: copy size pairs (2 bytes each)
++            }
++        }
++    } else {  // mode < 18: reference is the left column; output is produced column by column
++        ref = left - 1;
++        if (angle < 0 && last < -1) {  // negative indices needed: build ref in ref_tmp from left plus samples of top
++            memcpy(ref_tmp, left - 1, (size + 1) * 2);
++            for (x = last; x <= -1; x++)
++            {
++                ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++                ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++            }
++            ref = (c8_src_ptr_t)ref_tmp;
++        }
++
++        for (x = 0; x < size; x++, src++) {  // src++ moves one pair to the right per column
++            const int idx  = ((x + 1) * angle) >> 5;
++            const int fact = ((x + 1) * angle) & 31;
++            if (fact) {
++                for (y = 0; y < size; y++) {
++                    src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
++                                       fact  * ref[y + idx + 2][0] + 16) >> 5;
++                    src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
++                                       fact  * ref[y + idx + 2][1] + 16) >> 5;
++                }
++            } else {
++                for (y = 0; y < size; y++)
++                {
++                    src[y * stride][0] = ref[y + idx + 1][0];
++                    src[y * stride][1] = ref[y + idx + 1][1];
++                }
++            }
++        }
++    }
++}
++#endif
+ 
+ static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
+                                  const uint8_t *left,
 diff -Naur ffmpeg-3.2.4/libavcodec/hevc_ps.c ffmpeg-3.2.4.patch/libavcodec/hevc_ps.c
 --- ffmpeg-3.2.4/libavcodec/hevc_ps.c	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/hevc_ps.c	2017-03-22 22:42:34.841798557 +0100
-@@ -1001,6 +1001,8 @@
++++ ffmpeg-3.2.4.patch/libavcodec/hevc_ps.c	2017-05-28 20:42:45.742088680 +0200
+@@ -779,7 +779,12 @@
+     switch (sps->bit_depth) {
+     case 8:
+         if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
++#if RPI_HEVC_SAND
++        // *** Horrid kludge s.t. we start out with sand format
++        if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
++#else
+         if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
++#endif
+         if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
+         if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
+        break;
+@@ -1001,6 +1006,8 @@
      sps->amp_enabled_flag = get_bits1(gb);
      sps->sao_enabled      = get_bits1(gb);
  
@@ -7070,20 +10208,21 @@ diff -Naur ffmpeg-3.2.4/libavcodec/hevc_ps.c ffmpeg-3.2.4.patch/libavcodec/hevc_
          sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
 diff -Naur ffmpeg-3.2.4/libavcodec/Makefile ffmpeg-3.2.4.patch/libavcodec/Makefile
 --- ffmpeg-3.2.4/libavcodec/Makefile	2017-02-10 14:25:26.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/Makefile	2017-03-22 22:42:34.813798485 +0100
-@@ -5,6 +5,11 @@
++++ ffmpeg-3.2.4.patch/libavcodec/Makefile	2017-05-28 20:42:45.713088577 +0200
+@@ -5,6 +5,12 @@
  HEADERS = avcodec.h                                                     \
            avdct.h                                                       \
            avfft.h                                                       \
 +          rpi_qpu.h                                                     \
 +          rpi_shader.h                                                  \
++	  rpi_shader_cmd.h                                              \
 +          rpi_mailbox.h                                                 \
 +          rpi_hevc_transform.h                                          \
 +          rpi_zc.h                                                      \
            d3d11va.h                                                     \
            dirac.h                                                       \
            dv_profile.h                                                  \
-@@ -45,6 +50,10 @@
+@@ -45,6 +51,10 @@
         resample.o                                                       \
         resample2.o                                                      \
         utils.o                                                          \
@@ -7094,21 +10233,25 @@ diff -Naur ffmpeg-3.2.4/libavcodec/Makefile ffmpeg-3.2.4.patch/libavcodec/Makefi
         vorbis_parser.o                                                  \
         xiph.o                                                           \
  
-@@ -1093,3 +1102,11 @@
+@@ -1093,3 +1103,15 @@
  $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
  $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
  endif
 +
++QASM := $(SUBDIR)../pi-util/qasm.py
++
++ifneq ("$(wildcard $(QASM))","")
 +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
-+	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
++	python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
 +
 +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
-+	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
++	python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
++endif
 +
-+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h
 diff -Naur ffmpeg-3.2.4/libavcodec/mmaldec.c ffmpeg-3.2.4.patch/libavcodec/mmaldec.c
 --- ffmpeg-3.2.4/libavcodec/mmaldec.c	2017-02-10 14:25:27.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/mmaldec.c	2017-03-22 22:42:34.842798559 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/mmaldec.c	2017-05-28 20:42:45.746088694 +0200
 @@ -24,6 +24,9 @@
   * MMAL Video Decoder
   */
@@ -7129,7 +10272,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/mmaldec.c ffmpeg-3.2.4.patch/libavcodec/mmald
  #include "internal.h"
 diff -Naur ffmpeg-3.2.4/libavcodec/mpeg4videodec.c ffmpeg-3.2.4.patch/libavcodec/mpeg4videodec.c
 --- ffmpeg-3.2.4/libavcodec/mpeg4videodec.c	2017-02-10 14:25:27.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/mpeg4videodec.c	2017-03-22 22:42:34.843798562 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/mpeg4videodec.c	2017-05-28 20:42:45.747088698 +0200
 @@ -2212,6 +2212,9 @@
  
          if (ctx->divx_version >= 0)
@@ -7148,9 +10291,88 @@ diff -Naur ffmpeg-3.2.4/libavcodec/mpeg4videodec.c ffmpeg-3.2.4.patch/libavcodec
      if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
          s->codec_id == AV_CODEC_ID_MPEG4 &&
          avctx->idct_algo == FF_IDCT_AUTO) {
+diff -Naur ffmpeg-3.2.4/libavcodec/raw.c ffmpeg-3.2.4.patch/libavcodec/raw.c
+--- ffmpeg-3.2.4/libavcodec/raw.c	2017-02-10 14:25:27.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/raw.c	2017-05-28 20:42:45.747088698 +0200
+@@ -269,6 +269,11 @@
+     { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
+     { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
+ 
++    /* RPI */
++#ifdef RPI
++    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
++#endif
++
+     /* special */
+     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
+     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+diff -Naur ffmpeg-3.2.4/libavcodec/rawenc.c ffmpeg-3.2.4.patch/libavcodec/rawenc.c
+--- ffmpeg-3.2.4/libavcodec/rawenc.c	2017-02-10 14:25:27.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/rawenc.c	2017-05-28 20:42:45.748088702 +0200
+@@ -47,6 +47,47 @@
+     return 0;
+ }
+ 
++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off)
++{   // Copy every second byte of frame->data[1], starting at byte offset c_off, into dst; returns dst past the last byte written
++    for (int y = 0; y != frame->height / 2; ++y) {
++        for (int x = 0; x < frame->width; x += frame->linesize[0]) {  // x advances in chunks of linesize[0]
++            const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off;
++            const int w = FFMIN(frame->linesize[0], frame->width - x) / 2;  // output bytes for this chunk
++            for (int i = 0; i < w; ++i)
++                *dst++ = p[i * 2];  // skip the interleaved neighbour byte
++        }
++    }
++    return dst;
++}
++
++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++                      const AVFrame *frame)
++{   // Allocate pkt and fill it with the luma plane, then chroma c_off 0, then chroma c_off 1; returns 0 or the allocation error
++    int size = frame->width * frame->height * 3 / 2;  // width * height luma bytes plus half as much again for chroma
++    uint8_t * dst;
++    int ret;
++
++    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++        return ret;
++
++    dst = pkt->data;
++
++    // Luma: copy each row in chunks of at most linesize[0] bytes
++    for (int y = 0; y != frame->height; ++y) {
++        for (int x = 0; x < frame->width; x += frame->linesize[0]) {
++            const int w = FFMIN(frame->linesize[0], frame->width - x);
++            memcpy(dst,
++                frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w);
++            dst += w;
++        }
++    }
++    // Chroma: de-interleave the bytes at offset 0, then the bytes at offset 1
++    dst = cpy_sand_c(dst, frame, 0);
++    dst = cpy_sand_c(dst, frame, 1);
++
++    return 0;
++}
++
+ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+                       const AVFrame *frame, int *got_packet)
+ {
+@@ -56,6 +97,12 @@
+     if (ret < 0)
+         return ret;
+ 
++    if (frame->format == AV_PIX_FMT_SAND128) {
++        ret = raw_sand_as_yuv420(avctx, pkt, frame);
++        *got_packet = (ret == 0);
++        return ret;
++    }
++
+     if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
+         return ret;
+     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_hevc_transform.h ffmpeg-3.2.4.patch/libavcodec/rpi_hevc_transform.h
 --- ffmpeg-3.2.4/libavcodec/rpi_hevc_transform.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_hevc_transform.h	2017-03-22 22:42:34.845798567 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_hevc_transform.h	2017-05-28 20:42:45.749088705 +0200
 @@ -0,0 +1,3070 @@
 +unsigned char rpi_hevc_transform [] = {
 +21,
@@ -10224,7 +13446,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_hevc_transform.h ffmpeg-3.2.4.patch/libav
 +};
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_hevc_transform.s ffmpeg-3.2.4.patch/libavcodec/rpi_hevc_transform.s
 --- ffmpeg-3.2.4/libavcodec/rpi_hevc_transform.s	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_hevc_transform.s	2017-03-22 22:42:34.846798570 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_hevc_transform.s	2017-05-28 20:42:45.750088709 +0200
 @@ -0,0 +1,917 @@
 +# ******************************************************************************
 +# Argon Design Ltd.
@@ -11145,8 +14367,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_hevc_transform.s ffmpeg-3.2.4.patch/libav
 +  pop r6-r7, pc
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/rpi_mailbox.c
 --- ffmpeg-3.2.4/libavcodec/rpi_mailbox.c	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_mailbox.c	2017-03-22 22:42:34.846798570 +0100
-@@ -0,0 +1,340 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_mailbox.c	2017-05-28 20:42:45.750088709 +0200
+@@ -0,0 +1,149 @@
 +/*
 +Copyright (c) 2012, Broadcom Europe Ltd.
 +All rights reserved.
@@ -11174,6 +14396,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 +*/
 +
++#ifdef RPI
++
 +#include <stdio.h>
 +#include <string.h>
 +#include <stdlib.h>
@@ -11181,7 +14405,6 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +#include <unistd.h>
 +#include <assert.h>
 +#include <stdint.h>
-+#include <sys/mman.h>
 +#include <sys/ioctl.h>
 +
 +#include <linux/ioctl.h>
@@ -11191,75 +14414,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +#define DEVICE_FILE_NAME "/dev/vcio"
 +
 +#include "rpi_mailbox.h"
-+
-+#define PAGE_SIZE (4*1024)
-+
-+// Shared memory will not be cached in ARM cache
-+void *mapmem_shared(unsigned base, unsigned size)
-+{
-+   int mem_fd;
-+   unsigned offset = base % PAGE_SIZE;
-+   base = base - offset;
-+   /* open /dev/mem */
-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-+      return NULL;
-+   }
-+   void *mem = mmap(
-+      0,
-+      size,
-+      PROT_READ|PROT_WRITE,
-+      MAP_SHARED/*|MAP_FIXED*/,
-+      mem_fd,
-+      base);
-+#ifdef DEBUG
-+   printf("base=0x%x, mem=%p\n", base, mem);
-+#endif
-+   if (mem == MAP_FAILED) {
-+      printf("mmap error %d\n", (int)mem);
-+      return NULL;
-+   }
-+   close(mem_fd);
-+   return (char *)mem + offset;
-+}
-+
-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
-+void *mapmem_private(unsigned base, unsigned size)
-+{
-+   int mem_fd;
-+   unsigned offset = base % PAGE_SIZE;
-+   base = base - offset;
-+   /* open /dev/mem */
-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-+      return NULL;
-+   }
-+   void *mem = mmap(
-+      0,
-+      size,
-+      PROT_READ|PROT_WRITE,
-+      MAP_PRIVATE/*|MAP_FIXED*/,
-+      mem_fd,
-+      base);
-+#ifdef DEBUG
-+   printf("base=0x%x, mem=%p\n", base, mem);
-+#endif
-+   if (mem == MAP_FAILED) {
-+      printf("mmap error %d\n", (int)mem);
-+      return NULL;
-+   }
-+   close(mem_fd);
-+   return (char *)mem + offset;
-+}
-+
-+void unmapmem(void *addr, unsigned size)
-+{
-+   int s = munmap(addr, size);
-+   if (s != 0) {
-+      printf("munmap error %d\n", s);
-+      exit (-1);
-+   }
-+}
++//#include <interface/vctypes/vc_image_structs.h>
 +
 +/*
 + * use ioctl to send mbox property message
@@ -11273,109 +14428,25 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +      printf("ioctl_set_msg failed:%d\n", ret_val);
 +   }
 +
-+#ifdef DEBUG
-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-+   for (i=0; i<size/4; i++)
-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-+#endif
-+   return ret_val;
-+}
-+
-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000c; // (the tag id)
-+   p[i++] = 12; // (size of the buffer)
-+   p[i++] = 12; // (size of the data)
-+   p[i++] = size; // (num bytes? or pages?)
-+   p[i++] = align; // (alignment)
-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_free(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000f; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_lock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000d; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_unlock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000e; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
++#ifdef DEBUG
++   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
++   for (i=0; i<size/4; i++)
++      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
++#endif
++   return ret_val;
 +}
 +
-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
++unsigned mbox_mem_lock(int file_desc, unsigned handle)
 +{
 +   int i=0;
 +   unsigned p[32];
 +   p[i++] = 0; // size
 +   p[i++] = 0x00000000; // process request
 +
-+   p[i++] = 0x30010; // (the tag id)
-+   p[i++] = 28; // (size of the buffer)
-+   p[i++] = 28; // (size of the data)
-+   p[i++] = code;
-+   p[i++] = r0;
-+   p[i++] = r1;
-+   p[i++] = r2;
-+   p[i++] = r3;
-+   p[i++] = r4;
-+   p[i++] = r5;
++   p[i++] = 0x3000d; // (the tag id)
++   p[i++] = 4; // (size of the buffer)
++   p[i++] = 4; // (size of the data)
++   p[i++] = handle;
 +
 +   p[i++] = 0x00000000; // end tag
 +   p[0] = i*sizeof *p; // actual size
@@ -11384,18 +14455,17 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +   return p[5];
 +}
 +
-+unsigned qpu_enable(int file_desc, unsigned enable)
++unsigned mbox_mem_unlock(int file_desc, unsigned handle)
 +{
 +   int i=0;
 +   unsigned p[32];
-+
 +   p[i++] = 0; // size
 +   p[i++] = 0x00000000; // process request
 +
-+   p[i++] = 0x30012; // (the tag id)
++   p[i++] = 0x3000e; // (the tag id)
 +   p[i++] = 4; // (size of the buffer)
 +   p[i++] = 4; // (size of the data)
-+   p[i++] = enable;
++   p[i++] = handle;
 +
 +   p[i++] = 0x00000000; // end tag
 +   p[0] = i*sizeof *p; // actual size
@@ -11404,72 +14474,30 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +   return p[5];
 +}
 +
-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x30011; // (the tag id)
-+   p[i++] = 16; // (size of the buffer)
-+   p[i++] = 16; // (size of the data)
-+   p[i++] = num_qpus;
-+   p[i++] = control;
-+   p[i++] = noflush;
-+   p[i++] = timeout; // ms
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
++#define GET_VCIMAGE_PARAMS 0x30044
 +
-+void execute_multi(int file_desc,
-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
-+   int i=0;
-+   unsigned p[32];
++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
++{
++    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
++    uint32_t * p = buf;
++    void * rimg;
++    int rv;
 +
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x30018; // (the tag id)
-+   p[i++] = 88; // (size of the buffer)
-+   p[i++] = 88; // (size of the data)
-+
-+   p[i++] = num_qpus;
-+   p[i++] = control;
-+   p[i++] = noflush;
-+   p[i++] = timeout; // ms
-+
-+   p[i++] = num_qpus_2;
-+   p[i++] = control_2;
-+   p[i++] = noflush_2;
-+   p[i++] = timeout_2; // ms
-+
-+   p[i++] = code;
-+   p[i++] = r0;
-+   p[i++] = r1;
-+   p[i++] = r2;
-+   p[i++] = r3;
-+   p[i++] = r4;
-+   p[i++] = r5;
-+
-+   p[i++] = code_2;
-+   p[i++] = r0_2;
-+   p[i++] = r1_2;
-+   p[i++] = r2_2;
-+   p[i++] = r3_2;
-+   p[i++] = r4_2;
-+   p[i++] = r5_2;
++    *p++ = 0; // size
++    *p++ = 0; // process request
++    *p++ = GET_VCIMAGE_PARAMS;
++    *p++ = sizeof(*img);
++    *p++ = sizeof(*img);
++    rimg = p;
++    memcpy(p, img, sizeof(*img));
++    p += sizeof(*img) / sizeof(*p);
++    *p++ = 0;  // End tag
++    buf[0] = (p - buf) * sizeof(*p);
 +
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
++    rv = mbox_property(fd, buf);
++    memcpy(img, rimg, sizeof(*img));
 +
-+   mbox_property(file_desc, p);
-+   return;
++    return rv;
 +}
 +
 +int mbox_open() {
@@ -11487,51 +14515,76 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.c ffmpeg-3.2.4.patch/libavcodec/r
 +void mbox_close(int file_desc) {
 +  close(file_desc);
 +}
++
++#endif
++
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_mailbox.h ffmpeg-3.2.4.patch/libavcodec/rpi_mailbox.h
 --- ffmpeg-3.2.4/libavcodec/rpi_mailbox.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_mailbox.h	2017-03-22 22:42:34.846798570 +0100
-@@ -0,0 +1,25 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_mailbox.h	2017-05-28 20:42:45.750088709 +0200
+@@ -0,0 +1,58 @@
 +#ifndef RPI_MAILBOX_H
 +#define RPI_MAILBOX_H
 +
++/* The image structure. */
++typedef struct vc_image_extra_uv_s {
++  void *u, *v;
++  int vpitch;
++} VC_IMAGE_EXTRA_UV_T;
++
++typedef union {
++    VC_IMAGE_EXTRA_UV_T uv;
++//  VC_IMAGE_EXTRA_RGBA_T rgba;
++//  VC_IMAGE_EXTRA_PAL_T pal;
++//  VC_IMAGE_EXTRA_TF_T tf;
++//  VC_IMAGE_EXTRA_BAYER_T bayer;
++//  VC_IMAGE_EXTRA_MSBAYER_T msbayer;
++//  VC_IMAGE_EXTRA_CODEC_T codec;
++//  VC_IMAGE_EXTRA_OPENGL_T opengl;
++} VC_IMAGE_EXTRA_T;
++
++
++typedef struct VC_IMAGE_T {
++  unsigned short                  type;           /* should restrict to 16 bits */
++  unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
++  unsigned short                  width;          /* width in pixels */
++  unsigned short                  height;         /* height in pixels */
++  int                             pitch;          /* pitch of image_data array in bytes */
++  int                             size;           /* number of bytes available in image_data array */
++  void                           *image_data;     /* pixel data */
++  VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
++  void                           *metadata;       /* metadata header for the image */
++  void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
++  int                             mem_handle;     /* the mem handle for relocatable memory storage */
++  int                             metadata_size;  /* size of metadata of each channel in bytes */
++  int                             channel_offset; /* offset of consecutive channels in bytes */
++  uint32_t                        video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
++  uint8_t                         num_channels;   /* number of channels (2 for stereo) */
++  uint8_t                         current_channel;/* the channel this header is currently pointing to */
++  uint8_t                         linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
++  uint8_t                         is_channel_linked;     /* Track if the above structure is being used to link the header
++                                                            into a linked-multichannel image */
++  uint8_t                         channel_index;         /* index of the channel this header represents while
++                                                            it is being linked. */
++  uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
++} VC_IMAGE_T;
++
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
++
++
 +extern int mbox_open(void);
 +extern void mbox_close(int file_desc);
 +
-+extern unsigned get_version(int file_desc);
-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
-+extern unsigned mem_free(int file_desc, unsigned handle);
-+extern unsigned mem_lock(int file_desc, unsigned handle);
-+extern unsigned mem_unlock(int file_desc, unsigned handle);
-+extern void *mapmem_shared(unsigned base, unsigned size);
-+extern void *mapmem_private(unsigned base, unsigned size);
-+extern void unmapmem(void *addr, unsigned size);
-+
-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-+extern void execute_multi(int file_desc,
-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
-+extern unsigned qpu_enable(int file_desc, unsigned enable);
++extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
 +
 +#endif
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.c ffmpeg-3.2.4.patch/libavcodec/rpi_qpu.c
 --- ffmpeg-3.2.4/libavcodec/rpi_qpu.c	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_qpu.c	2017-03-22 22:42:34.848798575 +0100
-@@ -0,0 +1,993 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_qpu.c	2017-05-28 20:42:45.751088712 +0200
+@@ -0,0 +1,902 @@
 +#ifdef RPI
-+// Use vchiq service for submitting jobs
-+#define GPUSERVICE
-+
-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+//#define RPI_TIME_TOTAL_QPU
-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+//#define RPI_TIME_TOTAL_VPU
-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-+#define RPI_TIME_TOTAL_POSTED
-+
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
@@ -11544,27 +14597,35 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.c ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +#include <pthread.h>
 +#include <time.h>
 +
++#include <interface/vcsm/user-vcsm.h>
++
 +#include "rpi_mailbox.h"
 +#include "rpi_qpu.h"
 +#include "rpi_shader.h"
 +#include "rpi_hevc_transform.h"
++#include "rpi_zc.h"
 +
-+#include "rpi_user_vcsm.h"
-+#ifdef GPUSERVICE
 +#pragma GCC diagnostic push
 +// Many many redundant decls in the header files
 +#pragma GCC diagnostic ignored "-Wredundant-decls"
 +#include "interface/vmcs_host/vc_vchi_gpuserv.h"
 +#pragma GCC diagnostic pop
-+#endif
 +
-+// QPU profile flags
-+#define NO_FLUSH 1
-+#define CLEAR_PROFILE 2
-+#define OUTPUT_COUNTS 4
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
 +
-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL       0
 +
++// QPU "noflush" flags
++// a mixture of flushing & profiling
++
++#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
 +
 +// On Pi2 there is no way to access the VPU L2 cache
 +// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
@@ -11621,65 +14682,223 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.c ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
 +};
 +
++// Code/constants on GPU
 +struct GPU
 +{
 +  unsigned int qpu_code[QPU_CODE_SIZE];
 +  unsigned int vpu_code[VPU_CODE_SIZE];
 +  short transMatrix2even[16*16*2];
-+  int open_count; // Number of allocated video buffers
-+  int      mb; // Mailbox handle
-+  int      vc; // Address in GPU memory
-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
 +};
 +
++#define CFE_ENTS_PER_A 8
++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
++// allow 128
++#define CFE_ENT_COUNT  128
++#define CFE_A_COUNT    (CFE_ENT_COUNT / CFE_ENTS_PER_A)
++
++struct rpi_cache_flush_env_s {
++    unsigned int n;
++    struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
++};
++
++#define WAIT_COUNT_MAX 16
++
++typedef struct trace_time_one_s
++{
++  int count;
++  int64_t start[WAIT_COUNT_MAX];
++  int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
++
++typedef struct trace_time_wait_s
++{
++  unsigned int jcount;
++  int64_t start0;
++  int64_t last_update;
++  trace_time_one_t active;
++  trace_time_one_t wait;
++} trace_time_wait_t;
++
++typedef struct vq_wait_s
++{
++  sem_t sem;
++  unsigned int cost;
++  struct vq_wait_s * next;
++} vq_wait_t;
++
++#define VQ_WAIT_POOL_SIZE 16
++typedef struct vq_wait_pool_s
++{
++  vq_wait_t * head;
++  vq_wait_t pool[VQ_WAIT_POOL_SIZE];
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++typedef struct gpu_env_s
++{
++  int open_count;
++  int init_count;
++  int mb;
++  unsigned int current_load;
++  GPU_MEM_PTR_T code_gm_ptr;
++  vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
++
 +// Stop more than one thread trying to allocate memory or use the processing resources at once
 +static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static volatile struct GPU* gpu = NULL;
-+static GPU_MEM_PTR_T gpu_mem_ptr;
++static gpu_env_t * gpu = NULL;
 +
-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
-+static unsigned int Microseconds(void) {
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++static int64_t ns_time(void)
++{
 +    struct timespec ts;
-+    unsigned int x;
-+    static unsigned int base = 0;
-+    clock_gettime(CLOCK_REALTIME, &ts);
-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
-+    if (base==0) base=x;
-+    return x-base;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
++}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
++
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{ // Print one line, tagged with prefix: idle time followed by the totals of levels 0..3
++  // Update totals for levels that are still pending
++  for (int i = 0; i < tto->count; ++i) {
++    tto->total[i] += now - tto->start[i];
++    tto->start[i] = now;  // restart the open interval so the same time is not added again later
++  }
++
++  printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++         prefix,
++         T_ARG(now - start0 - tto->total[0]),  // idle = time since start0 not covered by level 0
++         T_ARG(tto->total[0]),
++         T_ARG(tto->total[1]),
++         T_ARG(tto->total[2]),
++         T_ARG(tto->total[3]));  // only the first four of the WAIT_COUNT_MAX levels are reported
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{ // Open a new nesting level and record the time at which it started
++  av_assert0(tto->count < WAIT_COUNT_MAX);  // start[] has WAIT_COUNT_MAX slots
++  tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{ // Close the most recently opened level and add its duration to that level's total
++  const int n = --tto->count;
++  av_assert0(n >= 0);  // an end without a matching tto_start would drive the count negative
++  tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{ // Print the job count and elapsed time since start0, then the active and wait breakdowns
++  printf("Jobs:%u, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));  // jcount is unsigned, hence %u
++  tto_print(&ttw->active, now, ttw->start0, "Active");
++  tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
++}
++
 +#endif
 +
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
++// GPU memory alloc fns (internal)
++
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++  p->numbytes = numbytes;  // fills every field of *p; the caller supplies the storage
++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++  av_assert0(p->vcsm_handle);
++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++  av_assert0(p->vc_handle);
++  p->arm = vcsm_lock(p->vcsm_handle);
++  av_assert0(p->arm);
++  p->vc = mbox_mem_lock(mb, p->vc_handle);  // mb is the mailbox descriptor used for the lock request
++  av_assert0(p->vc);
++  return 0;  // always 0: a zero result from any step above trips av_assert0 instead of returning
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++  p->numbytes = numbytes;  // same sequence as gpu_malloc_cached_internal, but with VCSM_CACHE_TYPE_NONE
++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++  av_assert0(p->vcsm_handle);
++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++  av_assert0(p->vc_handle);
++  p->arm = vcsm_lock(p->vcsm_handle);
++  av_assert0(p->arm);
++  p->vc = mbox_mem_lock(mb, p->vc_handle);  // mb is the mailbox descriptor used for the lock request
++  av_assert0(p->vc);
++  return 0;  // always 0: a zero result from any step above trips av_assert0 instead of returning
++}
++
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++  mbox_mem_unlock(mb, p->vc_handle);  // releases in the reverse order of the gpu_malloc_*_internal steps
++  vcsm_unlock_ptr(p->arm);
++  vcsm_free(p->vcsm_handle);
++  memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{ // Tear down the global GPU environment and free it
++  gpu_env_t * const ge = gpu;
++
++  // We have to hope that everything has terminated...
++  gpu = NULL;
++
++  vc_gpuserv_deinit();
++
++  gpu_free_internal(ge->mb, &ge->code_gm_ptr);  // needs ge->mb, so must precede mbox_close
++
++  vcsm_exit();
++
++  mbox_close(ge->mb);
++
++  vq_wait_pool_deinit(&ge->wait_pool);
++
++  free(ge);
++}
++
 +
 +// Connect to QPU, returns 0 on success.
-+static int gpu_init(volatile struct GPU **gpu) {
-+  int mb = mbox_open();
-+  int vc;
++static int gpu_init(gpu_env_t ** const gpu) {
 +  volatile struct GPU* ptr;
-+	if (mb < 0)
-+		return -1;
-+#ifndef RPI_ASYNC
-+	if (qpu_enable(mb, 1)) return -2;
-+#endif
-+  vcsm_init();
-+  vc_gpuserv_init();
-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+  memset((void*)ptr, 0, sizeof *ptr);
-+  vc = gpu_mem_ptr.vc;
++  gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++  *gpu = NULL;
++
++  if (ge == NULL)
++    return -1;
 +
-+  ptr->mb = mb;
-+  ptr->vc = vc;
++  if ((ge->mb = mbox_open()) < 0)
++    return -1;
 +
-+  printf("GPU allocated at 0x%x\n",vc);
++  vq_wait_pool_init(&ge->wait_pool);
 +
-+  *gpu = ptr;
++  vcsm_init();
++
++  gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
++  ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++  // Zero everything so we have zeros between the code bits
++  memset((void *)ptr, 0, sizeof(*ptr));
 +
 +  // Now copy over the QPU code into GPU memory
 +  {
-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
++    int num_bytes = (char *)mc_end - (char *)rpi_shader;
 +    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
 +    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
 +  }
@@ -11692,105 +14911,55 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.c ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +  // And the transform coefficients
 +  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
 +
-+#ifdef RPI_ASYNC
-+  {
-+    int err;
-+    vpu_async_tail = 0;
-+    vpu_async_head = 0;
-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-+    //printf("Created thread\n");
-+    if (err) {
-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
-+        return -4;
-+    }
++  *gpu = ge;
++  return 0;
++}
 +
-+    {
-+      struct sched_param param = {0};
-+      int policy = 0;
 +
-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
-+      {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+      }
-+      else
-+      {
-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
-+            policy,
-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+            param.sched_priority);
 +
-+        policy = SCHED_FIFO;
-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
++static void gpu_unlock(void) {
++  pthread_mutex_unlock(&gpu_mutex);
++}
 +
-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
-+            policy,
-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+            param.sched_priority);
++// Take the GPU mutex and return the GPU environment, which must already be initialised (asserted).
++static gpu_env_t * gpu_lock(void) {
++  pthread_mutex_lock(&gpu_mutex);
 +
-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
-+        {
-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
-+        }
-+        else
-+        {
-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
-+          {
-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+          }
-+          else
-+          {
-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
-+                policy,
-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+                param.sched_priority);
-+          }
-+        }
-+      }
++  av_assert0(gpu != NULL);
++  return gpu;
++}
 +
-+    }
++static gpu_env_t * gpu_lock_ref(void)
++{
++  pthread_mutex_lock(&gpu_mutex);
 +
++  if (gpu == NULL) {
++    int rv = gpu_init(&gpu);
++    if (rv != 0) {
++      gpu_unlock();
++      return NULL;
++    }
 +  }
-+#endif
 +
-+  return 0;
++  ++gpu->open_count;
++  return gpu;
 +}
 +
-+// Returns 1 if the gpu is currently idle
-+static int gpu_idle(void)
++static void gpu_unlock_unref(gpu_env_t * const ge)
 +{
-+  int ret = pthread_mutex_trylock(&gpu_mutex);
-+  if (ret==0) {
-+    pthread_mutex_unlock(&gpu_mutex);
-+    return 1;
-+  }
-+  return 0;
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static void gpu_lock(void) {
-+  pthread_mutex_lock(&gpu_mutex);
++  if (--ge->open_count == 0)
++    gpu_term();
 +
-+  if (gpu==NULL) {
-+    gpu_init(&gpu);
-+  }
++  gpu_unlock();
 +}
 +
-+static void gpu_unlock(void) {
-+  pthread_mutex_unlock(&gpu_mutex);
++static inline gpu_env_t * gpu_ptr(void)
++{
++  av_assert0(gpu != NULL);
++  return gpu;
 +}
 +
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+  p->numbytes = numbytes;
-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+  av_assert0(p->vcsm_handle);
-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+  av_assert0(p->vc_handle);
-+  p->arm = vcsm_lock(p->vcsm_handle);
-+  av_assert0(p->arm);
-+  p->vc = mem_lock(mb, p->vc_handle);
-+  av_assert0(p->vc);
-+  return 0;
-+}
++// Public gpu fns
 +
 +// Allocate memory on GPU
 +// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
@@ -11800,730 +14969,530 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.c ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
 +{
 +  int r;
-+  gpu_lock();
-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
-+  gpu->open_count++;
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
++  r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
 +  gpu_unlock();
 +  return r;
 +}
 +
-+int gpu_get_mailbox(void)
-+{
-+  av_assert0(gpu);
-+  return gpu->mb;
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int) p->arm;
-+    iocache.s[0].size  = p->numbytes;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    void *tmp = vcsm_lock(p->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+#endif
-+}
-+
-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    iocache.s[0].handle = p0->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int) p0->arm;
-+    iocache.s[0].size  = p0->numbytes;
-+    iocache.s[1].handle = p1->vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int) p1->arm;
-+    iocache.s[1].size  = p1->numbytes;
-+    iocache.s[2].handle = p2->vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int) p2->arm;
-+    iocache.s[2].size  = p2->numbytes;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    void *tmp;
-+    tmp = vcsm_lock(p0->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+    tmp = vcsm_lock(p1->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+    tmp = vcsm_lock(p2->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+#endif
-+}
-+
-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+  p->numbytes = numbytes;
-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+  av_assert0(p->vcsm_handle);
-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+  av_assert0(p->vc_handle);
-+  p->arm = vcsm_lock(p->vcsm_handle);
-+  av_assert0(p->arm);
-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+  av_assert0(p->vc);
-+  return 0;
-+}
-+
 +// This allocates data that will be
 +//    Cached in ARM L2
 +//    Uncached in VPU L2
 +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
 +{
 +  int r;
-+  gpu_lock();
-+  r = gpu_malloc_cached_internal(numbytes, p);
-+  gpu->open_count++;
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
++  r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
 +  gpu_unlock();
 +  return r;
 +}
 +
-+static void gpu_term(void)
-+{
-+  int mb;
-+
-+  if (gpu==NULL)
-+    return;
-+  mb = gpu->mb;
-+
-+  // ??? Tear down anything needed for gpuexecute
-+
-+  qpu_enable(mb, 0);
-+  gpu_free_internal(&gpu_mem_ptr);
-+
-+  vc_gpuserv_deinit();
-+  vcsm_exit();
-+
-+  mbox_close(mb);
-+  gpu = NULL;
++void gpu_free(GPU_MEM_PTR_T * const p) {
++  gpu_env_t * const ge = gpu_lock();
++  gpu_free_internal(ge->mb, p);
++  gpu_unlock_unref(ge);
 +}
 +
-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
-+  int mb = gpu->mb;
-+  mem_unlock(mb,p->vc_handle);
-+  vcsm_unlock_ptr(p->arm);
-+  vcsm_free(p->vcsm_handle);
++unsigned int vpu_get_fn(void) {
++  // Make sure that the gpu is initialized
++  av_assert0(gpu != NULL);
++  return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code);
 +}
 +
-+void gpu_free(GPU_MEM_PTR_T *p) {
-+  gpu_lock();
-+
-+  gpu_free_internal(p);
-+
-+  gpu->open_count--;
-+  if (gpu->open_count==0) {
-+      printf("Closing GPU\n");
-+      gpu_term();
-+      gpu = NULL;
-+  }
-+  gpu_unlock();
++unsigned int vpu_get_constants(void) {
++  av_assert0(gpu != NULL);
++  return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
 +}
 +
-+unsigned int vpu_get_fn(void) {
-+  // Make sure that the gpu is initialized
-+  if (gpu==NULL) {
-+    printf("Preparing gpu\n");
-+    gpu_lock();
-+    gpu_unlock();
-+  }
-+  return gpu->vc + offsetof(struct GPU,vpu_code);
++int gpu_get_mailbox(void)
++{
++  av_assert0(gpu);
++  return gpu->mb;
 +}
 +
-+unsigned int vpu_get_constants(void) {
-+  if (gpu==NULL) {
-+    gpu_lock();
-+    gpu_unlock();
-+  }
-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
++void gpu_ref(void)
++{
++  gpu_lock_ref();
++  gpu_unlock();
 +}
 +
-+#ifdef GPUSERVICE
-+static void callback(void *cookie)
++void gpu_unref(void)
 +{
-+  sem_post((sem_t *)cookie);
++  gpu_env_t * const ge = gpu_lock();
++  gpu_unlock_unref(ge);
 +}
-+#endif
 +
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
 +
-+static volatile uint32_t post_done = 0;
-+static volatile uint32_t post_qed = 0;
 +
-+static void post_code2_cb(void * v)
++rpi_cache_flush_env_t * rpi_cache_flush_init()
 +{
-+  uint32_t n = (uint32_t)v;
-+  if ((int32_t)(n - post_done) > 0) {
-+    post_done = n;
-+  }
++    rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t));
++    if (rfe == NULL)
++        return NULL;
++
++    rfe->n = 0;
++    return rfe;
 +}
 +
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
++{
++    if (rfe != NULL)
++        free(rfe);
++}
 +
-+// Post a command to the queue
-+// Returns an id which we can use to wait for completion
-+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
 +{
-+  struct gpu_job_s j[1] = {
++    int rc = 0;
++    unsigned int na;
++    unsigned int nr;
++
++    // Clear any remaining entries in the final block
++    if ((nr = rfe->n % CFE_ENTS_PER_A) != 0)
++        memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0]));
++
++    for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na)
 +    {
-+      .command = EXECUTE_VPU,
-+      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
-+      .callback.func = post_code2_cb
++        if (vcsm_clean_invalid(rfe->a + na) != 0)
++            rc = -1;
 +    }
-+  };
-+  uint32_t id;
 +
-+  j[0].callback.cookie = (void *)(id = ++post_qed);
++    free(rfe);
 +
-+  av_assert0(vc_gpuserv_execute_code(1, j) == 0);
++    if (rc == 0)
++        return 0;
 +
-+  return id;
++    av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno);
++    return rc;
 +}
 +
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+    int qpu0_n, const uint32_t * qpu0_mail,
-+    int qpu1_n, const uint32_t * qpu1_mail)
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
 +{
-+#if 1
-+  sem_t sync0;
-+  struct gpu_job_s j[4];
-+
-+  sem_init(&sync0, 0, 0);
-+
-+  j[0].command = EXECUTE_VPU;
-+  j[0].u.v.q[0] = vpu_code;
-+  j[0].u.v.q[1] = r0;
-+  j[0].u.v.q[2] = r1;
-+  j[0].u.v.q[3] = r2;
-+  j[0].u.v.q[4] = r3;
-+  j[0].u.v.q[5] = r4;
-+  j[0].u.v.q[6] = r5;
-+  j[0].callback.func = 0;
-+  j[0].callback.cookie = NULL;
-+
-+  j[1].command = EXECUTE_QPU;
-+  j[1].u.q.jobs = qpu1_n;
-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
-+  j[1].u.q.timeout = 5000;
-+  j[1].callback.func = 0;
-+  j[1].callback.cookie = NULL;
-+
-+  j[2].command = EXECUTE_QPU;
-+  j[2].u.q.jobs = qpu0_n;
-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[2].u.q.noflush = 1;
-+  j[2].u.q.timeout = 5000;
-+  j[2].callback.func = 0;
-+  j[2].callback.cookie = NULL;
-+
-+  j[3].command = EXECUTE_SYNC;
-+  j[3].u.s.mask = 3;
-+  j[3].callback.func = callback;
-+  j[3].callback.cookie = (void *)&sync0;
-+
-+  av_assert0(vc_gpuserv_execute_code(4, j) == 0);
-+
-+  sem_wait(&sync0);
-+#else
++    // Deal with empty pointer trivially
++    if (gm == NULL || gm->numbytes == 0)
++        return;
 +
-+  sem_t sync0, sync2;
-+  struct gpu_job_s j[3];
-+
-+  sem_init(&sync0, 0, 0);
-+  sem_init(&sync2, 0, 0);
-+
-+  j[0].command = EXECUTE_VPU;
-+  j[0].u.v.q[0] = vpu_code;
-+  j[0].u.v.q[1] = r0;
-+  j[0].u.v.q[2] = r1;
-+  j[0].u.v.q[3] = r2;
-+  j[0].u.v.q[4] = r3;
-+  j[0].u.v.q[5] = r4;
-+  j[0].u.v.q[6] = r5;
-+  j[0].callback.func = callback;
-+  j[0].callback.cookie = (void *)&sync0;
-+
-+  j[1].command = EXECUTE_QPU;
-+  j[1].u.q.jobs = qpu1_n;
-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
-+  j[1].u.q.timeout = 5000;
-+  j[1].callback.func = 0;
-+  j[1].callback.cookie = NULL;
-+
-+  j[2].command = EXECUTE_QPU;
-+  j[2].u.q.jobs = qpu0_n;
-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[2].u.q.noflush = 1;
-+  j[2].u.q.timeout = 5000;
-+  j[2].callback.func = callback;
-+  j[2].callback.cookie = (void *)&sync2;
-+
-+  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
-+
-+  sem_wait(&sync0);
-+  sem_wait(&sync2);
-+#endif
++    {
++        struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A);
++        const unsigned int n = rfe->n % CFE_ENTS_PER_A;
 +
-+  return 0;
-+}
++        av_assert0(rfe->n < CFE_ENT_COUNT);
 +
++        a->s[n].cmd = mode;
++        a->s[n].handle = gm->vcsm_handle;
++        a->s[n].addr = (unsigned int)gm->arm;
++        a->s[n].size = gm->numbytes;
++        ++rfe->n;
++    }
++}
 +
-+// Wait for completion of the given command
-+void vpu_wait(int id)
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++  const unsigned int offset, const unsigned int size)
 +{
-+  if (id == 0) {
-+#if 0
-+    sem_t sync0;
-+    struct gpu_job_s j[1] =
++    // Deal with empty pointer trivially
++    if (gm == NULL || size == 0)
++        return;
++
++//    printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes);
++
++    av_assert0(offset <= gm->numbytes);
++    av_assert0(size <= gm->numbytes);
++    av_assert0(offset + size <= gm->numbytes);
++
 +    {
-+      {
-+        .command = EXECUTE_SYNC,
-+        .u.s.mask = 3,
-+        .callback.func = callback,
-+        .callback.cookie = (void *)&sync0
-+      }
-+    };
++        struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A);
++        const unsigned int n = rfe->n % CFE_ENTS_PER_A;
 +
-+    sem_init(&sync0, 0, 0);
++        av_assert0(rfe->n < CFE_ENT_COUNT);
 +
-+    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
++        a->s[n].cmd = mode;
++        a->s[n].handle = gm->vcsm_handle;
++        a->s[n].addr = (unsigned int)gm->arm + offset;
++        a->s[n].size = size;
++        ++rfe->n;
++    }
++}
 +
-+    sem_wait(&sync0);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
 +#endif
++  if (gpu_is_buf1(frame)) {
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
 +  }
-+  else {
-+    while ((int32_t)(post_done - (uint32_t)id) < 0) {
-+      usleep(1000);
-+    }
++  else
++  {
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
 +  }
 +}
 +
-+
-+unsigned int qpu_get_fn(int num) {
-+    // Make sure that the gpu is initialized
-+    unsigned int *fn;
-+    if (gpu==NULL) {
-+      printf("Preparing gpu\n");
-+      gpu_lock();
-+      gpu_unlock();
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{
++  const unsigned int y_offset = frame->linesize[0] * start_line;
++  const unsigned int y_size = frame->linesize[0] * n;
++  // Round UV up/down to get everything
++  const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++  const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift);
++  const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset;
++
++  // As all values are unsigned, these asserts also reject negative inputs
++  // Test individually as well as added to reject overflow
++  av_assert0(start_line <= (unsigned int)frame->height);
++  av_assert0(n <= (unsigned int)frame->height);
++  av_assert0(start_line + n <= (unsigned int)frame->height);
++
++  if (!gpu_is_buf1(frame))
++  {
++    if (do_luma) {
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
 +    }
-+    switch(num) {
-+    case QPU_MC_SETUP:
-+      fn = mc_setup;
-+      break;
-+    case QPU_MC_FILTER:
-+      fn = mc_filter;
-+      break;
-+    case QPU_MC_EXIT:
-+      fn = mc_exit;
-+      break;
-+    case QPU_MC_INTERRUPT_EXIT12:
-+      fn = mc_interrupt_exit12;
-+      break;
-+    case QPU_MC_FILTER_B:
-+      fn = mc_filter_b;
-+      break;
-+    //case QPU_MC_FILTER_HONLY:
-+    //  fn = mc_filter_honly;
-+    //  break;
-+    case QPU_MC_SETUP_UV:
-+      fn = mc_setup_uv;
-+      break;
-+    case QPU_MC_FILTER_UV:
-+      fn = mc_filter_uv;
-+      break;
-+    case QPU_MC_FILTER_UV_B0:
-+      fn = mc_filter_uv_b0;
-+      break;
-+    case QPU_MC_FILTER_UV_B:
-+      fn = mc_filter_uv_b;
-+      break;
-+    case QPU_MC_INTERRUPT_EXIT8:
-+      fn = mc_interrupt_exit8;
-+      break;
-+    case QPU_MC_END:
-+      fn = mc_end;
-+      break;
-+    default:
-+      printf("Unknown function\n");
-+      exit(-1);
++    if (do_chroma) {
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++    }
++  }
++  else if (!rpi_sliced_frame(frame))
++  {
++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++    if (do_luma) {
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++    }
++    if (do_chroma) {
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++    }
++  }
++  else
++  {
++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++//    printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' ');
++    for (int x = 0; x < frame->width; x += frame->linesize[0]) {
++      if (do_luma) {
++        rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size);
++      }
++      if (do_chroma) {
++        rpi_cache_flush_add_gm_range(rfe, gm, mode,
++                                     (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size);
++      }
 +    }
-+    return gpu->vc + 4*(int)(fn-rpi_shader);
-+    //return code[num] + gpu->vc;
++  }
 +}
 +
-+#if 0
-+typedef unsigned int uint32_t;
-+
-+typedef struct mvs_s {
-+    GPU_MEM_PTR_T unif_mvs_ptr;
-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
++// Call this to clean and invalidate a region of memory
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
++{
++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++  rpi_cache_flush_finish(rfe);
++}
 +
-+    // _base pointers are to the start of the row
-+    uint32_t *mvs_base[8];
-+    // these pointers are to the next free space
-+    uint32_t *u_mvs[8];
 +
-+} HEVCContext;
++// ----------------------------------------------------------------------------
 +
-+#define RPI_CHROMA_COMMAND_WORDS 12
 +
-+static void rpi_inter_clear(HEVCContext *s)
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
 +{
-+    int i;
-+    for(i=0;i<8;i++) {
-+        s->u_mvs[i] = s->mvs_base[i];
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 128;  // w
-+        *s->u_mvs[i]++ = 128;  // h
-+        *s->u_mvs[i]++ = 128;  // stride u
-+        *s->u_mvs[i]++ = 128;  // stride v
-+        s->u_mvs[i] += 3;  // Padding words
-+    }
++  unsigned int i;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_init(&wp->pool[i].sem, 0, 0);
++    wp->pool[i].next = wp->pool + i + 1;
++  }
++  wp->head = wp->pool + 0;
++  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
 +}
 +
-+static void rpi_execute_inter_qpu(HEVCContext *s)
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
 +{
-+    int k;
-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
++  unsigned int i;
++  wp->head = NULL;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_destroy(&wp->pool[i].sem);
++    wp->pool[i].next = NULL;
++  }
++}
 +
-+    for(k=0;k<8;k++) {
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
-+    }
 +
-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+      );
++// If sem_init actually takes time then maybe we want a pool...
++static vq_wait_t * vq_wait_new(const unsigned int cost)
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  vq_wait_t * const wait = ge->wait_pool.head;
++  ge->wait_pool.head = wait->next;
++  ge->current_load += cost;
++  wait->cost = cost;
++  wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  tto_start(&ge->ttw.active, ns_time());
++#endif
++
++  gpu_unlock();
++  return wait;
 +}
 +
-+void rpi_test_qpu(void)
++static void vq_wait_delete(vq_wait_t * const wait)
 +{
-+    HEVCContext mvs;
-+    HEVCContext *s = &mvs;
-+    int i;
-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-+    uint32_t *p;
-+    printf("Allocate memory\n");
-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
-+
-+    // Set up initial locations for uniform streams
-+    p = s->unif_mvs;
-+    for(i = 0; i < 8; i++) {
-+        s->mvs_base[i] = p;
-+        p += uv_commands_per_qpu;
++  gpu_env_t * const ge = gpu_lock();
++  wait->next = ge->wait_pool.head;
++  ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    trace_time_wait_t * const ttw = &ge->ttw;
++    const int64_t now = ns_time();
++    ++ttw->jcount;
++    tto_end(&ttw->wait, now);
++
++    if (ttw->start0 == 0)
++    {
++      ttw->start0 = ttw->active.start[0];
++      ttw->last_update = ttw->start0;
 +    }
-+    // Now run a simple program that should just quit immediately after a single texture fetch
-+    rpi_inter_clear(s);
-+    for(i=0;i<4;i++) {
-+      printf("Launch QPUs\n");
-+      rpi_execute_inter_qpu(s);
-+      printf("Done\n");
++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++    {
++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++      ttw_print(ttw, now);
 +    }
-+    printf("Free memory\n");
-+    gpu_free(&s->unif_mvs_ptr);
-+    return;
-+}
++  }
 +#endif
++  gpu_unlock_unref(ge);
++}
 +
-+#if 0
-+
-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
-+
-+static uint8_t av_clip_uint8(int32_t a)
++static void vq_wait_wait(vq_wait_t * const wait)
 +{
-+    if (a&(~255)) return (-a)>>31;
-+    else          return a;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++      const int64_t now = ns_time();
++      gpu_env_t * const ge = gpu_lock();
++      tto_start(&ge->ttw.wait, now);
++      gpu_unlock();
++  }
++#endif
++
++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++    /* loop */;
 +}
 +
-+static int32_t filter8(const uint8_t *data, int pitch)
++static void vq_wait_post(vq_wait_t * const wait)
 +{
-+   int32_t vsum = 0;
-+   int x, y;
++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
++  if (wait->cost != 0)
++#endif
++  {
++    gpu_env_t *const ge = gpu_lock();
++    ge->current_load -= wait->cost;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    tto_end(&ge->ttw.active, ns_time());
++#endif
++    gpu_unlock();
++  }
 +
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
++  sem_post(&wait->sem);
++}
 +
-+      for (x = 0; x < 8; x++)
-+         hsum += hcoeffs[x]*data[x + y * pitch];
 +
-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
-+   }
 +
-+   return av_clip_uint8( (vsum + 64) >> 7);
-+}
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU  1
++#define VPU_QPU_MASK_VPU  2
 +
-+// Note regression changes coefficients so is not thread safe
-+//#define REGRESSION
-+#ifdef REGRESSION
-+#define CMAX 100
-+#else
-+#define CMAX 2
-+#endif
-+#define YMAX 16
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++  unsigned int n;
++  unsigned int mask;
++  unsigned int cost;
++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
 +
-+int rpi_test_shader(void)
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
 +{
-+   int i, c;
++  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++  return vqj;
++}
 +
-+   uint32_t *unifs;
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
++{
++  memset(vqj, 0, sizeof(*vqj));
++  free(vqj);
++}
 +
-+   uint8_t *in_buffer;
-+   uint8_t *out_buffer[2];
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++  struct gpu_job_s * const j = vqj->j + vqj->n++;
++  av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
++  return j;
++}
 +
-+   GPU_MEM_PTR_T unifs_ptr;
-+   GPU_MEM_PTR_T in_buffer_ptr;
-+   GPU_MEM_PTR_T out_buffer_ptr[2];
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++  if (vpu_code != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_VPU;
++
++    j->command = EXECUTE_VPU;
++    j->u.v.q[0] = vpu_code;
++    j->u.v.q[1] = r0;
++    j->u.v.q[2] = r1;
++    j->u.v.q[3] = r2;
++    j->u.v.q[4] = r3;
++    j->u.v.q[5] = r4;
++    j->u.v.q[6] = r5;
++  }
++}
 +
-+   // Addresses in GPU memory of filter programs
-+   uint32_t mc_setup = 0;
-+   uint32_t mc_filter = 0;
-+   uint32_t mc_exit = 0;
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
++{
++  if (n != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_QPU;
++    vqj->cost += cost;
++
++    j->command = EXECUTE_QPU;
++    j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++#endif
++    j->u.q.timeout = 5000;
++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  }
++}
 +
-+   int pitch = 0x500;
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
++{
++  vq_wait_post(v);
++}
 +
-+   if (gpu==NULL) {
-+      gpu_lock();
-+      gpu_unlock();
-+   }
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
++{
++  vq_wait_t * wait;
 +
-+   printf("This needs to change to reflect new assembler\n");
-+   // Use table to compute locations of program start points
-+   mc_setup = code[0] + gpu->vc;
-+   mc_filter = code[1] + gpu->vc;
-+   mc_exit = code[2] + gpu->vc;
++  if (vqj->mask == 0) {
++    *wait_h = NULL;
++    return;
++  }
 +
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+      return -2;
-+   }
-+   unifs = (uint32_t*)unifs_ptr.arm;
++  // We are going to want a sync object
++  wait = vq_wait_new(vqj->cost);
 +
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
-+      return -3;
-+   }
-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
++  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++  // If we only posted one thing or only QPU jobs
++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++  {
++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++    av_assert0(j->callback.func == 0);
 +
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
-+      return -4;
-+   }
-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
-+
-+   for (c = 0; c < CMAX; c++) {
-+      int xo[] = {rand()&31, rand()&31};
-+
-+#ifdef REGRESSION
-+      for (i = 0; i < 8; i++) {
-+         hcoeffs[i] = (int8_t)rand();
-+         vcoeffs[i] = (int8_t)rand();
-+         if (hcoeffs[i]==-128)
-+           hcoeffs[i]++;
-+         if (vcoeffs[i]==-128)
-+           vcoeffs[i]++;
-+      }
-+#endif
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
++  else
++  {
++    struct gpu_job_s *const j = new_job(vqj);
 +
-+      for (i = 0; i < 64*23; i++) {
-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
-+         in_buffer[i] = rand();
-+      }
++    j->command = EXECUTE_SYNC;
++    j->u.s.mask = vqj->mask;
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
 +
-+      // Clear output array
-+      {
-+        int b;
-+        for(b=0;b<2;b++) {
-+          for(i=0;i<16*16;i++) {
-+            out_buffer[b][i] = 3;
-+          }
-+        }
-+      }
++  vqj->cost = 0;
++  vqj->mask = 0;
++  *wait_h = wait;
++}
 +
-+      unifs[0] = mc_filter;
-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
-+      unifs[2] = 64; // src pitch
-+      unifs[3] = pitch; // dst pitch
-+      unifs[4] = 0; // Padding
-+      unifs[5] = 0;
-+      unifs[6] = 0;
-+      unifs[7 ] = mc_filter;
-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+      unifs[13] = out_buffer_ptr[0].vc;
-+      unifs[14] = mc_exit;
-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+      unifs[20] = out_buffer_ptr[1].vc;
-+
-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
-+
-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
-+
-+      if (1)
-+      {
-+         int x, y, b;
-+         int bad = 0;
-+
-+         for (b=0; b<2; ++b)
-+            for (y=0; y<YMAX; ++y)
-+               for (x=0; x<16; ++x) {
-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
-+
-+                  if (out_buffer[b][x+y*pitch] != ref) {
-+                      bad = 1;
-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
-+                  }
-+#ifndef REGRESSION
-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
-+#endif
-+               }
-+          if (bad)
-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-+          else
-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-+      }
-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
-+   }
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{
++  return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
 +
-+   gpu_free(&out_buffer_ptr[0]);
-+   gpu_free(&out_buffer_ptr[1]);
-+   gpu_free(&in_buffer_ptr);
-+   gpu_free(&unifs_ptr);
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++  int rv;
++  rv = vpu_qpu_job_start(vqj);
++  vpu_qpu_job_delete(vqj);
++  return rv;
++}
 +
-+   return 0;
++unsigned int vpu_qpu_current_load(void)
++{
++  return gpu_ptr()->current_load;
 +}
 +
-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
 +{
-+  int x,y;
-+  for (y=0; y<16; ++y) {
-+    for (x=0; x<16; ++x) {
-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
++  if (wait_h != NULL)
++  {
++    vq_wait_t * const wait = *wait_h;
++    if (wait != NULL) {
++      *wait_h = NULL;
++      vq_wait_wait(wait);
++      vq_wait_delete(wait);
 +    }
 +  }
 +}
 +
-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
++int vpu_qpu_init()
 +{
-+   uint32_t *unifs;
-+
-+   GPU_MEM_PTR_T unifs_ptr;
-+   //uint8_t *out_buffer;
-+   //GPU_MEM_PTR_T out_buffer_ptr;
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
 +
-+   // Addresses in GPU memory of filter programs
-+   uint32_t mc_setup = 0;
-+   uint32_t mc_filter = 0;
-+   uint32_t mc_exit = 0;
-+   //int x,y;
-+
-+   if (gpu==NULL) {
-+      gpu_lock();
-+      gpu_unlock();
-+   }
++  if (ge->init_count++ == 0)
++  {
++    vc_gpuserv_init();
++  }
 +
-+   // Use table to compute locations of program start points
-+   mc_setup = code[0] + gpu->vc;
-+   mc_filter = code[1] + gpu->vc;
-+   mc_exit = code[2] + gpu->vc;
++  gpu_unlock();
++  return 0;
++}
 +
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+      return;
-+   }
-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
++void vpu_qpu_term()
++{
++  gpu_env_t * const ge = gpu_lock();
 +
-+   /*for (y=0; y<16; ++y) {
-+      for (x=0; x<16; ++x) {
-+         out_buffer[x+y*dst_pitch] = 7;
-+      }
-+    }*/
++  if (--ge->init_count == 0) {
++    vc_gpuserv_deinit();
 +
-+   unifs = (uint32_t*)unifs_ptr.arm;
-+
-+    unifs[0] = mc_filter;
-+    unifs[1] = (int)in_buffer_vc;
-+    unifs[2] = src_pitch; // src pitch
-+    unifs[3] = dst_pitch; // dst pitch
-+    unifs[4] = 0; // Padding
-+    unifs[5] = 0;
-+    unifs[6] = 0;
-+    unifs[7 ] = mc_exit;
-+    unifs[8 ] = (int)in_buffer_vc;
-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+    unifs[13] = (int)dst_vc;
-+    //unifs[13] = (int)out_buffer_ptr.vc;
-+
-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
-+
-+    /*for (y=0; y<16; ++y) {
-+      for (x=0; x<16; ++x) {
-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
-+      }
-+    }*/
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    ttw_print(&ge->ttw, ns_time());
++#endif
++  }
 +
-+    gpu_free(&unifs_ptr);
-+    //gpu_free(&out_buffer_ptr);
++  gpu_unlock_unref(ge);
 +}
 +
-+
-+
-+#endif
++uint32_t qpu_fn(const int * const mc_fn)
++{
++  return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
++}
 +
 +#endif // RPI
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.h ffmpeg-3.2.4.patch/libavcodec/rpi_qpu.h
 --- ffmpeg-3.2.4/libavcodec/rpi_qpu.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_qpu.h	2017-03-22 22:42:34.848798575 +0100
-@@ -0,0 +1,176 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_qpu.h	2017-05-28 20:42:45.752088716 +0200
+@@ -0,0 +1,200 @@
 +#ifndef RPI_QPU_H
 +#define RPI_QPU_H
 +
-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
-+#define RPI_FAST_CACHEFLUSH
-+
 +#define RPI_ONE_BUF 1
 +
 +typedef struct gpu_mem_ptr_s {
@@ -12537,9 +15506,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.h ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +// General GPU functions
 +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
 +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
 +
 +#include "libavutil/frame.h"
 +#if !RPI_ONE_BUF
@@ -12582,29 +15549,31 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.h ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +    return av_buffer_get_opaque(frame->buf[0]);
 +}
 +
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
 +{
 +    return av_buffer_pool_opaque(frame->buf[n]);
 +}
 +
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++    return gm->vc + (frame->data[n] - gm->arm);
++}
++
 +
 +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
++    return get_vc_address3(frame, 0);
 +}
 +
 +static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+    return gpu_is_buf1(frame) ?
-+        gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
-+        gpu_buf3_gmem(frame, 1)->vc;
++    return get_vc_address3(frame, 1);
 +}
 +
 +static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+    return gpu_is_buf1(frame) ?
-+        gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
-+        gpu_buf3_gmem(frame, 2)->vc;
++    return get_vc_address3(frame, 2);
 +}
 +
-+
++#if 0
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
 +    if (gpu_is_buf1(frame))
 +    {
@@ -12641,48 +15610,74 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.h ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +    else
 +        return *gpu_buf3_gmem(frame, 2);
 +}
-+
 +#endif
++#endif
++
++// Cache flush stuff
++
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum
++{
++    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
++    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
++    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3
++} rpi_cache_flush_mode_t;
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++  const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
 +
 +
 +// QPU specific functions
-+extern void rpi_test_qpu(void);
-+
-+enum {
-+  QPU_MC_SETUP,
-+  QPU_MC_FILTER,
-+  QPU_MC_EXIT,
-+  QPU_MC_INTERRUPT_EXIT12,
-+  QPU_MC_FILTER_B,
-+  QPU_MC_FILTER_HONLY,
-+  QPU_MC_SETUP_UV,
-+  QPU_MC_FILTER_UV,
-+  QPU_MC_FILTER_UV_B0,
-+  QPU_MC_FILTER_UV_B,
-+  QPU_MC_INTERRUPT_EXIT8,
-+  QPU_MC_END
-+  };
-+extern unsigned int qpu_get_fn(int num);
-+
-+#define QPU_N_UV   8
-+#define QPU_N_Y    12
-+#define QPU_N_MAX  16
++uint32_t qpu_fn(const int * const mc_fn);
++
++#define QPU_N_GRP_UV 4
++#define QPU_N_UV     8
++#define QPU_N_GRP_Y  4  // 4 QPUs per TMU
++#define QPU_N_Y      12
 +
 +#define QPU_MAIL_EL_VALS  2
-+#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))
-+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
-+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
 +
 +// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++
 +extern unsigned int vpu_get_fn(void);
 +extern unsigned int vpu_get_constants(void);
-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+    int qpu0_n, const uint32_t * qpu0_mail,
-+    int qpu1_n, const uint32_t * qpu1_mail);
 +
-+extern void vpu_wait( int id);
++// Waits for previous post_code to complete and will null out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++unsigned int vpu_qpu_current_load(void);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
 +
 +// Simple test of shader code
 +extern int rpi_test_shader(void);
@@ -12691,12 +15686,14 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_qpu.h ffmpeg-3.2.4.patch/libavcodec/rpi_q
 +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
 +
 +extern int gpu_get_mailbox(void);
++void gpu_ref(void);
++void gpu_unref(void);
 +
 +#endif
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.c ffmpeg-3.2.4.patch/libavcodec/rpi_shader.c
 --- ffmpeg-3.2.4/libavcodec/rpi_shader.c	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader.c	2017-03-22 22:42:34.849798577 +0100
-@@ -0,0 +1,629 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader.c	2017-05-28 20:42:45.753088719 +0200
+@@ -0,0 +1,670 @@
 +#include "rpi_shader.h"
 +
 +#ifdef _MSC_VER
@@ -12720,639 +15717,779 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.c ffmpeg-3.2.4.patch/libavcodec/rp
 +__attribute__((aligned(8)))
 +#endif
 +unsigned int rpi_shader[] = {
-+// ::mc_setup_uv
-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
-+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
-+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
-+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
-+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
-+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
++// ::mc_setup_c
++/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1          ; mov -, unif
++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif
++/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1
++/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0
++/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif   ; mov ra12, 0
++/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif    ; mov ra13, 0
++/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num      ; mov ra14, 0
++/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0
++/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b
++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0         ; mov ra_y, ra0.16a
++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1
++/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4
++/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2
++/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0        ; v8subs r1, r1, r1
++/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch
++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1        ; mov r1, ra_y
++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
++/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
++/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
++/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0
++/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
++/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
++/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1   ; mul24 r0, r0, rb_pitch
++/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0
++/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif
++/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5
++/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00000160] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
++/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a
++/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b
++/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0         ; mov -, unif
++/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x  ; mov -, unif
++/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1         ; mov -, unif
++/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4
++/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2
++/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0        ; v8subs r1, r1, r1
++/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch
++/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1        ; mov r1, ra_y2
++/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
++/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
++/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
++/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
++/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif
++/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0         ; mov -, unif
++/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y  ; mov -, unif
++/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1   ; mul24 r0, r0, rb_pitch
++/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0
 +// ::mc_filter_uv
-+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
-+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
-+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
-+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
-+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
-+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
-+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
-+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
-+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
-+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
-+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
-+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
-+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
++/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif     ; mov vw_setup, rb28
++/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
++/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
++/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
++/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next
++/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x  ; mov ra1, unif
++/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4
++/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
++/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0        ; mov ra_y_next, ra2.16a
++/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1        ; mul24 r2, ra1.16b, 2
++/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1        ; mov r1, ra1.16a
++/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
++/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2    ; mov ra3, unif
++/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1       ; mov ra1, unif
++/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3       ; mov.ifnz ra1, unif
++/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0,   r0, r2      ; mov rb8,  ra3.8a
++/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0,   r0, 15      ; mov rb9,  ra3.8b
++/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27    ; mov r1, ra1.16b
++/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13      ; mov rb10, ra3.8c
++/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
++/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
 +// :uvloop
-+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
-+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
-+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
-+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
-+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
-+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0
++/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
++/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y
++/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
++/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
++/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2  ; v8min r0, r0, rb_k255
++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
++/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
++/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
++/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
++/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
++/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
++/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0        ; mul24 r0, ra14, rb10
++/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0        ; mul24 r0, ra15, rb11
++/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18  ; mul24 r1, r1, ra_k256
++/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop                   ; mul24 r1, r1, rb14
++/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13
++/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop                   ; mov r1, r1 << 8
++/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13
++/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1
++/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
 +// ::mc_filter_uv_b0
-+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
-+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
-+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
-+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
-+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
-+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
-+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
-+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
-+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
-+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
-+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
-+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
-+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
++/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif           ; mov vw_setup, rb28
++/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
++/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
++/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
++/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next
++/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x  ; mov ra1, unif
++/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4
++/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
++/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0        ; mov ra_y_next, ra2.16a
++/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1        ; mul24 r2, ra1.16b, 2
++/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1        ; mov r1, ra1.16a
++/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
++/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2    ; mov ra3, unif
++/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1
++/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3       ; mov rb8,  ra3.8a
++/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0,   r0, r2      ; mov rb9,  ra3.8b
++/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0,   r0, 15      ; mov rb10, ra3.8c
++/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
++/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
++/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif
++/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif
 +// :uvloop_b0
-+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
-+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
-+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
-+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
-+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_filter_uv_b
-+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
-+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
-+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
-+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
-+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
-+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
-+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
-+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
-+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
-+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
-+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
-+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
-+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0
++/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
++/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y
++/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
++/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2  ; v8min r0, r0, rb_k255
++/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
++/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
++/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
++/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
++/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15        ; mul24 r2, ra15, rb10
++/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
++/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
++/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
++/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0        ; mov ra7, rb6
++/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31
++/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6    ; mov rb6, ra5
++/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4          ; mov rb4, ra4
++/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5          ; mov rb5, ra6
++/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7          ; mov rb7, ra8
++/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3
++/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin
++/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3        ; mov -, unif
++/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16     ; mov ra_link, unif
++/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000
++/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12
++/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
++/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
++/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
++/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
++/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30
++/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
++/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
++/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin
++/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
++/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
++/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
++// :uv_b0_post12
++/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6           ; mov r3, rb7
++/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
++/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2           ; mov rb4, r3
++/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2,  ra4          ; mov r3,  rb5
++/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
++/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2           ; mov rb6, r3
++// :uv_b0_post_fin
++/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
++/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
++/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0     ; mov rb_xshift2, rb_xshift2_next
++/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x  ; mov -, unif
++/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4
++/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
++/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0        ; mov ra_y2_next, ra2.16a
++/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1        ; mov ra3, unif
++/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1        ; mov rb8,  ra3.8a
++/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
++/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif         ; mov rb9,  ra3.8b
++/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif    ; mov rb10, ra3.8c
++/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop                   ; mov rb11, ra3.8d
++/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3
++/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
 +// :uvloop_b
-+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
-+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
-+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
-+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
-+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
-+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
-+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
-+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
-+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_exit
-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit8
-+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu1
++/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next
++/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y2
++/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0     ; mov.ifz ra_base2, rb_base2_next
++/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1  ; mul24 r2, r2, rb_pitch
++/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255  ; mul24      r3, ra0.8a,       r0
++/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
++/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
++/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
++/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
++/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
++/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
++/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
++/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
++/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
++/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15          ; mul24 r2, ra15, rb10
++/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
++/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
++/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
++/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
++/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6          ; mul24 r1, r1, ra_k256
++/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14        ; mov rb6, ra5
++/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4          ; mul24 r1, r1, ra1.16a
++/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0        ; mov rb4, ra4
++/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5          ; mul24 r1, r1, ra_k256
++/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12      ; mov rb5, ra6
++/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31  ; mov ra6, rb7
++/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13
++/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop                   ; mov r1, r1 << 8
++/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13
++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait        ; mov rb7, ra8
++/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3
++/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_interrupt_exit8c
++/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov  -, vw_wait ; nop ; ldtmu0
++/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
 +/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
 +/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
 +/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
 +/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
++/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_exit
++// ::mc_exit_c
++/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
++/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop        ; nop
++/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_interrupt_exit12
++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
++/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_exit1
++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop        ; nop
 +// ::mc_setup
-+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
-+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
-+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
-+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
-+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
-+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
-+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
-+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
-+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
-+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
-+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
-+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
-+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
-+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
-+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
-+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
-+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
++/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1          ; mov ra8, unif
++/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif
++/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
++/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or  rb24, r1, rb_pitch
++/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3
++/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4        ; v8subs r2, r2, r2
++/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b
++/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1
++/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
++/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
++/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1
++/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3
++/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
++/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b
++/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
++/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1
++/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0
++/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8,  0           ; mov rb8,  0
++/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9,  0           ; mov rb9,  0
++/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0           ; mov rb10, 0
++/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0           ; mov rb11, 0
++/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000be0] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
++/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0
++/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1
++/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif           ; mul24 r1, r1, rb_pitch
++/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base
++/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
++/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
++/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
++/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2
 +// :per_block_setup
-+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
-+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
-+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
-+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
-+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
-+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
-+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
-+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
-+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
-+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
-+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
-+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
-+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
-+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
-+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
-+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
-+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
-+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
-+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
-+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
-+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
-+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
-+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
-+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
-+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
-+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
-+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
-+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
-+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
-+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
-+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
-+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
-+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
-+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
-+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
-+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
-+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif         ; mov r3, elem_num
++/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next
++/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next
++/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3
++/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4        ; v8subs r2, r2, r2
++/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0
++/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b
++/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3
++/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0
++/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b
++/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif
++/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
++/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width
++/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5  ; mov r0, ra_height
++/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16
++/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1
++/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
++/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0,   r0, 7
++/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0,   r0, ra_width
++/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
++/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
++/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16          ; mov ra5, unif
++/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400
++/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3                     ; mov rb14, ra5.16a
++/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
++/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
++/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d
++/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c
++/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
++/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
++/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
++/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
++/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
++/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
++/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
++/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
++/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
++/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
++/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
++/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
++/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
++/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d
++/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c
++/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a            ; mov ra18, unif
++/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b
++/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c
++/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18
++/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif
++/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13
++/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9
++/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                  ; mov rb7, ra3.8d
 +// ::mc_filter
-+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
++/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1
 +// :yloop
-+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
-+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
-+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
-+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
-+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
-+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
-+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
-+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
-+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
-+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
-+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
-+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
-+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
-+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
-+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
-+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
-+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
++/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3
++/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2   ; v8min r0, r0, rb_k255
++/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
++/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
++/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
++/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
++/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
++/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
++/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
++/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
++/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
++/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
++/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
++/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
++/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
++/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
++/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
++/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16
++/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1
++/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0
++/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0
++/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup
++/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
++/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1
++/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0
++/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1
++/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23
++/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0
++/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
++/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
++/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
++/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop
++/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_filter_b
-+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
 +// :yloopb
-+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
-+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
-+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
-+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
-+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
-+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
-+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
-+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
-+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
-+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
-+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
-+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
-+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
-+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
-+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
-+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
-+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
-+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
-+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
-+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_interrupt_exit12
-+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_exit1
-+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
++/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
++/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3
++/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
++/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2   ; v8min r0, r0, rb_k255
++/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
++/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00001188] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
++/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
++/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
++/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
++/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
++/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
++/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
++/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
++/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
++/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
++/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
++/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
++/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
++/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
++/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
++/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
++/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
++/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16
++/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1
++/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0
++/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0
++/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup
++/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
++/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1
++/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0
++/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1
++/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23
++/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0
++/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
++/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
++/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
++/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb
++/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_end
 +};
 +#ifdef __HIGHC__
 +#pragma Align_to(8, rpi_shader)
 +#endif
+diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader_cmd.h ffmpeg-3.2.4.patch/libavcodec/rpi_shader_cmd.h
+--- ffmpeg-3.2.4/libavcodec/rpi_shader_cmd.h	1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader_cmd.h	2017-05-28 20:42:45.755088727 +0200
+@@ -0,0 +1,88 @@
++#ifndef RPI_SHADER_CMD_H
++#define RPI_SHADER_CMD_H
++
++#pragma pack(push, 4)
++
++typedef struct qpu_mc_pred_c_s {         // One chroma QPU command; field order appears to mirror the order rpi_shader.qasm reads its uniforms
++    uint32_t next_fn;                   // Next kernel address (mc_filter_uv loads it into ra_link)
++    int16_t next_src_y;                 // Shader takes y from the .16a half of the packed x_y word
++    int16_t next_src_x;                 // Shader takes x from the .16b half of the packed x_y word
++    uint32_t next_src_base_c;           // Source frame chroma base address
++    union {
++        struct {
++            uint16_t h;
++            uint16_t w;
++            uint32_t coeffs_x;
++            uint32_t coeffs_y;
++            uint32_t wo_u;
++            uint32_t wo_v;
++            uint32_t dst_addr_c;
++        } p;                            // mc_filter_uv reads: width_height, H coeffs, V coeffs, U then V offset/weight
++        struct {
++            uint16_t h;
++            uint16_t w;
++            uint32_t coeffs_x;
++            uint32_t coeffs_y;
++            uint32_t weight_u;
++            uint32_t weight_v;
++            uint32_t dummy0;
++        } b0;                           // NOTE(review): presumably the first command of a bi-predicted block - confirm against mc_filter_uv_b0
++        struct {
++            uint32_t dummy0;
++            uint32_t coeffs_x;
++            uint32_t coeffs_y;
++            uint32_t wo_u;
++            uint32_t wo_v;
++            uint32_t dst_addr_c;
++        } b1;                           // NOTE(review): presumably the second command of a bi-predicted block - confirm
++        struct {
++            uint32_t pic_cw;            // C Width (== Y width / 2)
++            uint32_t pic_ch;            // C Height (== Y Height / 2)
++            uint32_t stride2;
++            uint32_t stride1;
++            uint32_t wdenom;
++            uint32_t dummy0;
++        } s0;                           // Same order as the first group of uniforms read by mc_setup_c
++        struct {
++            uint32_t dummy0;
++            uint32_t dummy1;
++            uint32_t dummy2;
++            uint32_t dummy3;
++            uint32_t dummy4;
++            uint32_t dummy5;
++        } s1;                           // Second setup command: mc_setup_c reads and discards these six words
++    };
++} qpu_mc_pred_c_t;
++
++typedef struct qpu_mc_pred_y_s {         // One luma QPU command; unlike qpu_mc_pred_c_t, next_fn is the LAST word
++    int16_t next_src1_x;                // x precedes y here, the reverse of the chroma command
++    int16_t next_src1_y;
++    uint32_t next_src1_base;
++    int16_t next_src2_x;
++    int16_t next_src2_y;
++    uint32_t next_src2_base;
++    union {
++        struct {
++            uint16_t h;
++            uint16_t w;
++            uint32_t mymx21;
++            uint32_t wo1;
++            uint32_t wo2;
++            uint32_t dst_addr;
++        } p;                            // NOTE(review): presumably the per-block prediction parameters - confirm against mc_filter
++        struct {
++            uint16_t pic_h;             // 16-bit here, whereas the chroma setup uses 32-bit pic_cw/pic_ch
++            uint16_t pic_w;
++            uint32_t stride2;
++            uint32_t stride1;
++            uint32_t wdenom;
++            uint32_t dummy0;
++        } s;                            // NOTE(review): presumably the setup parameters - confirm against mc_setup
++    };
++    uint32_t next_fn;
++} qpu_mc_pred_y_t;
++
++#pragma pack(pop)
++
++#endif
++
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.h ffmpeg-3.2.4.patch/libavcodec/rpi_shader.h
 --- ffmpeg-3.2.4/libavcodec/rpi_shader.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader.h	2017-03-22 22:42:34.849798577 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader.h	2017-05-28 20:42:45.753088719 +0200
 @@ -0,0 +1,19 @@
 +#ifndef rpi_shader_H
 +#define rpi_shader_H
 +
 +extern unsigned int rpi_shader[];
 +
-+#define mc_setup_uv (rpi_shader + 0)
-+#define mc_filter_uv (rpi_shader + 132)
-+#define mc_filter_uv_b0 (rpi_shader + 274)
-+#define mc_filter_uv_b (rpi_shader + 392)
-+#define mc_exit (rpi_shader + 540)
-+#define mc_interrupt_exit8 (rpi_shader + 558)
-+#define mc_setup (rpi_shader + 588)
-+#define mc_filter (rpi_shader + 872)
-+#define mc_filter_b (rpi_shader + 992)
-+#define mc_interrupt_exit12 (rpi_shader + 1114)
-+#define mc_exit1 (rpi_shader + 1152)
-+#define mc_end (rpi_shader + 1168)
++#define mc_setup_c (rpi_shader + 0)
++#define mc_filter_uv (rpi_shader + 152)
++#define mc_filter_uv_b0 (rpi_shader + 280)
++#define mc_interrupt_exit8c (rpi_shader + 554)
++#define mc_exit (rpi_shader + 582)
++#define mc_exit_c (rpi_shader + 582)
++#define mc_interrupt_exit12 (rpi_shader + 598)
++#define mc_exit1 (rpi_shader + 634)
++#define mc_setup (rpi_shader + 650)
++#define mc_filter (rpi_shader + 942)
++#define mc_filter_b (rpi_shader + 1094)
++#define mc_end (rpi_shader + 1246)
 +
 +#endif
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec/rpi_shader.qasm
 --- ffmpeg-3.2.4/libavcodec/rpi_shader.qasm	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader.qasm	2017-03-22 22:42:34.851798583 +0100
-@@ -0,0 +1,1098 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_shader.qasm	2017-05-28 20:42:45.754088723 +0200
+@@ -0,0 +1,1259 @@
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems, whereas ra regs can only be rotated through
++# their local 4.  As it happens this is what is wanted here as we do not want
++# the constants from the other half of the calc.
++
 +# register allocation
 +#
 +# ra0...ra7                                     eight horizontal filter coefficients
@@ -13369,32 +16506,32 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +#
 +# rb8...rb11                                    eight vertical filter coefficients
 +
-+# ra4                                           y: Fiter, UV: 0x10000
++# ra4                                           y: Filter, UV: part of b0 -> b stash
 +
 +# rb12                                          offset to add before shift (round + weighting offsets)
 +# rb13                                          shift: denom + 6 + 9
 +# rb14                                          L0 weight (U on left, V on right)
 +# rb15                                          -- free --
 +#
-+# ra16                                          clipped(row start address+elem_num)&~3
-+# ra17                                          per-channel shifts
++# ra16                                          width:height
++# ra17                                          ra_y:ra_xshift
 +# ra18                                          L1 weight (Y)
-+# ra19                                          next ra17
++# ra19                                          ra_y_next:ra_xshift_next
 +#
 +# rb16                                          pitch
 +# rb17                                          height + 1
-+# rb18                                          height + 3
-+# rb19                                          next ra16
++# rb18                                          max(height,16) + 3
++# rb19                                          frame_base2_next
 +#
 +# ra20                                          1
-+# ra21                                          ra_21
++# ra21                                          ra_y2_next:ra_y2 (luma); free (chroma)
 +# ra22 ra_k256                                  256
-+# ra23 ra_y2_next                               ra_y2_next
++# ra23                                          0
 +#
-+# rb20                                          0xffffff00
-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
++# rb20                                          -- free --
++# rb21                                          -- free --
 +# rb22 rb_k255                                  255
-+# rb23                                          24
++# rb23                                          dest (Y)
 +#
 +# rb24                                          vdw_setup_1(dst_pitch)
 +# rb25                                          frame width-1
@@ -13405,146 +16542,233 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# rb30                                          frame height-1
 +# rb31                                          used as temp to count loop iterations
 +#
-+# ra24                                          clipped(row start address+8+elem_num)&~3
-+# ra25                                          per-channel shifts 2
++# ra24                                          src frame base
++# ra25                                          src frame base 2
 +# ra26                                          next ra24
 +# ra27                                          next ra25
-+# ra28                                          next y
-+# ra29                                          y for next texture access
-+# ra30                                          64
++# ra28                                          -- free --
++# ra29                                          -- free --
 +#
-+# ra31                                          next kernel address
++# Use an even numbered register as a link register to avoid corrupting flags
++# ra30                                          next kernel address
++# ra31                                          chroma-B height+3; free otherwise
++
++.set rb_max_x,                     rb25
++.set rb_max_y,                     rb30
++.set rb_pitch,                     rb16
++.set ra_width_height,              ra16
++.set ra_width,                     ra16.16b
++.set ra_height,                    ra16.16a
++.set ra_y2,                        ra21.16a
++.set ra_y2_next,                   ra21.16b
++
++.set rb_base2_next,                rb19
++
++.set rb_dest,                      rb23
++.set ra_base,                      ra24
++.set ra_base_next,                 ra26
++.set ra_xshift,                    ra17.16a
++
++.set ra_base2,                     ra25
++
++# Note ra_xy & ra_xy_next should have same structure!
++.set ra_xshift_next,               ra19.16a
++.set rb_xshift2,                   rb0
++.set rb_xshift2_next,              rb1
++
++.set ra_y_next,                    ra19.16b
++.set ra_y,                         ra17.16b
++
++.set ra_k1,                        ra20
++.set rb_xpitch,                    rb20
++.set rb_k255,                      rb22
++.set ra_k256,                      ra22
++.set ra_k0,                        ra23
++
++.set ra_link,                      ra30
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
++.set i_shift16,                    -16
++.set i_shift21,                    -11
++.set i_shift23,                     -9
++.set i_shift30,                     -2
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, r_vpm, r_dma
++  mov r2, qpu_num                      # Builds per-QPU setup words into r_vpm and r_dma; clobbers r0-r2
++  asr r1, r2, 2                        # r1 = qpu_num >> 2
++  shl r1, r1, 6                        # r1 = (qpu_num >> 2) << 6
++  and r0, r2, 3                        # r0 = qpu_num & 3
++  or  r0, r0, r1                       # r0 = per-QPU offset: B = qpu_num & 3, Y = (qpu_num >> 2) * 16 in the ADDR layout noted below
++
++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++  add r_vpm, r0, r1  # VPM 8bit storage
++
++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++  shl r0, r0, 5                        # presumably aligns the same offset with the VPM base field of the DMA setup word - TODO confirm
++  add r_dma, r0, r1  # DMA out
++.endm
++
++# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16
++.macro m_calc_dma_regs_c, r_vpm, r_dma
++  mov r2, qpu_num                      # Builds per-QPU setup words into r_vpm and r_dma; clobbers r0-r2
++  asr r1, r2, 1                        # r1 = qpu_num >> 1
++  shl r1, r1, 5                        # r1 = (qpu_num >> 1) << 5
++  and r0, r2, 1                        # r0 = qpu_num & 1
++  or  r0, r0, r1                       # r0 = per-QPU offset: H = qpu_num & 1, Y = (qpu_num >> 1) * 16
++
++  mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
++  add r_vpm, r0, r1  # VPM 16bit storage
++
++  # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++  # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++  mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
++  shl r0, r0, 6                        # 3 + 3, per the note above
++  add r_dma, r0, r1  # DMA out
++.endm
++
++
++################################################################################
++# mc_setup_c uniforms: next_fn (ignored), x_y, ref_c_base, pic_cw, pic_ch, stride2, stride1, wdenom, unused; then next_fn, x_y, ref_c_base2 and six unused words
++::mc_setup_c
++  mov tmurs, 1          ; mov -, unif        # No swap TMUs ; Next fn (ignored)
++
++# Load first request location
++  mov ra0, unif         # next_x_y
 +
-+.set rb_frame_width_minus_1,       rb25
-+.set rb_frame_height_minus_1,      rb30
-+.set rb_pitch,                     rb16
-+.set ra_x,                         ra16
-+.set ra_y2,                        ra21.16a
-+.set ra_y2_next,                   ra21.16b
++  mov ra_base, unif                             # Store frame c base
 +
-+.set rb_x_next,                    rb19
-+.set rx_frame_base2_next,          rb19
++# Read image dimensions
++  sub rb_max_x, unif, 1     # pic c width
++  sub rb_max_y, unif, 1     # pic c height
 +
-+.set ra_frame_base,                ra24
-+.set ra_frame_base_next,           ra26
-+.set ra_xshift,                    ra17
++# load constants
++  mov ra_k1, 1
++  mov ra_k256, 256
++  mov rb_k255, 255
++  mov ra_k0, 0
 +
-+.set ra_u2v_ref_offset,            ra25
-+.set ra_frame_base2,               ra25
++# touch registers to keep simulator happy
 +
-+.set ra_xshift_next,               ra19
-+.set rx_xshift2,                   rb0
-+.set rx_xshift2_next,              rb1
++  # ra/b4..7: B0 -> B stash registers
++  mov ra4, 0 ; mov rb4, 0
++  mov ra5, 0 ; mov rb5, 0
++  mov ra6, 0 ; mov rb6, 0
++  mov ra7, 0 ; mov rb7, 0
 +
-+.set ra_u2v_dst_offset,            ra27
++  mov r1, vdw_setup_1(0)  # Merged with dst_stride shortly, delay slot for ra_base
 +
-+.set ra_y_next,                    ra28
-+.set ra_y,                         ra29
++# ; ra12..15: vertical scroll registers
++# get source pitch
++  mov rb_xpitch, unif   ; mov ra12, 0           # stride2
++  mov rb_pitch, unif    ; mov ra13, 0           # stride1
++  mov r0, elem_num      ; mov ra14, 0
++# get destination vdw setup
++  add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1
 +
-+.set ra_k1,                        ra20
-+.set rb_k255,                      rb22
-+.set ra_k256,                      ra22
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
 +
-+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
-+.set i_shift16,                    -16
-+.set i_shift21,                    -11
++  add r0, r0, ra0.16b                           # Add elem no to x to get X for this slice
++  max r0, r0, 0         ; mov ra_y, ra0.16a     # ; stash Y
++  min r0, r0, rb_max_x
 +
-+################################################################################
-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
-+::mc_setup_uv
++# Get shift
++  and r1, r0, 1
++  shl ra_xshift_next, r1, 4
 +
-+# Read starting kernel
-+mov ra31, unif
++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
 +
-+# Load first request location
-+add ra_x, unif, elem_num # Store x
-+mov ra_y, unif # Store y
-+mov ra_frame_base, unif # Store frame u base
-+nop
-+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
++  and r0, r0, -2
++  add r0, r0, r0        ; v8subs r1, r1, r1
++  sub r1, r1, rb_pitch
++  and r1, r0, r1
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        ; mov r1, ra_y
++  add ra_base, ra_base, r0
 +
-+# Read image dimensions
-+sub rb25,unif,1
-+sub rb30,unif,1
++  max r0, r1, 0
++  min r0, r0, rb_max_y
 +
-+# get source pitch
-+mov rb16, unif
++# submit texture requests for first line
++  add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
++  add t0s, ra_base, r0
 +
-+# get destination pitch
-+mov r0, unif
-+mov r1, vdw_setup_1(0)
-+add rb24, r1, r0
++# submit texture requests for 2nd line
 +
-+# load constants
++  max r0, r1, 0
++  min r0, r0, rb_max_y
 +
-+mov ra4, 0x10000
-+mov ra_k1, 1
-+mov ra_k256, 256
-+mov ra30, 64
++  add ra_y, r1, ra_k1   ; mul24 r0, r0, rb_pitch
++  add t0s, ra_base, r0
 +
-+mov rb20, 0xffffff00
-+mov rb_k255, 255
-+mov rb23, 24
++  add rb13, 9, unif     # denominator
++  mov -, unif           # Unused
 +
-+# touch vertical context to keep simulator happy
++# Compute part of VPM to use for DMA output
++  m_calc_dma_regs_c rb28, rb27
++
++# -----------------
++# And again for L1, but only worrying about frame2 stuff
++
++  mov ra_link, unif        # Next fn
++
++# Load first request location
++  mov ra0, unif            # next_x_y
 +
-+mov ra8, 0
-+mov ra9, 0
-+mov ra10, 0
-+mov ra11, 0
-+mov ra12, 0
-+mov ra13, 0
-+mov ra14, 0
-+mov ra15, 0
++  mov ra_base2, unif # Store frame c base
 +
 +# Compute base address for first and second access
-+mov r0, ra_x           # Load x
-+max r0, r0, 0; mov r1, ra_y # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+add ra_y, r1, 1
-+add r0, r0, r3
-+and r0, r0, ~3
-+max r1, r1, 0 ; mov ra_x, r0 # y
-+min r1, r1, rb_frame_height_minus_1
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++  mov ra_y2, ra0.16a       # Store y
++  mov r0, ra0.16b          # Load x
++  add r0, r0, elem_num     # Add QPU slice
++  max r0, r0, 0         ; mov -, unif           # Unused 0
++  min r0, r0, rb_max_x  ; mov -, unif           # Unused 1
++
++# Get shift
++  and r1, r0, 1         ; mov -, unif           # Unused 2
++  shl rb_xshift2_next, r1, 4
++
++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
++
++  and r0, r0, -2
++  add r0, r0, r0        ; v8subs r1, r1, r1
++  sub r1, r1, rb_pitch
++  and r1, r0, r1
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        ; mov r1, ra_y2
++  add ra_base2, ra_base2, r0
++
++  max r0, r1, 0
++  min r0, r0, rb_max_y
++
 +# submit texture requests for first line
-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+add t0s, r0, r1 ; mov ra_frame_base, r2
-+add t1s, r2, r1
++  add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
++  add t1s, ra_base2, r0 ; mov -, unif           # Unused 3
 +
-+mov r2, 9
-+add rb13, r2, unif  # denominator
-+mov -, unif         # Unused
++# submit texture requests for 2nd line
 +
-+# Compute part of VPM to use for DMA output
-+mov r2, unif
-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+
-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+add rb28, r0, r1  # VPM 8bit storage
-+asr r2, r0, 1     # r0 = bc0000d
-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+add rb21, r2, r1  # VPM for 16bit intermediates
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1  # DMA out
++  max r0, r1, 0         ; mov -, unif           # Unused 4
++
++  bra -, ra_link
++
++  min r0, r0, rb_max_y  ; mov -, unif           # Unused 5
++  add ra_y2, r1, ra_k1   ; mul24 r0, r0, rb_pitch
++  add t1s, ra_base2, r0
++
++# >>> ra_link
 +
-+# submit texture requests for second line
-+max r1, ra_y, 0
-+min r1, r1, rb_frame_height_minus_1
-+add ra_y, ra_y, 1
-+bra -, ra31
-+nop ; mul24 r1, r1, rb_pitch
-+add t0s, r1, ra_x
-+add t1s, r1, ra_frame_base
 +
++.macro setf_nz_if_v
++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # Z set in elems 0-7, clear in 8-15, so .ifnz ops touch only the upper 8 (V) elements
++.endm
 +
 +
 +################################################################################
@@ -13554,51 +16778,51 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# At this point we have already issued two pairs of texture requests for the current block
 +# ra_x, ra_x16_base point to the current coordinates for this block
 +::mc_filter_uv
-+mov ra31, unif
++  mov ra_link, unif     ; mov vw_setup, rb28    # ; x_y
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +
 +# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0         ; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+# compute offset from frame base u to frame base v
-+sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
-+shl ra_xshift_next, r0, 3
-+add r0, r0, r3        ; mov ra1, unif  # ; width_height
-+and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
-+mov ra_y_next, r1     ; mov vw_setup, rb28
-+add ra_frame_base_next, rb_x_next, r2
++  mov ra2, unif         ; mov r0, elem_num
++
++  setf_nz_if_v                                  # Also acts as delay slot for ra2
++
++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
++  max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B
++  min r0, r0, rb_max_x  ; mov ra1, unif         # ; width_height
++
++  shl ra_xshift_next, r0, 4
++
++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
++  add r0, r0, r0        ; mov ra_y_next, ra2.16a
++  and r1, r0, r1        ; mul24 r2, ra1.16b, 2  # r2=x*2 (we are working in pel pairs)
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        ; mov r1, ra1.16a       # Add stripe offsets ; r1=height
++  add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
 +
 +# set up VPM write
-+# get width,height of block
 +
-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
-+add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
-+shl r0,   ra1.16a, 7
-+add r0,   r0, ra1.16b    # Combine width and height of destination area
-+shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
++  sub rb29, rb24, r2    ; mov ra3, unif         # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs
++  add rb17, r1, 1       ; mov ra1, unif         # ; U offset/weight
++  add rb18, r1, 3       ; mov.ifnz ra1, unif    # ; V offset/weight
 +
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++# ; unpack filter coefficients
 +
-+# unpack filter coefficients
++  add r0,   r0, r2      ; mov rb8,  ra3.8a      # Combine width and height of destination area
++  shl r0,   r0, 15      ; mov rb9,  ra3.8b      # Shift into bits 16 upwards of the vdw_setup0 register
++  add rb26, r0, rb27    ; mov r1, ra1.16b       # ; r1=weight
 +
-+mov ra1, unif         ; mov rb8,  ra3.8a   # U offset/weight
-+mov.ifnz ra1, unif    ; mov rb9,  ra3.8b   # V offset/weight
-+nop                   ; mov rb10, ra3.8c
-+mov r3, 0             ; mov rb11, ra3.8d   # Loop count
++  shl r1, r1, rb13      ; mov rb10, ra3.8c
++  mov r3, 0             ; mov rb11, ra3.8d   # Loop count
 +
-+shl r1, ra1.16b, rb13
-+asr rb12, r1, 1
-+shl rb14, ra1.16a, 1  # b14 = weight*2
++  asr rb12, r1, 1
++  shl rb14, ra1.16a, 1  # b14 = weight*2
 +
 +# rb14 - weight L0 * 2
 +# rb13 = weight denom + 6 + 9
 +# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
 +
-+# r2 is elem_num
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
@@ -13607,123 +16831,114 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0     # loop counter increment
++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
++  shr r1, r0, 8         ; mov.ifnz r3, ra_y
 +
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+add t1s, ra_frame_base, r2
++  max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
++  min r2, r2, rb_max_y
++  add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
++  add t0s, ra_base, r2  ; v8min r0, r0, rb_k255  # v8min with rb_k255 masks out all but bottom byte
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
 +
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++  setf_nz_if_v
 +
 +# apply horizontal filter
-+nop                  ; mul24      r3, ra0.8a,       r0
-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+sub r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
-+brr.anyn -, r:uvloop
-+mov ra13, ra14          ; mul24 r1, ra14, rb9
-+mov ra14, ra15
-+mov ra15, r0            ; mul24 r0, ra12, rb8
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++
++  and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  sub r0, r2, r3        ; mov r3, rb31
++  sub.setf -, r3, 4     ; mov ra12, ra13
++  brr.anyn -, r:uvloop
++  mov ra13, ra14        ; mul24 r1, ra14, rb9
++  mov ra14, ra15
++  mov ra15, r0          ; mul24 r0, ra12, rb8
 +# >>> .anyn uvloop
 +
 +# apply vertical filter and write to VPM
 +
-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+sub r1, r1, r0          ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+asr r1, r1, 14
-+nop                     ; mul24 r1, r1, rb14
-+shl r1, r1, 8
-+
-+add r1, r1, rb12
-+brr.anyn -, r:uvloop
-+asr r1, r1, rb13
-+min r1, r1, rb_k255       # Delay 2
-+max vpm, r1, 0         # Delay 3
-+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
++  sub r1, r1, r0        ; mul24 r0, ra14, rb10
++  add r1, r1, r0        ; mul24 r0, ra15, rb11
++  sub r1, r1, r0
++  sub.setf -, r3, rb18  ; mul24 r1, r1, ra_k256
++  asr r1, r1, 14
++  nop                   ; mul24 r1, r1, rb14
++  shl r1, r1, 8
 +
++  add r1, r1, rb12
++  asr ra1.8as, r1, rb13
++  nop                   ; mov r1, r1 << 8
++  brr.anyn -, r:uvloop
++  asr ra1.8bs, r1, rb13
++  mov -, vw_wait
++  mov vpm, ra1
++
++# >>>
++
++# DMA out for U & stash for V
++  bra -, ra_link
++  mov vw_setup, rb26
++  mov vw_setup, rb29
++  mov vw_addr, unif     # u_dst_addr
++# >>>
 +
 +################################################################################
 +
-+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
 +
 +# At this point we have already issued two pairs of texture requests for the current block
 +# ra_x, ra_x16_base point to the current coordinates for this block
 +::mc_filter_uv_b0
-+mov ra31, unif
++  mov -, unif           ; mov vw_setup, rb28    # next_fn ignored - always uv_b
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +
 +# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num       # x
-+max r0, r0, 0                ; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
-+shl ra_xshift_next, r0, 3
-+add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
-+and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
-+mov ra_y_next, r1            ; mov vw_setup, rb21
-+
-+add ra_frame_base_next, rb_x_next, r2
-+
-+# Need to have unsigned coeffs to so we can just unpack in the filter
-+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
-+# filter code. Unpack into b regs for V
-+
-+# set up VPM write, we need to save 16bit precision
-+
-+sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
-+add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
-+shl r0,   ra1.16a, 7
-+add r0,   r0, ra1.16b           # Combine width and height of destination area
-+shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
-+add rb26, r0, rb27
-+
-+mov rb8, ra3.8a
-+mov rb9, ra3.8b
-+mov rb10, ra3.8c
-+mov rb11, ra3.8d
-+
-+# r2 is elem_num
-+# r3 is loop counter
++  mov ra2, unif         ; mov r0, elem_num
 +
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++  setf_nz_if_v                                  # Also acts as delay slot for ra2
++
++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
++  max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B
++  min r0, r0, rb_max_x  ; mov ra1, unif         # ; width_height
++
++  shl ra_xshift_next, r0, 4
++
++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
++  add r0, r0, r0        ; mov ra_y_next, ra2.16a
++  and r1, r0, r1        ; mul24 r2, ra1.16b, 2  # r2=width*2 (we are working in pel pairs)
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        ; mov r1, ra1.16a       # Add stripe offsets ; r1=height
++  add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
++
++# set up VPM write
++
++  sub rb29, rb24, r2    ; mov ra3, unif         # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs
++  add rb17, r1, 1
++  add ra31, r1, 3       ; mov rb8,  ra3.8a      # ra31 = height + 3 (tested against loop counter r3) ; start unpacking V filter coeffs
++
++# ; unpack filter coefficients
++
++  add r0,   r0, r2      ; mov rb9,  ra3.8b      # Combine width and height of destination area
++  shl r0,   r0, 15      ; mov rb10, ra3.8c      # Shift into bits 16 upwards of the vdw_setup0 register
++  add rb26, r0, rb27
++
++  mov r3, 0             ; mov rb11, ra3.8d      # Loop count
++
++  mov rb14, unif                                # U weight
++  mov.ifnz rb14, unif                           # V weight
 +
-+mov      rb14, unif                 # U weight L0
-+mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
 +# rb14 unused in b0 but will hang around till the second pass
 +
 +# retrieve texture results and pick out bytes
@@ -13734,108 +16949,143 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0     # loop counter increment
++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
++  shr r1, r0, 8         ; mov.ifnz r3, ra_y
 +
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+add t1s, ra_frame_base, r2
++  max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
++  min r2, r2, rb_max_y
++  add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
++  add t0s, ra_base, r2  ; v8min r0, r0, rb_k255  # v8min with rb_k255 masks out all but bottom byte
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
 +
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+nop                  ; mul24      r3, ra0.8a,       r0
-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+sub r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
-+brr.anyn -, r:uvloop_b0
-+mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13
-+mov ra14, ra15
-+mov ra15, r0            ; mul24 r0, ra12, rb8
-+# >>> .anyn uvloop_b0
-+
-+# apply vertical filter and write to VPM
++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
 +
-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+sub.setf -, r3, rb18
-+brr.anyn -, r:uvloop_b0
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+sub r1, r1, r0          ; mov -, vw_wait
-+asr vpm, r1, 6
++  and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0  # Need to wait 1 cycle for rotated r1
++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  sub r0, r2, r3        ; mov r3, rb31
++  sub.setf -, r3, 4     ; mov ra12, ra13
++  brr.anyn -, r:uvloop_b0
++  mov ra13, ra14        ; mul24 r1, ra14, rb9   # ra14 is about to be ra13
++  mov ra14, ra15        ; mul24 r2, ra15, rb10  # ra15 is about to be ra14
++  mov ra15, r0          ; mul24 r0, ra12, rb8
 +# >>> .anyn uvloop_b0
 +
-+# in pass0 we don't really need to save any results, but need to discard the uniforms
-+# DMA out for U
-+
-+bra -, ra31
-+mov -, unif           # Delay 1
-+mov -, unif           # Delay 2
-+nop                   # Delay 3
-+
-+
-+################################################################################
-+
-+::mc_filter_uv_b
-+mov ra31, unif
++# apply vertical filter and write to B-FIFO
++
++  sub r1, r1, r0        ; mov ra8.16b, ra7      # start of B FIFO writes
++  add r1, r1, r2        ; mul24 r0, ra15, rb11  # N.B. ra15 write gap
++  sub r1, r1, r0        ; mov ra7, rb6
++
++# FIFO goes:
++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b
++# This arrangement optimizes the inner loop FIFOs at the expense of making the
++# bulk shift between loops quite a bit nastier
++# a8 used as temp
++
++  sub.setf -, r3, ra31
++  asr ra8.16a, r1, 6    ; mov rb6, ra5          # This discards the high bits that might be bad
++  brr.anyn -, r:uvloop_b0
++  mov ra5, rb4          ; mov rb4, ra4
++  mov ra4, rb5          ; mov rb5, ra6
++  mov ra6, rb7          ; mov rb7, ra8
++# >>>
++
++# 1st half done all results now in the a/b4..7 fifo
++
++# Need to bulk rotate FIFO for heights other than 16
++# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with
++# we are allowed 3/4 cb_size w/h :-(
++
++# Destination uniforms discarded
++# At the end drop through to _b - we will always do b after b0
++
++  sub.setf -, 15, r3    # 12 + 3 of preroll
++  brr.anyn -, r:uv_b0_post_fin                  # h > 12 (n) => 16 (do nothing)
++  sub r3, 11, r3        ; mov -, unif           # r3 = shifts wanted ; Discard u_dst_addr
++  mov r0, i_shift16     ; mov ra_link, unif
++  mov r1, 0x10000
++# >>>
++  brr.anyz -, r:uv_b0_post12                    # h == 12 deal with specially
++# If h != 16 && h != 12 then h <= 8 so
++# shift 8 with discard (.16b = .16a on all regs)
++  shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
++  shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
++  shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
++# >>>
++  shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
++
++  shl.setf -, r3, i_shift30  # b2 -> C, b1 -> N
++# Shift 4
++  mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
++  mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
++  # If we shifted by 4 here then the max length remaining is 4
++  # so that is it
++
++  brr -, r:uv_b0_post_fin
++# Shift 2
++  mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
++  mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
++  mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
++  # 6 / 2 so need 6 outputs
++# >>>
++
++:uv_b0_post12
++# this one is annoying as we need to swap halves of things that don't
++# really want to be swapped
++
++# b7a, a6a, b5a, a4a
++# b4a, a5a, b6a, a7a
++# b7b, a6b, b5b, a4b
++# b4b, a5b, b6b, a7b
++
++  mov r2, ra6           ; mov r3, rb7
++  shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
++  mov ra5, r2           ; mov rb4, r3
++
++  mov r2,  ra4          ; mov r3,  rb5
++  shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
++  mov ra7, r2           ; mov rb6, r3
++
++:uv_b0_post_fin
++
++##### L1 B processing
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +
-+# set up VPM write
-+mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
-+
 +# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0                      ; mov ra_y_next, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
-+# compute offset from frame base u to frame base v
-+sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
-+add r0, r0, r3                     ; mov ra1, unif       # width_height
-+and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
++  mov ra2, unif         ; mov r0, elem_num
 +
-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
-+add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
-+shl r0,   ra1.16a, 7
++  setf_nz_if_v                                  # Also acts as delay slot for ra2
 +
-+add ra_frame_base_next, rb_x_next, r2
++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
++  max r0, r0, ra_k0     ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++  min r0, r0, rb_max_x  ; mov -, unif           # ; width_height
 +
-+# r0 is currently height<<7
-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
-+shr r3, r3, 8
-+add vr_setup, r3, rb21
++  shl rb_xshift2_next, r0, 4
 +
-+add r0, r0, ra1.16b    # Combine width and height of destination area
-+shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
++  add r0, r0, r0        ; mov ra_y2_next, ra2.16a
++  and r1, r0, r1        ; mov ra3, unif         # ; V filter coeffs
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        ; mov rb8,  ra3.8a      # Add stripe offsets ; start unpacking filter coeffs
++  add rb_base2_next, r3, r0
 +
-+# get filter coefficients
++  mov ra1, unif         ; mov rb9,  ra3.8b      # U offset/weight
++  mov.ifnz ra1, unif    ; mov rb10, ra3.8c      # V offset/weight
 +
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# Get offset & weight stuff
-+
-+# The unif read occurs unconditionally, only the write is conditional
-+mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
-+mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
-+nop                 ; mov rb10, ra3.8c
-+mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
-+
-+shl r1, ra1.16b, rb13
-+asr rb12, r1, 1
++  nop                   ; mov rb11, ra3.8d
++  shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3     # ; r3 (loop counter)  = 0
++  asr rb12, r1, 1
 +
 +# ra1.16a used directly in the loop
 +
@@ -13843,125 +17093,147 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# then submit two more texture requests
 +
 +# r3 = 0
++
 +:uvloop_b
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu1     # loop counter increment
++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next
++  shr r1, r0, 8         ; mov.ifnz r3, ra_y2
 +
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x, r2         ; v8subs r1, r1, rb20
-+add t1s, ra_frame_base, r2
++  max r2, r3, ra_k0     ; mov.ifz ra_base2, rb_base2_next
++  min r2, r2, rb_max_y
++  add ra_y2, r3, ra_k1  ; mul24 r2, r2, rb_pitch
++  add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255  # v8min with rb_k255 masks out all but bottom byte
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
 +
 +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
 +
-+nop                  ; mul24      r3, ra0.8a,       r0
-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+sub r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
-+brr.anyn -, r:uvloop_b
-+mov ra13, ra14          ; mul24 r1, ra14, rb9
-+mov ra14, ra15
-+mov ra15, r0            ; mul24 r0, ra12, rb8
++  and r1, r1, rb_k255  ; mul24      r3, ra0.8a,       r0
++  nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
++  nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
++  sub r0, r2, r3       ; mov r3, rb31
++  sub.setf -, r3, 4    ; mov ra12, ra13
++  brr.anyn -, r:uvloop_b
++  mov ra13, ra14          ; mul24 r1, ra14, rb9
++  mov ra14, ra15          ; mul24 r2, ra15, rb10
++  mov ra15, r0            ; mul24 r0, ra12, rb8
 +# >>> .anyn uvloop_b
 +
 +# apply vertical filter and write to VPM
 +
-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
-+sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+asr r1, r1, 14          # shift2=6
-+
-+asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
-+nop                     ; mul24 r0, r0, rb14
-+
-+add r1, r1, r0          ; mov -, vw_wait
-+shl r1, r1, 8           # Lose bad top 8 bits & sign extend
++  sub r1, r1, r0        ; mov ra8.16b, ra7      # FIFO rotate (all ra/b4..7)
++  add r1, r1, r2        ; mul24 r0, ra15, rb11
++  sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
++  mov ra7, rb6          ; mul24 r1, r1, ra_k256
++  asr r1, r1, 14        ; mov rb6, ra5 # shift2=6
 +
-+add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
++  mov ra5, rb4          ; mul24 r1, r1, ra1.16a
++  add r1, r1, r0        ; mov rb4, ra4
 +
-+brr.anyn -, r:uvloop_b
-+asr r1, r1, rb13         # Delay 1
-+min r1, r1, rb_k255       # Delay 2
-+max vpm, r1, 0         # Delay 3
++  mov ra4, rb5          ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++  add r1, r1, rb12      ; mov rb5, ra6          # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
 +
++  sub.setf -, r3, ra31  ; mov ra6, rb7
++  asr ra3.8as, r1, rb13
++  nop                   ; mov r1, r1 << 8
++  brr.anyn -, r:uvloop_b
++  asr ra3.8bs, r1, rb13
++  mov -, vw_wait        ; mov rb7, ra8          #  vw_wait is B-reg (annoyingly) ; Final FIFO mov
++  mov vpm, ra3
++# >>>
 +
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
++# DMA out
 +
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
++  bra -, ra_link
++  mov vw_setup, rb26
++  mov vw_setup, rb29
++  mov vw_addr, unif     # c_dst_addr
 +
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
 +
 +################################################################################
 +
 +# mc_exit()
 +
-+::mc_exit
-+mov  -, vw_wait # wait on the VDW
++::mc_interrupt_exit8c
++  ldtmu0
++  ldtmu1
++  ldtmu1
++  mov  -, vw_wait ; nop ; ldtmu0  # wait on the VDW
++
++  mov -,sacq(0) # 1
++  mov -,sacq(0) # 2
++  mov -,sacq(0) # 3
++  mov -,sacq(0) # 4
++  mov -,sacq(0) # 5
++  mov -,sacq(0) # 6
++  mov -,sacq(0) # 7
++#  mov -,sacq(0) # 8
++#  mov -,sacq(0) # 9
++#  mov -,sacq(0) # 10
++#  mov -,sacq(0) # 11
 +
-+mov -,srel(0)
++  nop        ; nop ; thrend
++  mov interrupt, 1; nop # delay slot 1
++  nop        ; nop # delay slot 2
 +
-+ldtmu0
-+ldtmu1
-+ldtmu0
-+ldtmu1
++# Chroma & Luma the same now
++::mc_exit_c
++::mc_exit
++  ldtmu0
++  ldtmu1
++  ldtmu0
++  mov  -, vw_wait ; nop ; ldtmu1 # wait on the VDW
 +
-+nop        ; nop ; thrend
-+nop        ; nop # delay slot 1
-+nop        ; nop # delay slot 2
++  mov -,srel(0)
 +
-+# mc_interrupt_exit8()
-+::mc_interrupt_exit8
-+mov  -, vw_wait # wait on the VDW
++  nop        ; nop ; thrend
++  nop        ; nop # delay slot 1
++  nop        ; nop # delay slot 2
 +
-+ldtmu0
-+ldtmu1
-+ldtmu0
-+ldtmu1
 +
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+mov -,sacq(0) # 4
-+mov -,sacq(0) # 5
-+mov -,sacq(0) # 6
-+mov -,sacq(0) # 7
++# mc_interrupt_exit12()
++::mc_interrupt_exit12
++  ldtmu0
++  ldtmu1
++  ldtmu0
++  mov  -, vw_wait ; nop ; ldtmu1  # wait on the VDW
 +
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
++  mov -,sacq(0) # 1
++  mov -,sacq(0) # 2
++  mov -,sacq(0) # 3
++  mov -,sacq(0) # 4
++  mov -,sacq(0) # 5
++  mov -,sacq(0) # 6
++  mov -,sacq(0) # 7
++  mov -,sacq(0) # 8
++  mov -,sacq(0) # 9
++  mov -,sacq(0) # 10
++  mov -,sacq(0) # 11
 +
++  nop        ; nop ; thrend
++  mov interrupt, 1; nop # delay slot 1
++  nop        ; nop # delay slot 2
 +
 +
++::mc_exit1
++  mov  -, vw_wait # wait on the VDW
 +
++  ldtmu0
++  ldtmu1
++  ldtmu0
++  ldtmu1
++  nop        ; nop ; thrend
++  mov interrupt, 1; nop # delay slot 1
++  nop        ; nop # delay slot 2
 +
 +# LUMA CODE
 +
@@ -13971,116 +17243,104 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +################################################################################
 +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
 +::mc_setup
-+  mov r3, 16
-+
 +  # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+  mov ra8, unif  # y_x
-+  mov ra9, unif  # ref_y_base
-+  mov ra10, unif # y2_x2
-+  mov ra11, unif # ref_y2_base
++  mov tmurs, 1          ; mov ra8, unif         # No TMU swap ; y_x
++  mov ra9, unif         # ref_y_base
++  mov ra10, unif        # y2_x2
++  mov ra11, unif        # ref_y2_base
 +
 +# Read image dimensions
-+  mov r1, unif # width_height
-+  shl r0,r1,r3
-+  asr r1,r1,r3 # width
-+  asr r0,r0,r3 # height
-+  sub rb_frame_width_minus_1,r1,1
-+  sub rb_frame_height_minus_1,r0,1
-+
-+# get source pitch
-+  mov rb_pitch, unif # src_pitch
++  mov ra3, unif         # width_height
++  mov rb_xpitch, unif   # stride2
++  sub rb_max_x, ra3.16b, 1
++  sub rb_max_y, ra3.16a, 1
++  mov rb_pitch, unif    # stride1
 +
 +# get destination pitch
-+  mov r0, unif       # dst_pitch
 +  mov r1, vdw_setup_1(0)
-+  add rb24, r1, r0
++  or  rb24, r1, rb_pitch
 +
 +# Compute base address for first and second access
-+  mov r1, ra8 # y_x
-+  shl r0,r1,r3 # r0 is x<<16
-+  asr r1,r1,r3 # r1 is y
-+  asr r0,r0,r3 # r0 is x
-+  add r0, r0, elem_num # Load x
++  mov r3, elem_num
++  add r0, ra8.16a, r3   # Load x + elem_num
 +  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
++  min r0, r0, rb_max_x
 +  shl ra_xshift_next, r0, 3 # Compute shifts
-+  add ra_y, r1, 1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
++
++
++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs
++
++  and r0, r0, -4        ; v8subs r2, r2, r2
++  sub r2, r2, rb_pitch
++  and r1, r0, r2
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        # Add stripe offsets
++  add ra_base, ra9, r0
++
++  mov r1, ra8.16b       # Load y
++  add ra_y, r1, 1       # Set for next
 +  max r1, r1, 0
-+  min r1, r1, rb_frame_height_minus_1
-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+  add t0s, r2, r1 ; mov ra_frame_base, r2
-+
-+  mov r1, ra10 # y_x
-+  shl r0,r1,r3 # r0 is x<<16
-+  asr r1,r1,r3 # r1 is y
-+  asr r0,r0,r3 # r0 is x
-+  add r0, r0, elem_num # Load x
++  min r1, r1, rb_max_y
++
++# submit texture requests for first line
++  nop                   ; mul24 r1, r1, rb_pitch
++  add t0s, ra_base, r1
++
++
++  # r3 still contains elem_num
++  add r0, ra10.16a, r3  # Load x
 +  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
-+  shl rx_xshift2_next, r0, 3 # Compute shifts
-+  add ra_y2, r1, 1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
++  min r0, r0, rb_max_x
++  shl rb_xshift2_next, r0, 3 # Compute shifts
++
++  # r2 still contains mask
++  and r0, r0, -4
++  and r1, r0, r2
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        # Add stripe offsets
++  add ra_base2, ra11, r0
++
++  mov r1, ra10.16b       # Load y
++  add ra_y2, r1, 1       # Set for next
 +  max r1, r1, 0
-+  min r1, r1, rb_frame_height_minus_1
-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
++  min r1, r1, rb_max_y
 +
++# submit texture requests for first line
++  nop                   ; mul24 r1, r1, rb_pitch
++  add t1s, ra_base2, r1
 +
 +# load constants
 +
 +  mov ra_k1, 1
 +  mov ra_k256, 256
-+  mov ra30, 64
-+
-+  mov rb20, 0xffffff00
 +  mov rb_k255, 255
-+  mov rb23, 24
++  mov ra_k0, 0
 +
 +# touch vertical context to keep simulator happy
 +
-+  mov ra8, 0
-+  mov ra9, 0
-+  mov ra10, 0
-+  mov ra11, 0
-+  mov ra12, 0
-+  mov ra13, 0
-+  mov ra14, 0
-+  mov ra15, 0
++  mov ra8,  0           ; mov rb8,  0
++  mov ra9,  0           ; mov rb9,  0
++  mov ra10, 0           ; mov rb10, 0
++  mov ra11, 0           ; mov rb11, 0
 +
 +# Compute part of VPM to use
-+  mov r2, qpu_num
-+  mov r1, r2
-+  asr r1, r1, 2
-+  shl r1, r1, 6
-+  mov r0, r2
-+  and r0, r0, 3
-+  add r0, r0, r1
-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+  add rb28, r0, r1  # VPM for saving data
-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+  shl r0, r0, 5
-+  add rb27, r0, r1  # Command for dma output
++  m_calc_dma_regs rb28, rb27
 +
 +# Weighted prediction denom
-+  add rb13, unif, 9  # unif = weight denom + 6
-+
-+  mov -, unif # Unused
++  add rb13, unif, 9     # unif = weight denom + 6
 +
 +# submit texture requests for second line
 +  max r1, ra_y, 0
-+  min r1, r1, rb_frame_height_minus_1
++  min r1, r1, rb_max_y
 +  add ra_y, ra_y, 1
-+  nop ; mul24 r1, r1, rb_pitch
-+  add t0s, r1, ra_frame_base
++  mov -, unif           ; mul24 r1, r1, rb_pitch  # unused ;
++  add t0s, r1, ra_base
 +
 +  max r1, ra_y2, 0
-+  min r1, r1, rb_frame_height_minus_1
++  min r1, r1, rb_max_y
 +  add ra_y2, ra_y2, 1
-+  nop ; mul24 r1, r1, rb_pitch
-+  add t1s, r1, ra_frame_base2
++  nop                   ; mul24 r1, r1, rb_pitch
++  add t1s, r1, ra_base2
 +
 +# FALL THROUGHT TO PER-BLOCK SETUP
 +
@@ -14088,47 +17348,63 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# P and B blocks share the same setup code to save on Icache space
 +:per_block_setup
 +  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+  mov ra31, unif
++  mov ra_link, unif
++#### We do all the setup even if we are about to exit - reading junk from unif....
 +
-+  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
++  mov ra1, unif         ; mov r3, elem_num  # y_x ; elem_num has implicit unpack??
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +  mov ra_xshift, ra_xshift_next
-+  mov rx_xshift2, rx_xshift2_next
++  mov rb_xshift2, rb_xshift2_next
 +
 +# get base addresses and per-channel shifts for *next* invocation
 +
-+  add r0, ra1.16a, r1 # Load x
++  add r0, ra1.16a, r3   # Load x
 +  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+  shl ra_xshift_next, r0, 3 # Compute shifts
-+  mov r3, 8                          ; mov ra_y_next, ra1.16b
-+  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
-+  add ra_frame_base_next, r2, r0
-+
-+  add r0, ra1.16a, r1 # Load x
++  min r0, r0, rb_max_x
++
++  shl ra_xshift_next, r0, 3         # Compute shifts
++  and r0, r0, -4        ; v8subs r2, r2, r2
++  sub r2, r2, rb_pitch
++  and r1, r0, r2
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        # Add stripe offsets
++  add ra_base_next, unif, r0              # Base1
++  mov ra_y_next, ra1.16b                      # Load y
++  mov ra1, unif         # y2_x2
++  nop                   # ra1 delay
++
++  add r0, ra1.16a, r3   # Load x2
 +  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+  shl rx_xshift2_next, r0, 3         # Compute shifts
-+  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
-+  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
-+  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
++  min r0, r0, rb_max_x
++
++  shl rb_xshift2_next, r0, 3         # Compute shifts
++  and r0, r0, -4
++  and r1, r0, r2
++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1        # Add stripe offsets
++  add rb_base2_next, unif, r0              # Base2
++  mov ra_y2_next, ra1.16b                      # Load y2
++  mov ra_width_height, unif         # width_height
 +
 +# set up VPM write
-+  mov vw_setup, rb28
++  mov vw_setup, rb28    # [ra1 delay]
 +
 +# get width,height of block (unif load above)
-+  sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
-+  add rb17, ra1.16a, 5
-+  add rb18, ra1.16a, 7
-+  shl r0,   ra1.16a, 7
-+  add r0,   r0, ra1.16b # Combine width and height of destination area
-+  shl r0,   r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
++  sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width)
++  add rb17, ra_height, 5  ; mov r0, ra_height
++  mov r1, 16
++  min r0, r0, r1
++  add rb18, r0, 7
++  shl r0,   r0, 7
++  add r0,   r0, ra_width                        # Combine width and height of destination area
++  shl r0,   r0, i_shift16                       # Shift into bits 16 upwards of the vdw_setup0 register
 +  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
 +
 +# get filter coefficients and discard unused B frame values
-+  shl.ifz r0, r0, i_shift16      # Pick half to use
-+  shl ra8, r0, 3
++  shl.ifz r0, r0, i_shift16          ; mov ra5, unif    #  Pick half to use ; L0 offset/weight
++  mov r2, 0x01040400                 # [ra5 delay]
++  shl ra8, r0, 3                     ; mov rb14, ra5.16a
 +
 +# Pack the 1st 4 filter coefs for H & V tightly
 +
@@ -14136,9 +17412,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +  ror ra2.8a, r1, ra8.8d
 +  ror ra0.8a, r1, ra8.8c
 +
-+  mov r1,0x01040400
-+  ror ra2.8b, r1, ra8.8d
-+  ror ra0.8b, r1, ra8.8c
++  ror ra2.8b, r2, ra8.8d
++  ror ra0.8b, r2, ra8.8c
 +
 +  mov r1,0x050b0a00  # -ve
 +  ror ra2.8c, r1, ra8.8d
@@ -14164,37 +17439,44 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +  ror ra3.8c, r1, ra8.8d
 +  ror ra1.8c, r1, ra8.8c
 +
++  mov r1,0x01010000  # -ve
++  ror ra3.8d, r1, ra8.8d
++  ror ra1.8d, r1, ra8.8c
++
 +# Extract weighted prediction information in parallel
++# We are annoyingly A src limited here
 +
-+  mov r1,0x01010000  # -ve
-+  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; weight L1 weight L1 (hi16)/weight L0 (lo16)
-+  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
-+
-+# r3 = 16 from (long way) above
-+  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
-+  asr ra18, r0, r3          ; mov rb5, ra3.8b
-+  bra -, ra31
-+  shl r0, r0, r3            ; mov rb6, ra3.8c
-+  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
-+  asr rb12, r1, 9
-+
-+# >>> branch ra31
++  mov rb4, ra3.8a            ; mov ra18, unif
++  mov rb5, ra3.8b
++  mov rb6, ra3.8c
++  mov.ifnz ra5, ra18
++
++  mov rb_dest, unif     # Destination address
++
++  bra -, ra_link
++
++  shl r0, ra5.16b, rb13      # Offset calc
++  asr rb12, r0, 9            # For B l1 & L0 offsets should be identical so it doesn't matter which we use
++  mov r3, 0                  ; mov rb7, ra3.8d
++# >>> branch ra_link
 +#
 +# r3 = 0
-+# ra18 = weight L1
-+# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
-+# rb13 = weight denom + 6 + 9
-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++# ra18.16a = weight L1
++# ra5.16a  = weight L0/L1 depending on side (wanted for 2x mono-pred)
++# rb12     = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++# rb13     = weight denom + 6 + 9
++# rb14     = weight L0
 +
 +
 +################################################################################
-+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
 +# In a P block, y2_x2 should be y_x+8
 +# At this point we have already issued two pairs of texture requests for the current block
 +
 +::mc_filter
-+# r0 = weight << 16; We want weight * 2 in rb14
-+  asr rb14, r0, 15
++# ra5.16a = weight; We want weight * 2 in rb14
++
++  shl rb14, ra5.16a, 1
 +
 +# r3 = 0
 +
@@ -14210,20 +17492,20 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# might be B where y != y2 so we must do full processing on both y and y2
 +
 +  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++  shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
++  mov.ifz ra_base, ra_base_next ; mov rb31, r3
 +  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++  shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
 +
 +  max r2, ra_y, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
++  min r2, r2, rb_max_y
 +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++  add t0s, ra_base, r2   ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
 +
 +  max r2, ra_y2, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
++  min r2, r2, rb_max_y
 +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++  add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
@@ -14232,21 +17514,21 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +
 +# apply horizontal filter
 +  nop                  ; mul24      r3, ra0.8a,      r0
-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
 +  sub r0, r2, r3       ; mov r3, rb31
 +
 +  sub.setf -, r3, 8       ; mov r1,   ra8
@@ -14285,18 +17567,48 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +  max vpm, r1, 0         # Delay 3
 +# >>> branch.anyn yloop
 +
-+# DMA out
++# If looping again then we consumed 16 height last loop
++  # rb29 (stride) remains constant
++  # rb17 remains const (based on total height)
++  # recalc rb26, rb18 based on new segment height
++  # N.B. r3 is loop counter still
++
++  mov r1, 16
++  sub r0, ra_height, r1
++  mov ra_height, r0
++  max.setf r0, r0, 0    # Done if Z now
 +
-+  brr -, r:per_block_setup
++# DMA out
++  brr.anyz -, r:per_block_setup
 +  mov vw_setup, rb26 # VDW setup 0    Delay 1
 +  mov vw_setup, rb29 # Stride         Delay 2
-+  mov vw_addr, unif # start the VDW   Delay 3
++  mov vw_addr, rb_dest # start the VDW   Delay 3
++# >>> .anyz per_block_setup
++
++  min r0, r0, r1
++  add rb18, rb18, r0
++  sub r0, r0, r1
++  shl r0, r0, i_shift23
++  add rb26, rb26, r0
++
++  nop ; mul24 r0, r1, rb_pitch  # r0 = pitch*16
++  add rb_dest, rb_dest, r0
++
++  mov vw_setup, rb28    # Reset our VPM write pointer
++
++  brr -, r:yloop
++  nop
++  nop
++  nop
++# >>>
++
++
 +
 +
 +
 +################################################################################
 +
-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
 +# In a P block, only the first half of coefficients contain used information.
 +# At this point we have already issued two pairs of texture requests for the current block
 +# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
@@ -14308,7 +17620,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +
 +::mc_filter_b
 +  # r0 = weightL0 << 16, we want it in rb14
-+  asr rb14, r0, i_shift16
++#  asr rb14, r0, i_shift16
 +
 +:yloopb
 +# retrieve texture results and pick out bytes
@@ -14318,20 +17630,20 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +# Perhaps we could add on the pitch and clip using larger values?
 +
 +  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++  shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
++  mov.ifz ra_base, ra_base_next ; mov rb31, r3
 +  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++  shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
 +
 +  max r2, ra_y, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
++  min r2, r2, rb_max_y
 +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++  add t0s, ra_base, r2   ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
 +
 +  max r2, ra_y2, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
++  min r2, r2, rb_max_y
 +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++  add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
@@ -14340,596 +17652,119 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_shader.qasm ffmpeg-3.2.4.patch/libavcodec
 +
 +# apply horizontal filter
 +  nop                  ; mul24      r3, ra0.8a,      r0
-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
 +  sub r0, r2, r3       ; mov r3, rb31
 +
 +  sub.setf -, r3, 8       ; mov r1,   ra8
-+  mov ra8,  ra9           ; mov rb8,  rb9
-+  brr.anyn -, r:yloopb
-+  mov ra9,  ra10          ; mov rb9,  rb10
-+  mov ra10, ra11          ; mov rb10, rb11
-+  mov ra11, r0            ; mov rb11, r1
-+  # >>> .anyn yloopb
-+
-+  # apply vertical filter and write to VPM
-+
-+  nop                     ; mul24 r0, rb8,  ra2.8a
-+  nop                     ; mul24 r1, rb9,  ra2.8b
-+  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
-+  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
-+  add r1, r1, r0          ; mul24 r0, ra8,  rb4
-+  add r1, r1, r0          ; mul24 r0, ra9,  rb5
-+  sub r1, r1, r0          ; mul24 r0, ra10, rb6
-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
-+  sub r1, r1, r0          ; mov r2, rb12
-+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
-+# Top 8 bits are bad - low 6 bits should be discarded
-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+
-+  asr r1, r1, 14
-+  nop                     ; mul24 r0, r1, rb14
-+  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
-+
-+  add r1, r1, r0          ; mov -, vw_wait
-+  shl r1, r1, 8
-+
-+  brr.anyn -, r:yloopb
-+  asr r1, r1, rb13         # Delay 1
-+  min r1, r1, rb_k255       # Delay 2
-+  max vpm, r1, 0         # Delay 3
-+
-+# DMA out
-+  brr -, r:per_block_setup
-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
-+  mov vw_setup, rb29 # Stride         Delay 2
-+  mov vw_addr, unif # start the VDW   Delay 3
-+
-+################################################################################
-+
-+# mc_interrupt_exit12()
-+::mc_interrupt_exit12
-+  mov  -, vw_wait # wait on the VDW
-+
-+  # Dummy wait to test instructions
-+#  mov r3,1000000
-+#:dummy_loop
-+#  sub.setf r3, r3, 1
-+#  nop
-+#  nop
-+#  brr.anynn -, r:dummy_loop
-+#  nop
-+#  nop
-+#  nop
-+
-+  ldtmu0
-+  ldtmu0
-+  ldtmu1
-+  ldtmu1
-+
-+  mov -,sacq(0) # 1
-+  mov -,sacq(0) # 2
-+  mov -,sacq(0) # 3
-+  mov -,sacq(0) # 4
-+  mov -,sacq(0) # 5
-+  mov -,sacq(0) # 6
-+  mov -,sacq(0) # 7
-+  mov -,sacq(0) # 8
-+  mov -,sacq(0) # 9
-+  mov -,sacq(0) # 10
-+  mov -,sacq(0) # 11
-+
-+  nop        ; nop ; thrend
-+  mov interrupt, 1; nop # delay slot 1
-+  nop        ; nop # delay slot 2
-+
-+
-+::mc_exit1
-+  mov  -, vw_wait # wait on the VDW
-+
-+  ldtmu0
-+  ldtmu1
-+  ldtmu0
-+  ldtmu1
-+  nop        ; nop ; thrend
-+  mov interrupt, 1; nop # delay slot 1
-+  nop        ; nop # delay slot 2
-+
++  mov ra8,  ra9           ; mov rb8,  rb9
++  brr.anyn -, r:yloopb
++  mov ra9,  ra10          ; mov rb9,  rb10
++  mov ra10, ra11          ; mov rb10, rb11
++  mov ra11, r0            ; mov rb11, r1
++  # >>> .anyn yloopb
 +
-+::mc_end
-+# Do not add code here because mc_end must appear after all other code.
-diff -Naur ffmpeg-3.2.4/libavcodec/rpi_user_vcsm.h ffmpeg-3.2.4.patch/libavcodec/rpi_user_vcsm.h
---- ffmpeg-3.2.4/libavcodec/rpi_user_vcsm.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_user_vcsm.h	2017-03-22 22:42:34.851798583 +0100
-@@ -0,0 +1,459 @@
-+/*****************************************************************************
-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-+*
-+* This program is the proprietary software of Broadcom Corporation and/or
-+* its licensors, and may only be used, duplicated, modified or distributed
-+* pursuant to the terms and conditions of a separate, written license
-+* agreement executed between you and Broadcom (an "Authorized License").
-+* Except as set forth in an Authorized License, Broadcom grants no license
-+* (express or implied), right to use, or waiver of any kind with respect to
-+* the Software, and Broadcom expressly reserves all rights in and to the
-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-+* THE SOFTWARE.
-+*
-+* Except as expressly set forth in the Authorized License,
-+* 1. This program, including its structure, sequence and organization,
-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
-+*    all reasonable efforts to protect the confidentiality thereof, and to
-+*    use this information only in connection with your use of Broadcom
-+*    integrated circuit products.
-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-+*****************************************************************************/
-+
-+#ifndef __USER_VCSM__H__INCLUDED__
-+#define __USER_VCSM__H__INCLUDED__
-+
-+/* VideoCore Shared Memory - user interface library.
-+**
-+** This library provides all the necessary abstraction for any application to
-+** make use of the shared memory service which is distributed accross a kernel
-+** driver and a videocore service.
-+**
-+** It is an application design decision to choose or not to use this service.
-+**
-+** The logical flow of operations that a user application needs to follow when
-+** using this service is:
-+**
-+**       1) Initialize the service.
-+**       2) Allocate shared memory blocks.
-+**       3) Start using the allocated blocks.
-+**          - In order to gain ownership on a block, lock the allocated block,
-+**            locking a block returns a valid address that the user application
-+**            can access.
-+**          - When finished with using the block for the current execution cycle
-+**            or function, and so when giving up the ownership, unlock the block.
-+**       4) A block can be locked/unlocked as many times required - within or outside
-+**          of - a specific execution context.
-+**       5) To completely release an allocated block, free it.
-+**       6) If the service is no longer required, terminate it.
-+**
-+**
-+** Some generic considerations:
-+
-+** Allocating memory blocks.
-+**
-+**   Memory blocks can be allocated in different manners depending on the cache
-+**   behavior desired.  A given block can either be:
-+
-+**       - Allocated in a non cached fashion all the way through host and videocore.
-+**       - Allocated in a cached fashion on host OR videocore.
-+**       - Allocated in a cached fashion on host AND videocore.
-+**
-+**   It is an application decision to determine how to allocate a block.  Evidently
-+**   if the application will be doing substantial read/write accesses to a given block,
-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
-+**   better results.
-+**
-+**
-+** Locking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, locking the
-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**   It is possible to dynamically change the host cache behavior (ie cached or non
-+**   cached) of a given allocation without needing to free and re-allocate the block.
-+**   This feature can be useful for such application which requires access to the block
-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-+**   the application can optimize performances for a given duration of use.
-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-+**   cache.  If one requires to change the videocore cache behavior, then a new block
-+**   must be created to replace the old one.
-+**
-+**   On successful locking, a valid pointer is returned that the application can use
-+**   to access to data inside the block.  There is no guarantee that the pointer will
-+**   stay valid following the unlock action corresponding to this lock.
-+**
-+**
-+** Unocking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, unlocking the
-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-+**   explicitely asked not to flush the cache for performances reasons.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**
-+** A complete API is defined below.
-+*/
++  # apply vertical filter and write to VPM
++  nop                     ; mul24 r0, rb8,  ra2.8a
++  nop                     ; mul24 r1, rb9,  ra2.8b
++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
++  add r1, r1, r0          ; mul24 r0, ra11, rb7
++  sub r1, r1, r0          ; mov r2, rb12
++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
++# Top 8 bits are bad - low 6 bits should be discarded
++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
 +
-+#ifdef __cplusplus
-+extern "C"
-+{
-+#endif
++  asr r1, r1, 14
++  nop                     ; mul24 r0, r1, rb14
++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
 +
-+/* Different status that can be dumped.
-+*/
-+typedef enum
-+{
-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-+                                    // Result of the walk is seen in the videocore
-+                                    // log.
-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-+                                    // driver (ie for all processes).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-+                                    // VCSM_STATUS_HOST_WALK_MAP.
-+                                    //
-+   VCSM_STATUS_NONE,                // Must be last - invalid.
-+
-+} VCSM_STATUS_T;
-+
-+/* Different kind of cache behavior.
-+*/
-+typedef enum
-+{
-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-+
-+} VCSM_CACHE_TYPE_T;
-+
-+/* Initialize the vcsm processing.
-+**
-+** Must be called once before attempting to do anything else.
-+**
-+** Returns 0 on success, -1 on error.
-+*/
-+int vcsm_init( void );
++  add r1, r1, r0          ; mov -, vw_wait
++  shl r1, r1, 8
 +
++  brr.anyn -, r:yloopb
++  asr r1, r1, rb13         # Delay 1
++  min r1, r1, rb_k255       # Delay 2
++  max vpm, r1, 0         # Delay 3
 +
-+/* Terminates the vcsm processing.
-+**
-+** Must be called vcsm services are no longer needed, it will
-+** take care of removing any allocation under the current process
-+** control if deemed necessary.
-+*/
-+void vcsm_exit( void );
 +
++# If looping again then we consumed 16 height last loop
++  # rb29 (stride) remains constant
++  # rb17 remains const (based on total height)
++  # recalc rb26, rb18 based on new segment height
++  # N.B. r3 is loop counter still
 +
-+/* Queries the status of the the vcsm.
-+**
-+** Triggers dump of various kind of information, see the
-+** different variants specified in VCSM_STATUS_T.
-+**
-+** Pid is optional.
-+*/
-+void vcsm_status( VCSM_STATUS_T status, int pid );
-+
-+
-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-+** allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc( unsigned int size, char *name );
-+
-+
-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
-+** allocator, the type of caching requested is passed as argument of the
-+** function call.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-+
-+
-+/* Shares an allocated block of memory via the vcsm memory allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_share( unsigned int handle );
-+
-+
-+/* Resizes a block of memory allocated previously by vcsm_alloc.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** The handle must be unlocked by user prior to attempting any
-+** resize action.
-+**
-+** On error, the original size allocated against the handle
-+** remains available the same way it would be following a
-+** successful vcsm_malloc.
-+*/
-+int vcsm_resize( unsigned int handle, unsigned int new_size );
-+
-+
-+/* Frees a block of memory that was successfully allocated by
-+** a prior call the vcms_alloc.
-+**
-+** The handle should be considered invalid upon return from this
-+** call.
-+**
-+** Whether any memory is actually freed up or not as the result of
-+** this call will depends on many factors, if all goes well it will
-+** be freed.  If something goes wrong, the memory will likely end up
-+** being freed up as part of the vcsm_exit process.  In the end the
-+** memory is guaranteed to be freed one way or another.
-+*/
-+void vcsm_free( unsigned int handle );
-+
-+
-+/* Retrieves a videocore opaque handle from a mapped user address
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-+
-+
-+/* Retrieves a videocore opaque handle from a opaque handle
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
++  mov r1, 16
++  sub r0, ra_height, r1
++  mov ra_height, r0
++  max.setf r0, r0, 0    # Done if Z now
 +
++# DMA out
++  brr.anyz -, r:per_block_setup
++  mov vw_setup, rb26 # VDW setup 0    Delay 1
++  mov vw_setup, rb29 # Stride         Delay 2
++  mov vw_addr, rb_dest # start the VDW   Delay 3
++# >>> .anyz per_block_setup
 +
-+/* Retrieves a user opaque handle from a mapped user address
-+** pointer.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+*/
-+unsigned int vcsm_usr_handle( void *usr_ptr );
-+
-+
-+/* Retrieves a mapped user address from an opaque user
-+** handle.
-+**
-+** Returns:        0 on error
-+**                 a non-zero address on success.
-+**
-+** On success, the address corresponds to the pointer
-+** which can access the data allocated via the vcsm_malloc
-+** call.
-+*/
-+void *vcsm_usr_address( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.  The lock
-+** also gives a chance to update the *host* cache behavior of the
-+** allocated buffer if so desired.  The *videocore* cache behavior
-+** of the allocated buffer cannot be changed by this call and such
-+** attempt will be ignored.
-+**
-+** The system will attempt to honour the cache_update mode request,
-+** the cache_result mode will provide the final answer on which cache
-+** mode is really in use.  Failing to change the cache mode will not
-+** result in a failure to lock the buffer as it is an application
-+** decision to choose what to do if (cache_result != cache_update)
-+**
-+** The value returned in cache_result can only be considered valid if
-+** the returned pointer is non NULL.  The cache_result pointer may be
-+** NULL if the application does not care about the actual outcome of
-+** its action with regards to the cache behavior change.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock_cache( unsigned int handle,
-+                       VCSM_CACHE_TYPE_T cache_update,
-+                       VCSM_CACHE_TYPE_T *cache_result );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr( void *usr_ptr );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
++  min r0, r0, r1
++  add rb18, rb18, r0
++  sub r0, r0, r1
++  shl r0, r0, i_shift23
++  add rb26, rb26, r0
 +
++  nop ; mul24 r0, r1, rb_pitch  # r0 = pitch*16
++  add rb_dest, rb_dest, r0
 +
-+/* Unlocks the memory associated with this user opaque handle.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl( unsigned int handle );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+
-+/* Clean and/or invalidate the memory associated with this user opaque handle
-+**
-+** Returns:        non-zero on error
-+**
-+** structure contains a list of flush/invalidate commands. Commands are:
-+** 0: nop
-+** 1: invalidate       given virtual range in L1/L2
-+** 2: clean            given virtual range in L1/L2
-+** 3: clean+invalidate given virtual range in L1/L2
-+** 4: flush all L1/L2
-+*/
-+struct vcsm_user_clean_invalid_s {
-+   struct {
-+      unsigned int cmd;
-+      unsigned int handle;
-+      unsigned int addr;
-+      unsigned int size;
-+   } s[8];
-+};
++  mov vw_setup, rb28    # Reset our VDM write pointer
 +
-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
++  brr -, r:yloopb
++  nop
++  nop
++  nop
 +
-+#ifdef __cplusplus
-+}
-+#endif
++################################################################################
 +
-+#endif /* __USER_VCSM__H__INCLUDED__ */
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc.c
 --- ffmpeg-3.2.4/libavcodec/rpi_zc.c	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_zc.c	2017-03-22 22:42:34.852798585 +0100
-@@ -0,0 +1,406 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_zc.c	2017-05-28 20:42:45.755088727 +0200
+@@ -0,0 +1,581 @@
 +#include "config.h"
 +#ifdef RPI
 +#include "rpi_qpu.h"
++#include "rpi_mailbox.h"
 +#include "rpi_zc.h"
++#include "libavutil/avassert.h"
++#include <pthread.h>
 +
 +#include "libavutil/buffer_internal.h"
++#include <interface/vctypes/vc_image_types.h>
++
++#define TRACE_ALLOC 0
 +
 +struct ZcPoolEnt;
 +
 +typedef struct ZcPool
 +{
 +    int numbytes;
++    unsigned int n;
 +    struct ZcPoolEnt * head;
 +    pthread_mutex_t lock;
 +} ZcPool;
@@ -14938,27 +17773,56 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +{
 +    // It is important that we start with gmem as other bits of code will expect to see that
 +    GPU_MEM_PTR_T gmem;
++    unsigned int n;
 +    struct ZcPoolEnt * next;
 +    struct ZcPool * pool;
 +} ZcPoolEnt;
 +
-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
++#if 1
++//#define ALLOC_PAD       0x1000
++#define ALLOC_PAD       0
++#define ALLOC_ROUND     0x1000
++//#define ALLOC_N_OFFSET  0x100
++#define ALLOC_N_OFFSET  0
++#define STRIDE_ROUND    0x80
++#define STRIDE_OR       0x80
++#else
++#define ALLOC_PAD       0
++#define ALLOC_ROUND     0x1000
++#define ALLOC_N_OFFSET  0
++#define STRIDE_ROUND    32
++#define STRIDE_OR       0
++#endif
++
++#define DEBUG_ZAP0_BUFFERS 0
++
++
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
 +{
 +    ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
 +
++    // Add ALLOC_PAD (currently 0), then round up to a multiple of ALLOC_ROUND (4k)
++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++
 +    if (zp == NULL) {
 +        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
 +        goto fail0;
 +    }
 +
-+    if (gpu_malloc_cached(size, &zp->gmem) != 0)
++    if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
 +    {
-+        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
 +        goto fail1;
 +    }
 +
++#if TRACE_ALLOC
++    printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
++#endif
++
++    pool->numbytes = zp->gmem.numbytes;
 +    zp->next = NULL;
 +    zp->pool = pool;
++    zp->n = pool->n++;
 +    return zp;
 +
 +fail1:
@@ -14969,6 +17833,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +
 +static void zc_pool_ent_free(ZcPoolEnt * const zp)
 +{
++#if TRACE_ALLOC
++    printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
++#endif
++
 +    gpu_free(&zp->gmem);
 +    av_free(zp);
 +}
@@ -14977,6 +17845,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +{
 +    ZcPoolEnt * p = pool->head;
 +    pool->head = NULL;
++    pool->numbytes = -1;
++
 +    while (p != NULL)
 +    {
 +        ZcPoolEnt * const zp = p;
@@ -14985,15 +17855,21 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    }
 +}
 +
-+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes)
++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes)
 +{
 +    ZcPoolEnt * zp;
++    int numbytes;
++
 +    pthread_mutex_lock(&pool->lock);
 +
-+    if (numbytes != pool->numbytes)
++    numbytes = pool->numbytes;
++
++    // If size isn't close then dump the pool
++    // Close in this context means within 128k
++    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
 +    {
 +        zc_pool_flush(pool);
-+        pool->numbytes = numbytes;
++        numbytes = req_bytes;
 +    }
 +
 +    if (pool->head != NULL)
@@ -15007,6 +17883,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    }
 +
 +    pthread_mutex_unlock(&pool->lock);
++
++    // Start with our buffer empty of preconceptions
++//    rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
++
 +    return zp;
 +}
 +
@@ -15016,6 +17896,10 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    if (zp != NULL)
 +    {
 +        pthread_mutex_lock(&pool->lock);
++#if TRACE_ALLOC
++        printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes);
++#endif
++
 +        if (pool->numbytes == zp->gmem.numbytes)
 +        {
 +            zp->next = pool->head;
@@ -15046,10 +17930,18 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    pthread_mutex_destroy(&pool->lock);
 +}
 +
++typedef struct ZcOldCtxVals
++{
++    int thread_safe_callbacks;
++    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
++    void * get_buffer_context;
++} ZcOldCtxVals;
 +
 +typedef struct AVZcEnv
 +{
++    unsigned int refcount;
 +    ZcPool pool;
++    ZcOldCtxVals old;
 +} ZcEnv;
 +
 +// Callback when buffer unrefed to zero
@@ -15069,28 +17961,94 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +}
 +
 +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
-+    const unsigned int video_width, const unsigned int video_height)
++    const int format, const unsigned int video_width, const unsigned int video_height)
 +{
 +    AVRpiZcFrameGeometry geo;
-+    geo.stride_y = (video_width + 32 + 31) & ~31;
-+    geo.stride_c = geo.stride_y / 2;
-+//    geo.height_y = (video_height + 15) & ~15;
-+    geo.height_y = (video_height + 32 + 31) & ~31;
-+    geo.height_c = geo.height_y / 2;
++
++    switch (format)
++    {
++        case AV_PIX_FMT_YUV420P:
++            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++        //    geo.stride_y = ((video_width + 32 + 31) & ~31);
++            geo.stride_c = geo.stride_y / 2;
++        //    geo.height_y = (video_height + 15) & ~15;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            break;
++
++        case AV_PIX_FMT_SAND128:
++        {
++            const unsigned int stripe_w = 128;
++
++            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                gpu_ref();
++                mbox_get_image_params(gpu_get_mailbox(), &new_img);
++                gpu_unref();
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++
++            pthread_mutex_unlock(&sand_lock);
++
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        default:
++            memset(&geo, 0, sizeof(geo));
++            break;
++    }
 +    return geo;
 +}
 +
++
 +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
 +{
 +    ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
 +    AVBufferRef * buf;
++    intptr_t idata = (intptr_t)zp->gmem.arm;
++#if ALLOC_N_OFFSET != 0
++    intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
++#endif
 +
 +    if (zp == NULL) {
 +        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
 +        goto fail0;
 +    }
 +
-+    if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
++#if ALLOC_N_OFFSET != 0
++    idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
++#endif
++
++#if DEBUG_ZAP0_BUFFERS
++    memset((void*)idata, 0, size);
++#endif
++
++    if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
 +    {
 +        av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
 +        goto fail2;
@@ -15104,13 +18062,12 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    return NULL;
 +}
 +
-+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame)
++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame)
 +{
-+    ZcEnv *const zc = s->get_buffer_context;
-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height);
++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
 +    const unsigned int size_y = geo.stride_y * geo.height_y;
 +    const unsigned int size_c = geo.stride_c * geo.height_c;
-+    const unsigned int size_pic = size_y + size_c * 2;
++    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
 +    AVBufferRef * buf;
 +    unsigned int i;
 +
@@ -15118,7 +18075,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +
 +    if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
 +    {
-+        av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
 +        return AVERROR(ENOMEM);
 +    }
 +
@@ -15129,19 +18086,24 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    }
 +
 +    frame->buf[0] = buf;
++
 +    frame->linesize[0] = geo.stride_y;
 +    frame->linesize[1] = geo.stride_c;
 +    frame->linesize[2] = geo.stride_c;
++    if (geo.stripes > 1)
++        frame->linesize[3] = geo.height_y + geo.height_c;      // abuse: linesize[3] = stripe stride
++
 +    frame->data[0] = buf->data;
 +    frame->data[1] = frame->data[0] + size_y;
-+    frame->data[2] = frame->data[1] + size_c;
++    if (geo.planes_c > 1)
++        frame->data[2] = frame->data[1] + size_c;
++
 +    frame->extended_data = frame->data;
 +    // Leave extended buf alone
 +
 +    return 0;
 +}
 +
-+
 +#define RPI_GET_BUFFER2 1
 +
 +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
@@ -15151,21 +18113,25 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +#else
 +    int rv;
 +
-+    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 ||
-+        frame->format != AV_PIX_FMT_YUV420P)
++    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
 +    {
 +//        printf("Do default alloc: format=%#x\n", frame->format);
 +        rv = avcodec_default_get_buffer2(s, frame, flags);
 +    }
++    else if (frame->format == AV_PIX_FMT_YUV420P ||
++             frame->format == AV_PIX_FMT_SAND128)
++    {
++        rv = rpi_get_display_buffer(s->get_buffer_context, frame);
++    }
 +    else
 +    {
-+        rv = rpi_get_display_buffer(s, frame);
++        rv = avcodec_default_get_buffer2(s, frame, flags);
 +    }
 +
 +#if 0
-+    printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
-+        frame->width, frame->height,
-+        frame->linesize[0], frame->linesize[1], frame->linesize[2],
++    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
++        frame->format, frame->width, frame->height,
++        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
 +        frame->data[0], frame->data[1], frame->data[2],
 +        frame->buf[0], frame->buf[1], frame->buf[2],
 +        av_buffer_get_opaque(frame->buf[0]));
@@ -15186,7 +18152,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    dest->width = src->width;
 +    dest->height = src->height;
 +
-+    if (rpi_get_display_buffer(s, dest) != 0)
++    if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
 +    {
 +        return NULL;
 +    }
@@ -15219,14 +18185,16 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +{
 +    assert(s != NULL);
 +
-+    if (frame->format != AV_PIX_FMT_YUV420P)
++    if (frame->format != AV_PIX_FMT_YUV420P &&
++        frame->format != AV_PIX_FMT_SAND128)
 +    {
-+        av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format);
++        av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
 +        return NULL;
 +    }
 +
 +    if (frame->buf[1] != NULL)
 +    {
++        av_assert0(frame->format == AV_PIX_FMT_YUV420P);
 +        if (maycopy)
 +        {
 +            av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
@@ -15262,6 +18230,18 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    return p == NULL ? -1 : p->vc_handle;
 +}
 +
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? 0 : fr_ref->data - p->arm;
++}
++
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++    return fr_ref == NULL ? 0 : fr_ref->size;
++}
++
++
 +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
 +{
 +    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
@@ -15298,27 +18278,50 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    }
 +}
 +
++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
++{
++    return s->get_buffer2 == av_rpi_zc_get_buffer2;
++}
++
 +int av_rpi_zc_init(struct AVCodecContext * const s)
 +{
-+    ZcEnv * const zc = av_rpi_zc_env_alloc();
-+    if (zc == NULL)
++    if (av_rpi_zc_in_use(s))
 +    {
-+        return AVERROR(ENOMEM);
++        ZcEnv * const zc = s->get_buffer_context;
++        ++zc->refcount;
 +    }
++    else
++    {
++        ZcEnv *const zc = av_rpi_zc_env_alloc();
++        if (zc == NULL)
++        {
++            return AVERROR(ENOMEM);
++        }
 +
-+    s->get_buffer_context = zc;
-+    s->get_buffer2 = av_rpi_zc_get_buffer2;
++        zc->refcount = 1;
++        zc->old.get_buffer_context = s->get_buffer_context;
++        zc->old.get_buffer2 = s->get_buffer2;
++        zc->old.thread_safe_callbacks = s->thread_safe_callbacks;
++
++        s->get_buffer_context = zc;
++        s->get_buffer2 = av_rpi_zc_get_buffer2;
++        s->thread_safe_callbacks = 1;
++    }
 +    return 0;
 +}
 +
 +void av_rpi_zc_uninit(struct AVCodecContext * const s)
 +{
-+    if (s->get_buffer2 == av_rpi_zc_get_buffer2)
++    if (av_rpi_zc_in_use(s))
 +    {
 +        ZcEnv * const zc = s->get_buffer_context;
-+        s->get_buffer2 = avcodec_default_get_buffer2;
-+        s->get_buffer_context = NULL;
-+        av_rpi_zc_env_free(zc);
++        if (--zc->refcount == 0)
++        {
++            s->get_buffer2 = zc->old.get_buffer2;
++            s->get_buffer_context = zc->old.get_buffer_context;
++            s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
++            av_rpi_zc_env_free(zc);
++        }
 +    }
 +}
 +
@@ -15326,17 +18329,17 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.c ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +
 diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.h ffmpeg-3.2.4.patch/libavcodec/rpi_zc.h
 --- ffmpeg-3.2.4/libavcodec/rpi_zc.h	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/rpi_zc.h	2017-03-22 22:42:34.852798585 +0100
-@@ -0,0 +1,83 @@
++++ ffmpeg-3.2.4.patch/libavcodec/rpi_zc.h	2017-05-28 20:42:45.756088730 +0200
+@@ -0,0 +1,137 @@
 +#ifndef LIBAVCODEC_RPI_ZC_H
 +#define LIBAVCODEC_RPI_ZC_H
 +
 +// Zero-Copy frame code for RPi
 +// RPi needs Y/U/V planes to be contiguous for display.  By default
 +// ffmpeg will allocate separated planes so a memcpy is needed before
-+// display.  This code prodes a method a making ffmpeg allocate a single
-+// bit of memory for the frame when can then be refrence counted until
-+// display ahs finsihed with it.
++// display.  This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
 +
 +#include "libavutil/frame.h"
 +#include "libavcodec/avcodec.h"
@@ -15353,10 +18356,13 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.h ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +    unsigned int height_y;
 +    unsigned int stride_c;
 +    unsigned int height_c;
++    unsigned int planes_c;
++    unsigned int stripes;
 +} AVRpiZcFrameGeometry;
 +
 +
 +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format,
 +    const unsigned int video_width, const unsigned int video_height);
 +
 +// Replacement fn for avctx->get_buffer2
@@ -15365,7 +18371,7 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.h ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames
 +// must be set to 1 as otherwise the buffer info is killed before being returned
 +// by avcodec_decode_video2.  Note also that this means that the AVFrame that is
-+// return must be manually derefed with av_frame_unref.  This should be done
++// returned must be manually derefed with av_frame_unref.  This should be done
 +// after av_rpi_zc_ref has been called.
 +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
 +
@@ -15382,6 +18388,11 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.h ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +// Get the vc_handle from the frame ref
 +// Returns -1 if ref doesn't look valid
 +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Length of buffer data
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
 +// Get the number of bytes allocated from the frame ref
 +// Returns 0 if ref doesn't look valid
 +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
@@ -15398,6 +18409,8 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.h ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +// Allocate the environment used by the ZC code
 +void av_rpi_zc_env_free(AVZcEnvPtr);
 +
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
 +
 +// Init ZC into a context
 +// There is nothing magic in this fn - it just packages setting
@@ -15409,11 +18422,55 @@ diff -Naur ffmpeg-3.2.4/libavcodec/rpi_zc.h ffmpeg-3.2.4.patch/libavcodec/rpi_zc
 +// get_buffer2 & get_buffer_context
 +void av_rpi_zc_uninit(struct AVCodecContext * const s);
 +
++
++
++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame)
++{
++    return frame->linesize[3];
++}
++
++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    const unsigned int stride1 = frame->linesize[0];
++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++    const unsigned int stride1 = frame->linesize[0];
++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
++    const unsigned int x = x_c * 2;
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y);
++}
++
++static inline int rpi_sliced_frame(const AVFrame * const frame)
++{
++    return frame->format == AV_PIX_FMT_SAND128;
++}
++
++
 +#endif
 +
 diff -Naur ffmpeg-3.2.4/libavcodec/utils.c ffmpeg-3.2.4.patch/libavcodec/utils.c
 --- ffmpeg-3.2.4/libavcodec/utils.c	2017-02-10 14:25:27.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavcodec/utils.c	2017-03-22 22:42:34.853798588 +0100
++++ ffmpeg-3.2.4.patch/libavcodec/utils.c	2017-05-28 20:42:45.757088734 +0200
 @@ -26,6 +26,12 @@
   */
  
@@ -15501,9 +18558,32 @@ diff -Naur ffmpeg-3.2.4/libavcodec/utils.c ffmpeg-3.2.4.patch/libavcodec/utils.c
                  pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
                                                       CONFIG_MEMORY_POISONING ?
                                                          NULL :
+@@ -732,6 +791,11 @@
+     if (avctx->hw_frames_ctx)
+         return av_hwframe_get_buffer(avctx->hw_frames_ctx, frame, 0);
+ 
++#ifdef RPI
++    // This is going to end badly if we let it continue
++    av_assert0(frame->format != AV_PIX_FMT_SAND128);
++#endif
++
+     if ((ret = update_frame_pool(avctx, frame)) < 0)
+         return ret;
+ 
+diff -Naur ffmpeg-3.2.4/libavfilter/avfilter.c ffmpeg-3.2.4.patch/libavfilter/avfilter.c
+--- ffmpeg-3.2.4/libavfilter/avfilter.c	2017-02-10 14:25:27.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavfilter/avfilter.c	2017-05-28 20:42:45.758088737 +0200
+@@ -924,6 +924,7 @@
+                    "options, but options were provided: %s.\n", args);
+             return AVERROR(EINVAL);
+         }
++        printf("=== args='%s'\n", args);
+ 
+ #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR
+             if (   !strcmp(filter->filter->name, "format")     ||
 diff -Naur ffmpeg-3.2.4/libavformat/mpegts.c ffmpeg-3.2.4.patch/libavformat/mpegts.c
 --- ffmpeg-3.2.4/libavformat/mpegts.c	2017-02-10 14:25:27.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavformat/mpegts.c	2017-03-22 22:42:34.854798590 +0100
++++ ffmpeg-3.2.4.patch/libavformat/mpegts.c	2017-05-28 20:42:45.759088741 +0200
 @@ -701,7 +701,7 @@
  #endif
      { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
@@ -15515,7 +18595,7 @@ diff -Naur ffmpeg-3.2.4/libavformat/mpegts.c ffmpeg-3.2.4.patch/libavformat/mpeg
      { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS       },
 diff -Naur ffmpeg-3.2.4/libavformat/utils.c ffmpeg-3.2.4.patch/libavformat/utils.c
 --- ffmpeg-3.2.4/libavformat/utils.c	2017-02-10 14:25:27.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavformat/utils.c	2017-03-22 22:42:34.856798595 +0100
++++ ffmpeg-3.2.4.patch/libavformat/utils.c	2017-05-28 20:42:45.760088744 +0200
 @@ -733,7 +733,7 @@
          int default_stream_index = av_find_default_stream_index(s);
          if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
@@ -15527,7 +18607,7 @@ diff -Naur ffmpeg-3.2.4/libavformat/utils.c ffmpeg-3.2.4.patch/libavformat/utils
                  s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
 diff -Naur ffmpeg-3.2.4/libavutil/buffer.c ffmpeg-3.2.4.patch/libavutil/buffer.c
 --- ffmpeg-3.2.4/libavutil/buffer.c	2017-02-10 14:25:28.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavutil/buffer.c	2017-03-22 22:42:34.856798595 +0100
++++ ffmpeg-3.2.4.patch/libavutil/buffer.c	2017-05-28 20:42:45.760088744 +0200
 @@ -425,3 +425,9 @@
  
      return ret;
@@ -15540,7 +18620,7 @@ diff -Naur ffmpeg-3.2.4/libavutil/buffer.c ffmpeg-3.2.4.patch/libavutil/buffer.c
 +}
 diff -Naur ffmpeg-3.2.4/libavutil/buffer.h ffmpeg-3.2.4.patch/libavutil/buffer.h
 --- ffmpeg-3.2.4/libavutil/buffer.h	2017-02-10 14:25:28.000000000 +0100
-+++ ffmpeg-3.2.4.patch/libavutil/buffer.h	2017-03-22 22:42:34.856798595 +0100
++++ ffmpeg-3.2.4.patch/libavutil/buffer.h	2017-05-28 20:42:45.760088744 +0200
 @@ -283,6 +283,9 @@
   */
  AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
@@ -15551,19 +18631,133 @@ diff -Naur ffmpeg-3.2.4/libavutil/buffer.h ffmpeg-3.2.4.patch/libavutil/buffer.h
  /**
   * @}
   */
+diff -Naur ffmpeg-3.2.4/libavutil/pixdesc.c ffmpeg-3.2.4.patch/libavutil/pixdesc.c
+--- ffmpeg-3.2.4/libavutil/pixdesc.c	2017-02-10 14:25:28.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavutil/pixdesc.c	2017-05-28 20:42:45.761088748 +0200
+@@ -2092,6 +2092,18 @@
+         .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR |
+                  AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA,
+     },
++    [AV_PIX_FMT_SAND128] = {
++        .name = "sand128",
++        .nb_components = 3,
++        .log2_chroma_w = 1,
++        .log2_chroma_h = 1,
++        .comp = {
++            { 0, 1, 0, 0, 8, 0, 7, 1 },        /* Y */
++            { 1, 2, 0, 0, 8, 1, 7, 1 },        /* U */
++            { 1, 2, 1, 0, 8, 1, 7, 2 },        /* V */
++        },
++        .flags = 0,
++    }
+ };
+ #if FF_API_PLUS1_MINUS1
+ FF_ENABLE_DEPRECATION_WARNINGS
+diff -Naur ffmpeg-3.2.4/libavutil/pixfmt.h ffmpeg-3.2.4.patch/libavutil/pixfmt.h
+--- ffmpeg-3.2.4/libavutil/pixfmt.h	2017-02-10 14:25:28.000000000 +0100
++++ ffmpeg-3.2.4.patch/libavutil/pixfmt.h	2017-05-28 20:47:06.209006953 +0200
+@@ -306,6 +306,9 @@
+ 
+     AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec
+ 
++// RPI - deliberately not under #ifdef so that calling programs can always reference it
++    AV_PIX_FMT_SAND128,   ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
++
+     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+ };
+ 
+diff -Naur ffmpeg-3.2.4/libswscale/input.c ffmpeg-3.2.4.patch/libswscale/input.c
+--- ffmpeg-3.2.4/libswscale/input.c	2017-02-10 14:25:28.000000000 +0100
++++ ffmpeg-3.2.4.patch/libswscale/input.c	2017-05-28 20:42:45.762088751 +0200
+@@ -719,6 +719,14 @@
+     }
+ }
+ 
++
++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
++                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
++                       int width, uint32_t *unused)
++{
++    // NIF: stub with an empty body - nothing is read from src1/src2 or written to dstU/dstV
++}
++
+ #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+ 
+ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+@@ -1085,6 +1093,9 @@
+     case AV_PIX_FMT_P010BE:
+         c->chrToYV12 = p010BEToUV_c;
+         break;
++    case AV_PIX_FMT_SAND128:
++        c->chrToYV12 = sand128ToUV_c;
++        break;
+     }
+     if (c->chrSrcHSubSample) {
+         switch (srcFormat) {
+diff -Naur ffmpeg-3.2.4/libswscale/utils.c ffmpeg-3.2.4.patch/libswscale/utils.c
+--- ffmpeg-3.2.4/libswscale/utils.c	2017-02-10 14:25:28.000000000 +0100
++++ ffmpeg-3.2.4.patch/libswscale/utils.c	2017-05-28 20:48:08.013211613 +0200
+@@ -248,6 +248,9 @@
+     [AV_PIX_FMT_AYUV64LE]    = { 1, 1},
+     [AV_PIX_FMT_P010LE]      = { 1, 1 },
+     [AV_PIX_FMT_P010BE]      = { 1, 1 },
++#ifdef RPI
++    [AV_PIX_FMT_SAND128]     = { 1, 0 },
++#endif
+ };
+ 
+ int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
+diff -Naur ffmpeg-3.2.4/pi-util/conf1.sh ffmpeg-3.2.4.patch/pi-util/conf1.sh
+--- ffmpeg-3.2.4/pi-util/conf1.sh	1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-3.2.4.patch/pi-util/conf1.sh	2017-05-28 20:42:45.764088759 +0200
+@@ -0,0 +1,34 @@
++echo "Configure for Pi1"
++
++RPI_BUILDROOT=`pwd`/build
++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
++RPI_OPT_VC=$RPI_ROOTFS/opt/vc
++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
++#RPI_DEFS="-D__VCCOREVER__=0x04000000"
++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++./configure --enable-cross-compile\
++ --cpu=arm1176jzf-s\
++ --arch=armv\
++ --disable-neon\
++ --target-os=linux\
++ --disable-stripping\
++ --enable-mmal\
++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
++
++
++# --enable-extra-warnings\
++# --arch=armv71\
++# --enable-shared\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
 diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h265.csv
 --- ffmpeg-3.2.4/pi-util/conf_h265.csv	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/pi-util/conf_h265.csv	2017-03-22 22:42:34.857798598 +0100
++++ ffmpeg-3.2.4.patch/pi-util/conf_h265.csv	2017-05-28 20:42:45.764088759 +0200
 @@ -0,0 +1,144 @@
 +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
-+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
 +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
@@ -15585,7 +18779,7 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h2
 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
@@ -15625,7 +18819,7 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h2
 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
 +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
 +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
@@ -15639,10 +18833,10 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h2
 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
@@ -15671,7 +18865,7 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h2
 +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
 +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
 +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
 +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
@@ -15680,9 +18874,9 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h2
 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
 +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
 +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
@@ -15701,7 +18895,7 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf_h265.csv ffmpeg-3.2.4.patch/pi-util/conf_h2
 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
 diff -Naur ffmpeg-3.2.4/pi-util/conf.sh ffmpeg-3.2.4.patch/pi-util/conf.sh
 --- ffmpeg-3.2.4/pi-util/conf.sh	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/pi-util/conf.sh	2017-03-22 22:42:34.857798598 +0100
++++ ffmpeg-3.2.4.patch/pi-util/conf.sh	2017-05-28 20:42:45.764088759 +0200
 @@ -0,0 +1,33 @@
 +echo "Configure for Pi2/3"
 +
@@ -15738,8 +18932,8 @@ diff -Naur ffmpeg-3.2.4/pi-util/conf.sh ffmpeg-3.2.4.patch/pi-util/conf.sh
 +# -Wa,-ahls
 diff -Naur ffmpeg-3.2.4/pi-util/ffconf.py ffmpeg-3.2.4.patch/pi-util/ffconf.py
 --- ffmpeg-3.2.4/pi-util/ffconf.py	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/pi-util/ffconf.py	2017-03-22 22:42:34.857798598 +0100
-@@ -0,0 +1,146 @@
++++ ffmpeg-3.2.4.patch/pi-util/ffconf.py	2017-05-28 20:42:45.765088762 +0200
+@@ -0,0 +1,154 @@
 +#!/usr/bin/env python
 +
 +import os
@@ -15783,16 +18977,18 @@ diff -Naur ffmpeg-3.2.4/pi-util/ffconf.py ffmpeg-3.2.4.patch/pi-util/ffconf.py
 +    except:
 +        pass
 +
-+    rv = False
 +    if  m1 and m2 and m1.group() == m2.group():
 +        print >> flog, "Match: " + m1.group()
-+        rv = True
++        rv = 0
 +    elif not m1:
 +        print >> flog, "****** Cannot find m1"
++        rv = 3
 +    elif not m2:
 +        print >> flog, "****** Cannot find m2"
++        rv = 2
 +    else:
 +        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
++        rv = 1
 +    flog.close()
 +    return rv
 +
@@ -15838,19 +19034,25 @@ diff -Naur ffmpeg-3.2.4/pi-util/ffconf.py ffmpeg-3.2.4.patch/pi-util/ffconf.py
 +            print "==== ", name,
 +            sys.stdout.flush()
 +
-+            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
-+                if exp_test == 1:
-+                    failures.append(name)
-+                    print ": * FAIL *"
-+                else:
-+                    print ": fail"
-+            else:
++            rv = testone(os.path.join(conf_root, name), name, a[2], a[3])
++            if (rv == 0):
 +                if exp_test == 2:
 +                    print ": * OK *"
 +                    unx_success.append(name)
 +                else:
 +                    print ": ok"
-+
++            elif exp_test > 1 and rv == 1:
++                print ": fail"
++            else:
++                failures.append(name)
++                if rv == 1:
++                    print ": * FAIL *"
++                elif (rv == 2) :
++                    print ": * CRASH *"
++                elif (rv == 3) :
++                    print ": * MD5 MISSING *"
++                else :
++                    print ": * BANG *"
 +
 +    if failures or unx_success:
 +        print "Unexpected Failures:", failures
@@ -15888,7 +19090,7 @@ diff -Naur ffmpeg-3.2.4/pi-util/ffconf.py ffmpeg-3.2.4.patch/pi-util/ffconf.py
 +
 diff -Naur ffmpeg-3.2.4/pi-util/qasm.py ffmpeg-3.2.4.patch/pi-util/qasm.py
 --- ffmpeg-3.2.4/pi-util/qasm.py	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/pi-util/qasm.py	2017-03-22 22:42:34.860798606 +0100
++++ ffmpeg-3.2.4.patch/pi-util/qasm.py	2017-05-28 20:42:45.767088769 +0200
 @@ -0,0 +1,2502 @@
 +#!/usr/bin/env python
 +
@@ -18392,9 +21594,22 @@ diff -Naur ffmpeg-3.2.4/pi-util/qasm.py ffmpeg-3.2.4.patch/pi-util/qasm.py
 +
 +if __name__ == '__main__':
 +   main()
+diff -Naur ffmpeg-3.2.4/pi-util/qem.sh ffmpeg-3.2.4.patch/pi-util/qem.sh
+--- ffmpeg-3.2.4/pi-util/qem.sh	1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-3.2.4.patch/pi-util/qem.sh	2017-05-28 20:42:45.767088769 +0200
+@@ -0,0 +1,9 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ pi-util/qasm.py
++SRC_FILE=libavcodec/rpi_shader.qasm
++DST_BASE=shader
++
++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
 diff -Naur ffmpeg-3.2.4/pi-util/rebase_liblinks.py ffmpeg-3.2.4.patch/pi-util/rebase_liblinks.py
 --- ffmpeg-3.2.4/pi-util/rebase_liblinks.py	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/pi-util/rebase_liblinks.py	2017-03-22 22:42:34.860798606 +0100
++++ ffmpeg-3.2.4.patch/pi-util/rebase_liblinks.py	2017-05-28 20:42:45.767088769 +0200
 @@ -0,0 +1,37 @@
 +#!/usr/bin/env python
 +
@@ -18435,7 +21650,7 @@ diff -Naur ffmpeg-3.2.4/pi-util/rebase_liblinks.py ffmpeg-3.2.4.patch/pi-util/re
 +
 diff -Naur ffmpeg-3.2.4/pi-util/syncroot.sh ffmpeg-3.2.4.patch/pi-util/syncroot.sh
 --- ffmpeg-3.2.4/pi-util/syncroot.sh	1970-01-01 01:00:00.000000000 +0100
-+++ ffmpeg-3.2.4.patch/pi-util/syncroot.sh	2017-03-22 22:42:34.860798606 +0100
++++ ffmpeg-3.2.4.patch/pi-util/syncroot.sh	2017-05-28 20:42:45.767088769 +0200
 @@ -0,0 +1,43 @@
 +set -e
 +
@@ -18480,3 +21695,135 @@ diff -Naur ffmpeg-3.2.4/pi-util/syncroot.sh ffmpeg-3.2.4.patch/pi-util/syncroot.
 +pi-util/rebase_liblinks.py $DST
 +
 +
+diff -Naur ffmpeg-3.2.4/pi-util/v3dusage.py ffmpeg-3.2.4.patch/pi-util/v3dusage.py
+--- ffmpeg-3.2.4/pi-util/v3dusage.py	1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-3.2.4.patch/pi-util/v3dusage.py	2017-05-28 20:42:45.768088773 +0200
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def do_logparse(logname):
++
++    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
++    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
++    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
++
++    ttotal = {'idle':0.0}
++    tstart = {}
++    qctotal = {}
++    qtstotal = {}
++    l2hits = {}
++    l2total = {}
++    time0 = None
++    idle_start = None
++    qpu_op_no = 0
++    op_count = 0
++
++    with open(logname, "rt") as infile:
++        for line in infile:
++            match = rmatch.match(line)
++            if match:
++#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++                time = float(match.group(1))
++                unit = match.group(3)
++                opstart = not match.group(2)
++                optype = match.group(7)
++                hascb = match.group(8) != "0"
++
++                if unit == 'qpu1':
++                    unit = unit + "." + str(qpu_op_no)
++                    if not opstart:
++                        if hascb or optype == 'EXECUTE_SYNC':
++                            qpu_op_no = 0
++                        else:
++                            qpu_op_no += 1
++
++                # Ignore sync type
++                if optype == 'EXECUTE_SYNC':
++                    continue
++
++                if time0 is None:  # None means "unset"; a 0.000 timestamp is a valid start
++                    time0 = time
++
++                if opstart:
++                    tstart[unit] = time;
++                elif unit in tstart:
++                    op_count += 1
++                    if not unit in ttotal:
++                        ttotal[unit] = 0.0
++                    ttotal[unit] += time - tstart[unit]
++                    del tstart[unit]
++
++                if idle_start is None and not tstart:  # nothing running: idle period begins
++                    idle_start = time
++                elif idle_start is not None and tstart:  # something started: idle period ends
++                    ttotal['idle'] += time - idle_start
++                    idle_start = None
++
++            match = rqcycle.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in qctotal:
++                    qctotal[unit] = 0
++                qctotal[unit] += int(match.group(2))
++
++            match = rqtscycle.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in qtstotal:
++                    qtstotal[unit] = 0
++                qtstotal[unit] += int(match.group(2))
++
++            match = rl2hits.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in l2total:
++                    l2total[unit] = 0
++                    l2hits[unit] = 0
++                l2total[unit] += int(match.group(3))
++                if match.group(2) == "hits":
++                    l2hits[unit] += int(match.group(3))
++
++
++    if time0 is None:
++        print "No v3d profile records found"
++    else:
++        tlogged = time - time0
++
++        print "Logged time:", tlogged, "  Op count:", op_count
++        for unit in sorted(ttotal):
++            print b'%6s: %10.3f    %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++        print
++        for unit in sorted(qctotal):
++            if not unit in qtstotal:
++                qtstotal[unit] = 0;
++            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++            if unit in l2total:
++                print b'        L2Total: %10d, hits:      %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
++
++
++if __name__ == '__main__':
++    argp = argparse.ArgumentParser(
++        formatter_class=argparse.RawDescriptionHelpFormatter,
++        description="QPU/VPU perf summary from VC logging",
++        epilog = """
++Will also summarise TMU stalls if logging requests set in qpu noflush param
++in the profiled code.
++
++Example use:
++  vcgencmd set_logging level=0xc0
++  <command to profile>
++  sudo vcdbg log msg >& t.log
++  v3dusage.py t.log
++""")
++
++    argp.add_argument("logfile")
++    args = argp.parse_args()
++
++    do_logparse(args.logfile)
++
diff --git a/packages/multimedia/intel-vaapi-driver/package.mk b/packages/multimedia/intel-vaapi-driver/package.mk
index 7cf147daf57..69f1d0690fd 100644
--- a/packages/multimedia/intel-vaapi-driver/package.mk
+++ b/packages/multimedia/intel-vaapi-driver/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="intel-vaapi-driver"
-PKG_VERSION="1.8.0"
+PKG_VERSION="1.8.2"
 PKG_REV="1"
 PKG_ARCH="x86_64"
 PKG_LICENSE="GPL"
diff --git a/packages/multimedia/libhdhomerun/package.mk b/packages/multimedia/libhdhomerun/package.mk
index 5e1d7401dfc..74e75095efe 100644
--- a/packages/multimedia/libhdhomerun/package.mk
+++ b/packages/multimedia/libhdhomerun/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libhdhomerun"
-PKG_VERSION="20150826"
+PKG_VERSION="20161117"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="LGPL"
diff --git a/packages/multimedia/libva/package.mk b/packages/multimedia/libva/package.mk
index b5e9024cab1..e7ffd3c87c5 100644
--- a/packages/multimedia/libva/package.mk
+++ b/packages/multimedia/libva/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libva"
-PKG_VERSION="1.8.0"
+PKG_VERSION="1.8.2"
 PKG_REV="1"
 PKG_ARCH="x86_64"
 PKG_LICENSE="GPL"
@@ -41,11 +41,3 @@ PKG_CONFIGURE_OPTS_TARGET="--disable-silent-rules \
                            --disable-wayland \
                            --disable-dummy-driver \
                            --with-drivers-path=/usr/lib/va"
-
-post_makeinstall_target() {
-  rm -rf $INSTALL/usr/bin
-  if [ "$DEVTOOLS" = yes ]; then
-    mkdir -p $INSTALL/usr/bin
-      cp test/vainfo/.libs/vainfo $INSTALL/usr/bin
-  fi
-}
diff --git a/packages/multimedia/rtmpdump/package.mk b/packages/multimedia/rtmpdump/package.mk
index e8d16a06aa8..f13337a879b 100644
--- a/packages/multimedia/rtmpdump/package.mk
+++ b/packages/multimedia/rtmpdump/package.mk
@@ -35,6 +35,10 @@ PKG_AUTORECONF="no"
 
 MAKEFLAGS="-j1"
 
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC"
+}
+
 make_target() {
   make prefix=/usr \
        incdir=/usr/include/librtmp \
diff --git a/packages/network/bluez/package.mk b/packages/network/bluez/package.mk
index 4ef6f231b59..451e4db72df 100644
--- a/packages/network/bluez/package.mk
+++ b/packages/network/bluez/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="bluez"
-PKG_VERSION="5.44"
+PKG_VERSION="5.45"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/network/connman/system.d/network-online.service b/packages/network/connman/system.d/network-online.service
index a6d42403f9c..c52f8045717 100644
--- a/packages/network/connman/system.d/network-online.service
+++ b/packages/network/connman/system.d/network-online.service
@@ -3,14 +3,10 @@ Description=Wait for network to be configured by ConnMan
 Requisite=connman.service
 After=connman.service
 Before=network-online.target
-DefaultDependencies=no
-Conflicts=shutdown.target
 
 [Service]
 Type=oneshot
-ExecStartPre=/bin/sh -c 'echo "waiting on Network to come online ..."'
 ExecStart=/usr/sbin/connmand-wait-online --timeout=30
-StandardOutput=tty
 RemainAfterExit=yes
 
 [Install]
diff --git a/packages/network/libnfs/package.mk b/packages/network/libnfs/package.mk
index 88bed1a145f..8b8fe1eb708 100644
--- a/packages/network/libnfs/package.mk
+++ b/packages/network/libnfs/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libnfs"
-PKG_VERSION="libnfs-1.11.0"
+PKG_VERSION="14adfbf"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/network/libnfs/patches/libnfs-glibc-2.25.patch b/packages/network/libnfs/patches/libnfs-glibc-2.25.patch
deleted file mode 100644
index 2eb5a666fa4..00000000000
--- a/packages/network/libnfs/patches/libnfs-glibc-2.25.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff -Naur libnfs-libnfs-1.11.0/lib/libnfs.c libnfs-libnfs-1.11.0.patch/lib/libnfs.c
---- libnfs-libnfs-1.11.0/lib/libnfs.c	2016-10-09 20:23:11.000000000 +0200
-+++ libnfs-libnfs-1.11.0.patch/lib/libnfs.c	2017-02-06 09:36:22.936702608 +0100
-@@ -68,10 +68,6 @@
- #include <sys/mkdev.h>
- #endif
- 
--#ifdef MAJOR_IN_SYSMACROS
--#include <sys/sysmacros.h>
--#endif
--
- #include <stdio.h>
- #include <stdarg.h>
- #include <stdlib.h>
-@@ -79,6 +75,7 @@
- #include <assert.h>
- #include <errno.h>
- #include <time.h>
-+#include <sys/sysmacros.h>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <fcntl.h>
diff --git a/packages/network/libssh/package.mk b/packages/network/libssh/package.mk
index f2a210e9bcd..475e2716c3a 100644
--- a/packages/network/libssh/package.mk
+++ b/packages/network/libssh/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libssh"
-PKG_VERSION="0.7.3"
+PKG_VERSION="0.7.5"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="OpenSource"
diff --git a/packages/network/samba/package.mk b/packages/network/samba/package.mk
index 5a28315a118..0a2284f0bd1 100644
--- a/packages/network/samba/package.mk
+++ b/packages/network/samba/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="samba"
-PKG_VERSION="4.6.2"
+PKG_VERSION="4.6.4"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPLv3+"
@@ -148,7 +148,7 @@ post_makeinstall_target() {
     mkdir -p $INSTALL/etc/samba
       cp $PKG_DIR/config/smb.conf $INSTALL/etc/samba
     mkdir -p $INSTALL/usr/config
-      cp $PKG_DIR/config/smb.conf $INSTALL/usr/config/samba.conf.sample
+      cp $PKG_DIR/config/smb.conf $INSTALL/usr/config/samba4.conf.sample
   fi
 
   if [ "$DEVTOOLS" = "yes" ]; then
diff --git a/packages/network/samba/scripts/samba-config b/packages/network/samba/scripts/samba-config
index 6496523d4ef..d6f85e78b94 100755
--- a/packages/network/samba/scripts/samba-config
+++ b/packages/network/samba/scripts/samba-config
@@ -17,7 +17,7 @@
 #  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
 ################################################################################
 
-SMB_USERCONF="/storage/.config/samba.conf"
+SMB_USERCONF="/storage/.config/samba4.conf"
 SMB_DEFCONF="/etc/samba/smb.conf"
 SMB_CONF="/run/samba/smb.conf"
 
diff --git a/packages/print/freetype/package.mk b/packages/print/freetype/package.mk
index 36a33629b3b..42b711c6dfd 100644
--- a/packages/print/freetype/package.mk
+++ b/packages/print/freetype/package.mk
@@ -42,6 +42,7 @@ PKG_CONFIGURE_OPTS_TARGET="--enable-static \
                            --with-harfbuzz=no"
 
 pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC"
   # unset LIBTOOL because freetype uses its own
     ( cd ..
       unset LIBTOOL
diff --git a/packages/security/libgpg-error/package.mk b/packages/security/libgpg-error/package.mk
index a1fc1ef8d18..a08de3ae0f2 100644
--- a/packages/security/libgpg-error/package.mk
+++ b/packages/security/libgpg-error/package.mk
@@ -56,6 +56,7 @@ pre_configure_target() {
   esac
 
   cp $ROOT/$PKG_BUILD/src/syscfg/lock-obj-pub.$GPGERROR_TUPLE.h $ROOT/$PKG_BUILD/src/syscfg/lock-obj-pub.$GPGERROR_TARGET.h
+  CFLAGS="$CFLAGS -fPIC"
 }
 
 post_makeinstall_target() {
diff --git a/packages/sysutils/busybox/scripts/fs-resize b/packages/sysutils/busybox/scripts/fs-resize
index c5ce5ba7794..0307e30872c 100755
--- a/packages/sysutils/busybox/scripts/fs-resize
+++ b/packages/sysutils/busybox/scripts/fs-resize
@@ -17,14 +17,23 @@
 #  along with OpenELEC.  If not, see <http://www.gnu.org/licenses/>.
 ################################################################################
 
+progress() {
+  if [ -e /dev/psplash_fifo ] ; then
+    echo "MSG $1" > /dev/psplash_fifo
+      usleep 2000000 # wait a bit until the last message is written
+  elif [ "$PROGRESS" = "yes" ] ; then
+    echo "### $1 ###"
+  fi
+}
+
 if [ -e /storage/.please_resize_me ] ; then
   # this sh** was never intended to be used
   # on already installed and runing system
-  if [ -d /storage/.kodi -o -d /storage/.config -o -d /storage/.cache ] ; then
+  if [ -d /storage/.kodi -a -d /storage/.config -a -d /storage/.cache ] ; then
     rm -f /storage/.please_resize_me
     sync
-    echo "resizing not allowed. rebooting in 15s"
-    sleep 15
+    progress "resizing not allowed. rebooting in 10s..."
+    sleep 10
     reboot -f
   fi
   # get the disk. /storage on 2nd partition
@@ -51,18 +60,18 @@ if [ -e /storage/.please_resize_me ] ; then
     # failed to get partition start offset ?
     if [ ! -z "$PART_START" ] ; then
       umount $PART
-      echo "resizing /storage..."
-      echo "Please do not reboot or turn off your @DISTRONAME@ device!"
-      echo "... parted -s -m $DISK rm 2"
+      progress "resizing /storage..."
+      progress "Please do not reboot or turn off your @DISTRONAME@ device!"
+      progress "parted -s -m $DISK rm 2"
       parted -s -m $DISK rm 2 &>/dev/null
-      echo "... parted -s -m $DISK unit b mkpart primary $PART_START 100%"
+      progress "parted -s -m $DISK unit b mkpart primary $PART_START 100%"
       parted -s -m $DISK unit b mkpart primary $PART_START 100% &>/dev/null
-      echo "... e2fsck -f -p $PART"
+      progress "e2fsck -f -p $PART"
       e2fsck -f -p $PART &>/dev/null
-      echo "... resize2fs $PART"
+      progress "resize2fs $PART"
       resize2fs $PART &>/dev/null
-      echo "...done. rebooting in 15s"
-      sleep 15
+      progress "...done. rebooting in 10s"
+      sleep 10
     fi
   fi
 fi
diff --git a/packages/sysutils/busybox/scripts/init b/packages/sysutils/busybox/scripts/init
index 86a91bba011..fd7c5ca4d08 100755
--- a/packages/sysutils/busybox/scripts/init
+++ b/packages/sysutils/busybox/scripts/init
@@ -139,6 +139,7 @@
   progress() {
     if [ "$PROGRESS" = "yes" -a -e /dev/psplash_fifo ] ; then
       echo "MSG $1" > /dev/psplash_fifo
+        usleep 500000 # wait a bit until the last message is written
     elif [ "$PROGRESS" = "yes" ] ; then
       echo "### $1 ###"
     fi
diff --git a/packages/sysutils/busybox/system.d/var.mount b/packages/sysutils/busybox/system.d/var.mount
index 2207fa61a8e..1eea76fa0dd 100644
--- a/packages/sysutils/busybox/system.d/var.mount
+++ b/packages/sysutils/busybox/system.d/var.mount
@@ -10,7 +10,6 @@ Description=Variable Directory
 Documentation=man:hier(7)
 Documentation=http://www.freedesktop.org/wiki/Software/systemd/APIFileSystems
 DefaultDependencies=no
-Conflicts=umount.target
 Before=local-fs.target umount.target
 
 [Mount]
diff --git a/packages/sysutils/dbus/package.mk b/packages/sysutils/dbus/package.mk
index a1bf4e2acba..34ce674bf0b 100644
--- a/packages/sysutils/dbus/package.mk
+++ b/packages/sysutils/dbus/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="dbus"
-PKG_VERSION="1.10.16"
+PKG_VERSION="1.10.18"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/sysutils/dosfstools/package.mk b/packages/sysutils/dosfstools/package.mk
index c1dd72f7a18..78717c29d04 100644
--- a/packages/sysutils/dosfstools/package.mk
+++ b/packages/sysutils/dosfstools/package.mk
@@ -17,48 +17,28 @@
 ################################################################################
 
 PKG_NAME="dosfstools"
-PKG_VERSION="3.0.28"
+PKG_VERSION="4.1"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPLv3"
 PKG_SITE="https://github.com/dosfstools/dosfstools"
 PKG_URL="https://github.com/dosfstools/dosfstools/releases/download/v$PKG_VERSION/$PKG_NAME-$PKG_VERSION.tar.xz"
 PKG_DEPENDS_TARGET="toolchain"
-PKG_DEPENDS_INIT="toolchain dosfstools gcc:init"
+PKG_DEPENDS_INIT="toolchain gcc:init"
 PKG_PRIORITY="optional"
 PKG_SECTION="tools"
 PKG_SHORTDESC="dosfstools: utilities for making and checking MS-DOS FAT filesystems."
 PKG_LONGDESC="dosfstools contains utilities for making and checking MS-DOS FAT filesystems."
 
 PKG_IS_ADDON="no"
-PKG_AUTORECONF="no"
+PKG_AUTORECONF="yes"
 
-PKG_MAKE_OPTS_TARGET="PREFIX=/usr"
-PKG_MAKEINSTALL_OPTS_TARGET="PREFIX=/usr"
-
-make_init() {
-  : # reuse make_target()
-}
-
-pre_build_host() {
-  mkdir -p $PKG_BUILD/.$HOST_NAME
-  cp -RP $PKG_BUILD/* $PKG_BUILD/.$HOST_NAME
-}
-
-make_host() {
-  cd $ROOT/$PKG_BUILD/.$HOST_NAME
-  make PREFIX=/usr
-}
-
-makeinstall_init() {
-  mkdir -p $INSTALL/sbin
-    cp fsck.fat $INSTALL/sbin
-    ln -sf fsck.fat $INSTALL/sbin/fsck.msdos
-    ln -sf fsck.fat $INSTALL/sbin/fsck.vfat
-}
+PKG_CONFIGURE_OPTS_TARGET="--without-udev"
+PKG_CONFIGURE_OPTS_INIT="--without-udev"
+PKG_CONFIGURE_OPTS_HOST="--without-udev"
 
 makeinstall_host() {
   mkdir -p $ROOT/$TOOLCHAIN/sbin
-    cp mkfs.fat $ROOT/$TOOLCHAIN/sbin
+    cp src/mkfs.fat $ROOT/$TOOLCHAIN/sbin
     ln -sf mkfs.fat $ROOT/$TOOLCHAIN/sbin/mkfs.vfat
 }
diff --git a/packages/sysutils/fuse-exfat/package.mk b/packages/sysutils/fuse-exfat/package.mk
index 3f7ce899396..846ed412a89 100644
--- a/packages/sysutils/fuse-exfat/package.mk
+++ b/packages/sysutils/fuse-exfat/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="fuse-exfat"
-PKG_VERSION="1.2.4"
+PKG_VERSION="1.2.6"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPLv2+"
diff --git a/packages/sysutils/fuse/modules-load.d/fuse.conf b/packages/sysutils/fuse/modules-load.d/fuse.conf
new file mode 100644
index 00000000000..a517c488f34
--- /dev/null
+++ b/packages/sysutils/fuse/modules-load.d/fuse.conf
@@ -0,0 +1 @@
+fuse
diff --git a/packages/sysutils/fuse/system.d/sys-fs-fuse-connections.mount b/packages/sysutils/fuse/system.d/sys-fs-fuse-connections.mount
new file mode 100644
index 00000000000..4ed108dedd1
--- /dev/null
+++ b/packages/sysutils/fuse/system.d/sys-fs-fuse-connections.mount
@@ -0,0 +1,10 @@
+[Unit]
+Description=FUSE Control File System
+DefaultDependencies=no
+ConditionPathExists=/sys/fs/fuse/connections
+Before=sysinit.target
+
+[Mount]
+What=fusectl
+Where=/sys/fs/fuse/connections
+Type=fusectl
diff --git a/packages/sysutils/imx6-status-led/system.d/imx6-status-led.service b/packages/sysutils/imx6-status-led/system.d/imx6-status-led.service
index 66e1ca77405..01134d30299 100644
--- a/packages/sysutils/imx6-status-led/system.d/imx6-status-led.service
+++ b/packages/sysutils/imx6-status-led/system.d/imx6-status-led.service
@@ -1,6 +1,6 @@
 [Unit]
 Description=i.MX6 status led
-After=kodi.service
+After=graphical.target
 
 [Service]
 Type=oneshot
@@ -9,4 +9,4 @@ ExecStop=-/bin/sh -c "/usr/lib/openelec/imx6-status-led heartbeat"
 RemainAfterExit=yes
 
 [Install]
-WantedBy=multi-user.target
+WantedBy=kodi.service
diff --git a/packages/sysutils/irqbalance/patches/irqbalance-1.0.4-env-file-path.patch b/packages/sysutils/irqbalance/patches/irqbalance-1.0.4-env-file-path.patch
deleted file mode 100644
index c089bd5349a..00000000000
--- a/packages/sysutils/irqbalance/patches/irqbalance-1.0.4-env-file-path.patch
+++ /dev/null
@@ -1,12 +0,0 @@
-diff -up irqbalance-1.0.4/misc/irqbalance.service.orig irqbalance-1.0.4/misc/irqbalance.service
---- irqbalance-1.0.4/misc/irqbalance.service.orig	2012-08-29 16:24:42.011844627 +0200
-+++ irqbalance-1.0.4/misc/irqbalance.service	2012-08-29 16:24:59.817845765 +0200
-@@ -3,7 +3,7 @@ Description=irqbalance daemon
- After=syslog.target
- 
- [Service]
--EnvironmentFile=/path/to/irqbalance.env
-+EnvironmentFile=/etc/irqbalance
- ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
- 
- [Install]
diff --git a/packages/sysutils/irqbalance/patches/irqbalance-systemd.patch b/packages/sysutils/irqbalance/patches/irqbalance-systemd.patch
deleted file mode 100644
index 7f2fe116677..00000000000
--- a/packages/sysutils/irqbalance/patches/irqbalance-systemd.patch
+++ /dev/null
@@ -1,12 +0,0 @@
-diff -Naur irqbalance-aa04f78/configure.ac irqbalance-aa04f78.patch/configure.ac
---- irqbalance-aa04f78/configure.ac	2016-06-27 14:34:47.000000000 +0200
-+++ irqbalance-aa04f78.patch/configure.ac	2016-09-15 15:40:19.663695774 +0200
-@@ -35,7 +35,7 @@
- )
- AS_IF(
-   [test "x$with_systemd" = xyes], [
--    PKG_CHECK_MODULES([SYSTEMD], [libsystemd-journal], [journal_lib=yes])
-+    PKG_CHECK_MODULES([SYSTEMD], [libsystemd], [journal_lib=yes])
-     AC_DEFINE(HAVE_LIBSYSTEMD, 1, [systemd support])
-     AC_CHECK_LIB([systemd], [sd_journal_print_with_location])
-     AC_CHECK_LIB([systemd], [sd_journal_print])
diff --git a/packages/sysutils/keyutils/package.mk b/packages/sysutils/keyutils/package.mk
index b2c3a703514..8e8aab79e9f 100644
--- a/packages/sysutils/keyutils/package.mk
+++ b/packages/sysutils/keyutils/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="keyutils"
-PKG_VERSION="1.5.9"
+PKG_VERSION="1.5.10"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/sysutils/keyutils/patches/keyutils-01-allow-building-of-the-shared-library-to-be-suppressed.patch b/packages/sysutils/keyutils/patches/keyutils-01-allow-building-of-the-shared-library-to-be-suppressed.patch
deleted file mode 100644
index 88d32b0cf0d..00000000000
--- a/packages/sysutils/keyutils/patches/keyutils-01-allow-building-of-the-shared-library-to-be-suppressed.patch
+++ /dev/null
@@ -1,104 +0,0 @@
-keyutils: Allow building of the shared library to be suppressed
-
-Upstream patch:
-  https://kernel.googlesource.com/pub/scm/linux/kernel/git/dhowells/keyutils/+/a4deb71ddc05e951c8be8d46615beed9d408a5c8
-
-Signed-off-by: Vicente Olivert Riera <vincent.riera@imgtec.com>
-
-LIB: Allow building of the shared library to be suppressed
-
-Allow building of the shared library to be suppressed by passing NO_SOLIB=1 to
-the Makefile.
-
-Reported-and-tested-by: Vicente Olivert Riera <vincent.riera@imgtec.com>
-Signed-off-by: David Howells <dhowells@redhat.com>
-diff --git a/Makefile b/Makefile
-index c904eaf..5dd2113 100644
---- a/Makefile
-+++ b/Makefile
-@@ -5,6 +5,7 @@
- SPECFILE	:= keyutils.spec
- NO_GLIBC_KEYERR	:= 0
- NO_ARLIB	:= 0
-+NO_SOLIB	:= 0
- ETCDIR		:= /etc
- BINDIR		:= /bin
- SBINDIR		:= /sbin
-@@ -95,7 +96,7 @@
- # Normal build rule
- #
- ###############################################################################
--all: $(DEVELLIB) keyctl request-key key.dns_resolver
-+all: keyctl request-key key.dns_resolver
- 
- ###############################################################################
- #
-@@ -104,20 +105,23 @@
- ###############################################################################
- #RPATH = -Wl,-rpath,$(LIBDIR)
- 
--ifeq ($(NO_ARLIB),0)
--all: $(ARLIB)
--$(ARLIB): keyutils.o
--	$(AR) rcs $@ $<
--endif
--
- VCPPFLAGS	:= -DPKGBUILD="\"$(shell date -u +%F)\""
- VCPPFLAGS	+= -DPKGVERSION="\"keyutils-$(VERSION)\""
- VCPPFLAGS	+= -DAPIVERSION="\"libkeyutils-$(APIVERSION)\""
- 
-+ifeq ($(NO_ARLIB),0)
-+all: $(ARLIB)
-+$(ARLIB): keyutils.o
-+	$(AR) rcs $@ $<
-+
- keyutils.o: keyutils.c keyutils.h Makefile
- 	$(CC) $(CPPFLAGS) $(VCPPFLAGS) $(CFLAGS) -UNO_GLIBC_KEYERR -o $@ -c $<
-+LIB_DEPENDENCY	:= libkeyutils.a
-+endif
- 
- 
-+ifeq ($(NO_SOLIB),0)
-+all: $(DEVELLIB)
- $(DEVELLIB): $(SONAME)
- 	ln -sf $< $@
- 
-@@ -131,6 +135,8 @@
- 
- keyutils.os: keyutils.c keyutils.h Makefile
- 	$(CC) $(CPPFLAGS) $(VCPPFLAGS) $(CFLAGS) -fPIC -o $@ -c $<
-+LIB_DEPENDENCY	:= $(DEVELLIB)
-+endif
- 
- ###############################################################################
- #
-@@ -140,13 +146,13 @@
- %.o: %.c keyutils.h Makefile
- 	$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ -c $<
- 
--keyctl: keyctl.o $(DEVELLIB)
-+keyctl: keyctl.o $(LIB_DEPENDENCY)
- 	$(CC) -L. $(CFLAGS) $(LDFLAGS) $(RPATH) -o $@ $< -lkeyutils
- 
--request-key: request-key.o $(DEVELLIB)
-+request-key: request-key.o $(LIB_DEPENDENCY)
- 	$(CC) -L. $(CFLAGS) $(LDFLAGS) $(RPATH) -o $@ $< -lkeyutils
- 
--key.dns_resolver: key.dns_resolver.o $(DEVELLIB)
-+key.dns_resolver: key.dns_resolver.o $(LIB_DEPENDENCY)
- 	$(CC) -L. $(CFLAGS) $(LDFLAGS) $(RPATH) -o $@ $< -lkeyutils -lresolv
- 
- ###############################################################################
-@@ -158,10 +164,12 @@
- ifeq ($(NO_ARLIB),0)
- 	$(INSTALL) -D -m 0644 $(ARLIB) $(DESTDIR)$(USRLIBDIR)/$(ARLIB)
- endif
-+ifeq ($(NO_SOLIB),0)
- 	$(INSTALL) -D $(LIBNAME) $(DESTDIR)$(LIBDIR)/$(LIBNAME)
- 	$(LNS) $(LIBNAME) $(DESTDIR)$(LIBDIR)/$(SONAME)
- 	mkdir -p $(DESTDIR)$(USRLIBDIR)
- 	$(LNS) $(LIBDIR)/$(SONAME) $(DESTDIR)$(USRLIBDIR)/$(DEVELLIB)
-+endif
- 	$(INSTALL) -D keyctl $(DESTDIR)$(BINDIR)/keyctl
- 	$(INSTALL) -D request-key $(DESTDIR)$(SBINDIR)/request-key
- 	$(INSTALL) -D request-key-debug.sh $(DESTDIR)$(SHAREDIR)/request-key-debug.sh
diff --git a/packages/sysutils/libevdev/package.mk b/packages/sysutils/libevdev/package.mk
index 525af578fb0..3fce2ff3fca 100644
--- a/packages/sysutils/libevdev/package.mk
+++ b/packages/sysutils/libevdev/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libevdev"
-PKG_VERSION="1.5.6"
+PKG_VERSION="1.5.7"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
@@ -32,7 +32,7 @@ PKG_LONGDESC="libevdev is a wrapper library for evdev devices. it moves the comm
 PKG_IS_ADDON="no"
 PKG_AUTORECONF="yes"
 
-PKG_CONFIGURE_OPTS_TARGET="--enable-shared --disable-static"
+PKG_CONFIGURE_OPTS_TARGET="ac_cv_path_DOXYGEN=true --enable-shared --disable-static"
 
 pre_configure_target() {
   export CFLAGS="$CFLAGS -fPIC -DPIC"
diff --git a/packages/sysutils/libusb/package.mk b/packages/sysutils/libusb/package.mk
index b610013097e..e8a5032f234 100644
--- a/packages/sysutils/libusb/package.mk
+++ b/packages/sysutils/libusb/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libusb"
-PKG_VERSION="1.0.20"
+PKG_VERSION="1.0.21"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="LGPLv2.1"
diff --git a/packages/sysutils/pciutils/package.mk b/packages/sysutils/pciutils/package.mk
index 7c749faf5e1..b747718ded1 100644
--- a/packages/sysutils/pciutils/package.mk
+++ b/packages/sysutils/pciutils/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pciutils"
-PKG_VERSION="3.5.2"
+PKG_VERSION="3.5.4"
 PKG_REV="1"
 PKG_ARCH="x86_64"
 PKG_LICENSE="GPL"
diff --git a/packages/sysutils/systemd/package.mk b/packages/sysutils/systemd/package.mk
index de324bd069f..df1ac8814df 100644
--- a/packages/sysutils/systemd/package.mk
+++ b/packages/sysutils/systemd/package.mk
@@ -43,7 +43,7 @@ PKG_CONFIGURE_OPTS_TARGET="ac_cv_func_malloc_0_nonnull=yes \
                            --disable-coverage \
                            --disable-kmod \
                            --disable-xkbcommon \
-                           --disable-blkid \
+                           --enable-blkid \
                            --disable-seccomp \
                            --disable-ima \
                            --disable-selinux \
@@ -170,6 +170,9 @@ post_makeinstall_target() {
   rm -rf $INSTALL/usr/lib/systemd/system/systemd-udev-hwdb-update.service
   rm -rf $INSTALL/usr/lib/systemd/system/*.target.wants/systemd-udev-hwdb-update.service
 
+  # remove fuse mount unit, we ship our own
+  rm -rf $INSTALL/usr/lib/systemd/system/sys-fs-fuse-connections.mount
+
   # remove nspawn
   rm -rf $INSTALL/usr/bin/systemd-nspawn
   rm -rf $INSTALL/usr/lib/systemd/system/systemd-nspawn@.service
@@ -216,7 +219,6 @@ post_makeinstall_target() {
   cp -PR $PKG_DIR/config/* $INSTALL/usr/config
 
   rm -rf $INSTALL/etc/modules-load.d
-  ln -sf /storage/.config/modules-load.d $INSTALL/etc/modules-load.d
   rm -rf $INSTALL/etc/sysctl.d
   ln -sf /storage/.config/sysctl.d $INSTALL/etc/sysctl.d
   rm -rf $INSTALL/etc/tmpfiles.d
diff --git a/packages/sysutils/systemd/patches/systemd-01-systemsleepdir.patch b/packages/sysutils/systemd/patches/systemd-01-systemsleepdir.patch
new file mode 100644
index 00000000000..f3fd4c56eed
--- /dev/null
+++ b/packages/sysutils/systemd/patches/systemd-01-systemsleepdir.patch
@@ -0,0 +1,12 @@
+diff -Naur systemd-233/Makefile.am systemd-233.patch/Makefile.am
+--- systemd-233/Makefile.am	2017-03-01 22:43:06.000000000 +0100
++++ systemd-233.patch/Makefile.am	2017-05-21 00:14:46.995223339 +0200
+@@ -84,7 +84,7 @@
+ systemenvgeneratordir=$(prefix)/lib/systemd/system-environment-generators
+ userenvgeneratordir=$(prefix)/lib/systemd/user-environment-generators
+ systemshutdowndir=$(rootlibexecdir)/system-shutdown
+-systemsleepdir=$(rootlibexecdir)/system-sleep
++systemsleepdir=$(rootprefix)/lib/systemd/system-sleep
+ systemunitdir=$(rootprefix)/lib/systemd/system
+ systempresetdir=$(rootprefix)/lib/systemd/system-preset
+ udevlibexecdir=$(rootprefix)/lib/udev
diff --git a/packages/sysutils/v4l-utils/package.mk b/packages/sysutils/v4l-utils/package.mk
index c3cd1c23300..6f940b435d8 100644
--- a/packages/sysutils/v4l-utils/package.mk
+++ b/packages/sysutils/v4l-utils/package.mk
@@ -19,7 +19,7 @@
 # with 1.0.0 repeat delay is broken. test on upgrade
 
 PKG_NAME="v4l-utils"
-PKG_VERSION="1.12.2"
+PKG_VERSION="1.12.3"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/tools/hdparm/package.mk b/packages/tools/hdparm/package.mk
index f9a3bea1a6b..c97e3901221 100644
--- a/packages/tools/hdparm/package.mk
+++ b/packages/tools/hdparm/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="hdparm"
-PKG_VERSION="9.51"
+PKG_VERSION="9.52"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="BSD"
diff --git a/packages/tools/installer/scripts/installer b/packages/tools/installer/scripts/installer
index 604c5b0e88d..cb00bffb543 100755
--- a/packages/tools/installer/scripts/installer
+++ b/packages/tools/installer/scripts/installer
@@ -672,10 +672,22 @@ mkdir -p $TMPDIR
 #create log file
 touch "$LOGFILE"
 
-# main
+# quit splash
+if [ -e /dev/psplash_fifo ]; then
+  echo "QUIT" > /dev/psplash_fifo
+fi
+
+# reset terminal
+sleep 2
+reset
+
+# switch to console 2
+sleep 1
+exec < /dev/tty2 > /dev/tty2
+chvt 2
 
+# main
 while true; do
-  clear
   menu_main
 done
 
diff --git a/packages/tools/installer/system.d/installer.service b/packages/tools/installer/system.d/installer.service
index a573b05a28b..aac9e38a3f0 100644
--- a/packages/tools/installer/system.d/installer.service
+++ b/packages/tools/installer/system.d/installer.service
@@ -1,5 +1,5 @@
 [Unit]
-Description=XBMC Media Center
+Description=OpenELEC Installer
 Requires=installer.target
 
 [Service]
diff --git a/packages/tools/nano/package.mk b/packages/tools/nano/package.mk
index c110a6244d0..6ff73188372 100644
--- a/packages/tools/nano/package.mk
+++ b/packages/tools/nano/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="nano"
-PKG_VERSION="2.8.0"
+PKG_VERSION="2.8.4"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/tools/psplash/patches/psplash-change-font.patch b/packages/tools/psplash/patches/psplash-change-font.patch.bk
similarity index 100%
rename from packages/tools/psplash/patches/psplash-change-font.patch
rename to packages/tools/psplash/patches/psplash-change-font.patch.bk
diff --git a/packages/virtual/mediacenter/package.mk b/packages/virtual/mediacenter/package.mk
index 889517b4d64..224e81b9f4c 100644
--- a/packages/virtual/mediacenter/package.mk
+++ b/packages/virtual/mediacenter/package.mk
@@ -39,6 +39,8 @@ if [ "$MEDIACENTER" = "kodi" ]; then
   PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET pycrypto"
   PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET xmlstarlet"
   PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET peripheral.joystick"
+  PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET inputstream.rtmp"
+  PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET inputstream.adaptive"
 
 # other packages
   PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET OpenELEC-settings"
diff --git a/packages/wayland/libinput/package.mk b/packages/wayland/libinput/package.mk
index 75adcd79007..8784725ce00 100644
--- a/packages/wayland/libinput/package.mk
+++ b/packages/wayland/libinput/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libinput"
-PKG_VERSION="1.7.0"
+PKG_VERSION="1.7.1"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/wayland/mtdev/package.mk b/packages/wayland/mtdev/package.mk
index cf1a3d55ecf..f678850b65d 100644
--- a/packages/wayland/mtdev/package.mk
+++ b/packages/wayland/mtdev/package.mk
@@ -32,4 +32,8 @@ PKG_LONGDESC="The mtdev is a stand-alone library which transforms all variants o
 PKG_IS_ADDON="no"
 PKG_AUTORECONF="yes"
 
-PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared"
\ No newline at end of file
+PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared"
+
+pre_configure_target() {
+  CFLAGS="$CFLAGS -fPIC"
+}
diff --git a/packages/web/curl/package.mk b/packages/web/curl/package.mk
index 9e2edf54c1f..d34f825d412 100644
--- a/packages/web/curl/package.mk
+++ b/packages/web/curl/package.mk
@@ -25,7 +25,7 @@
 #   there: http://forum.xbmc.org/showthread.php?tid=177557
 
 PKG_NAME="curl"
-PKG_VERSION="7.53.1"
+PKG_VERSION="7.54.0"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="MIT"
diff --git a/packages/x11/app/xkbcomp/package.mk b/packages/x11/app/xkbcomp/package.mk
index c558ffcb0d3..4a9230858ca 100644
--- a/packages/x11/app/xkbcomp/package.mk
+++ b/packages/x11/app/xkbcomp/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="xkbcomp"
-PKG_VERSION="1.3.1"
+PKG_VERSION="1.4.0"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="OSS"
diff --git a/packages/x11/data/xkeyboard-config/package.mk b/packages/x11/data/xkeyboard-config/package.mk
index 3027d605e5f..01a5be83ce9 100644
--- a/packages/x11/data/xkeyboard-config/package.mk
+++ b/packages/x11/data/xkeyboard-config/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="xkeyboard-config"
-PKG_VERSION="2.19"
+PKG_VERSION="2.20"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="OSS"
diff --git a/packages/x11/driver/xf86-input-libinput/package.mk b/packages/x11/driver/xf86-input-libinput/package.mk
index 073f6075dd1..a277bdeb10b 100644
--- a/packages/x11/driver/xf86-input-libinput/package.mk
+++ b/packages/x11/driver/xf86-input-libinput/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="xf86-input-libinput"
-PKG_VERSION="0.25.0"
+PKG_VERSION="0.25.1"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
@@ -36,5 +36,5 @@ PKG_CONFIGURE_OPTS_TARGET="--with-xorg-module-dir=/usr/lib/xorg/modules"
 
 post_makeinstall_target() {
   mkdir -p $INSTALL/usr/share/X11/xorg.conf.d
-    cp $ROOT/$PKG_BUILD/conf/60-libinput.conf $INSTALL/usr/share/X11/xorg.conf.d
+    cp $ROOT/$PKG_BUILD/conf/*.conf $INSTALL/usr/share/X11/xorg.conf.d
 }
diff --git a/packages/x11/lib/libpciaccess/package.mk b/packages/x11/lib/libpciaccess/package.mk
index f2c6e629164..052ae44d83b 100644
--- a/packages/x11/lib/libpciaccess/package.mk
+++ b/packages/x11/lib/libpciaccess/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="libpciaccess"
-PKG_VERSION="0.13.4"
+PKG_VERSION="0.13.5"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="OSS"
diff --git a/packages/x11/lib/libxcb/package.mk b/packages/x11/lib/libxcb/package.mk
index 91e7f566e08..d1399efce13 100644
--- a/packages/x11/lib/libxcb/package.mk
+++ b/packages/x11/lib/libxcb/package.mk
@@ -36,6 +36,7 @@ PKG_CONFIGURE_OPTS_TARGET="--enable-static --disable-shared \
                            --disable-screensaver \
                            --disable-xprint \
                            --disable-selinux \
+                           --disable-devel-docs \
                            --disable-xvmc"
 
 pre_configure_target() {
diff --git a/packages/x11/other/fontconfig/package.mk b/packages/x11/other/fontconfig/package.mk
index 68eaf459628..6311ccb4d32 100644
--- a/packages/x11/other/fontconfig/package.mk
+++ b/packages/x11/other/fontconfig/package.mk
@@ -45,7 +45,9 @@ pre_configure_target() {
   CFLAGS=`echo $CFLAGS | sed -e "s|-O3|-O2|"`
   CXXFLAGS=`echo $CXXFLAGS | sed -e "s|-O3|-O2|"`
   CFLAGS="$CFLAGS -I$ROOT/$PKG_BUILD"
+  CFLAGS="$CFLAGS -fPIC"
   CXXFLAGS="$CXXFLAGS -I$ROOT/$PKG_BUILD"
+  CXXFLAGS="$CXXFLAGS -fPIC"
   LDFLAGS="$LDFLAGS -lz"
 }
 
diff --git a/packages/x11/xserver/xorg-server/package.mk b/packages/x11/xserver/xorg-server/package.mk
index db34c205e7d..5e5485d16b2 100644
--- a/packages/x11/xserver/xorg-server/package.mk
+++ b/packages/x11/xserver/xorg-server/package.mk
@@ -125,6 +125,7 @@ PKG_CONFIGURE_OPTS_TARGET="--disable-debug \
                            --with-fontrootdir=/usr/share/fonts \
                            --with-default-font-path=/usr/share/fonts/misc,built-ins \
                            --with-serverconfig-path=/usr/lib/xserver \
+                           --without-doxygen \
                            --without-xmlto \
                            --without-fop"
 
diff --git a/projects/Generic/linux/linux.x86_64.conf b/projects/Generic/linux/linux.x86_64.conf
index 0e026e33320..5e4b898711a 100644
--- a/projects/Generic/linux/linux.x86_64.conf
+++ b/projects/Generic/linux/linux.x86_64.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/x86 4.9.16 Kernel Configuration
+# Linux/x86 4.9.29 Kernel Configuration
 #
 CONFIG_64BIT=y
 CONFIG_X86_64=y
@@ -624,9 +624,9 @@ CONFIG_CPU_FREQ_GOV_COMMON=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=y
 # CONFIG_CPU_FREQ_GOV_USERSPACE is not set
@@ -745,7 +745,7 @@ CONFIG_STANDALONE=y
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 CONFIG_FW_LOADER=y
 CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE="amdgpu/polaris11_pfp.bin amdgpu/topaz_ce.bin amdgpu/polaris10_me.bin amdgpu/polaris11_me.bin amdgpu/topaz_me.bin amdgpu/topaz_mc.bin amdgpu/polaris10_rlc.bin amdgpu/tonga_k_smc.bin amdgpu/polaris12_ce.bin amdgpu/tonga_sdma.bin amdgpu/polaris11_mec.bin amdgpu/polaris12_sdma.bin amdgpu/fiji_me.bin amdgpu/polaris10_sdma1.bin amdgpu/fiji_sdma1.bin amdgpu/carrizo_sdma.bin amdgpu/stoney_me.bin amdgpu/polaris12_smc.bin amdgpu/fiji_smc.bin amdgpu/tonga_sdma1.bin amdgpu/polaris12_sdma1.bin amdgpu/carrizo_uvd.bin amdgpu/polaris11_smc.bin amdgpu/polaris12_vce.bin amdgpu/polaris11_mec2.bin amdgpu/carrizo_sdma1.bin amdgpu/topaz_mec.bin amdgpu/carrizo_mec.bin amdgpu/polaris12_mc.bin amdgpu/polaris10_ce.bin amdgpu/tonga_uvd.bin amdgpu/fiji_rlc.bin amdgpu/fiji_mec.bin amdgpu/polaris12_pfp.bin amdgpu/polaris11_sdma.bin amdgpu/polaris11_rlc.bin amdgpu/polaris10_mec2.bin amdgpu/polaris10_mec.bin amdgpu/fiji_pfp.bin amdgpu/polaris12_me.bin amdgpu/polaris11_ce.bin amdgpu/topaz_rlc.bin amdgpu/tonga_mec2.bin amdgpu/tonga_me.bin amdgpu/stoney_uvd.bin amdgpu/tonga_ce.bin amdgpu/polaris12_mec2.bin amdgpu/stoney_sdma.bin amdgpu/fiji_vce.bin amdgpu/polaris11_mc.bin amdgpu/stoney_rlc.bin amdgpu/polaris10_smc_sk.bin amdgpu/polaris11_sdma1.bin amdgpu/fiji_mc.bin amdgpu/fiji_sdma.bin amdgpu/topaz_smc.bin amdgpu/tonga_mc.bin amdgpu/tonga_pfp.bin amdgpu/stoney_pfp.bin amdgpu/polaris11_vce.bin amdgpu/fiji_uvd.bin amdgpu/polaris11_uvd.bin amdgpu/polaris10_vce.bin amdgpu/polaris10_sdma.bin amdgpu/topaz_sdma1.bin amdgpu/topaz_pfp.bin amdgpu/polaris10_pfp.bin amdgpu/polaris10_mc.bin amdgpu/carrizo_mec2.bin amdgpu/tonga_vce.bin amdgpu/tonga_rlc.bin amdgpu/carrizo_rlc.bin amdgpu/polaris12_mec.bin amdgpu/tonga_smc.bin amdgpu/topaz_mec2.bin amdgpu/stoney_vce.bin amdgpu/polaris12_uvd.bin amdgpu/topaz_k_smc.bin amdgpu/carrizo_ce.bin amdgpu/polaris12_rlc.bin amdgpu/polaris11_k_smc.bin amdgpu/tonga_mec.bin amdgpu/polaris10_k_smc.bin amdgpu/polaris11_smc_sk.bin amdgpu/carrizo_vce.bin 
amdgpu/carrizo_me.bin amdgpu/carrizo_pfp.bin amdgpu/stoney_mec.bin amdgpu/polaris10_uvd.bin amdgpu/fiji_mec2.bin amdgpu/fiji_ce.bin amdgpu/polaris10_smc.bin amdgpu/stoney_ce.bin amdgpu/topaz_sdma.bin nvidia/gm204/gr/gpccs_sig.bin nvidia/gm204/gr/fecs_data.bin nvidia/gm204/gr/fecs_sig.bin nvidia/gm204/gr/gpccs_data.bin nvidia/gp100/gr/gpccs_sig.bin nvidia/gp100/gr/sw_method_init.bin nvidia/gp100/gr/fecs_data.bin nvidia/gp100/gr/fecs_inst.bin nvidia/gp100/gr/fecs_sig.bin nvidia/gp100/gr/gpccs_data.bin nvidia/gp100/gr/sw_ctx.bin nvidia/gp100/gr/sw_nonctx.bin nvidia/gp100/gr/gpccs_inst.bin nvidia/gp100/gr/sw_bundle_init.bin nvidia/gp100/acr/bl.bin nvidia/gp100/acr/ucode_unload.bin nvidia/gp100/acr/ucode_load.bin nvidia/gm200/gr/gpccs_bl.bin nvidia/gm200/gr/gpccs_sig.bin nvidia/gm200/gr/sw_method_init.bin nvidia/gm200/gr/fecs_data.bin nvidia/gm200/gr/fecs_inst.bin nvidia/gm200/gr/fecs_sig.bin nvidia/gm200/gr/gpccs_data.bin nvidia/gm200/gr/sw_ctx.bin nvidia/gm200/gr/sw_nonctx.bin nvidia/gm200/gr/gpccs_inst.bin nvidia/gm200/gr/fecs_bl.bin nvidia/gm200/gr/sw_bundle_init.bin nvidia/gm200/acr/bl.bin nvidia/gm200/acr/ucode_unload.bin nvidia/gm200/acr/ucode_load.bin nvidia/gm20b/gr/fecs_data.bin nvidia/gm20b/gr/fecs_inst.bin nvidia/gm20b/gr/fecs_sig.bin nvidia/gm20b/gr/gpccs_data.bin nvidia/gm20b/gr/sw_ctx.bin nvidia/gm20b/gr/sw_nonctx.bin nvidia/gm20b/gr/gpccs_inst.bin nvidia/gm20b/gr/fecs_bl.bin nvidia/gm20b/gr/sw_bundle_init.bin nvidia/gm20b/acr/bl.bin nvidia/gm20b/acr/ucode_load.bin nvidia/gk20a/sw_method_init.bin nvidia/gk20a/fecs_data.bin nvidia/gk20a/fecs_inst.bin nvidia/gk20a/gpccs_data.bin nvidia/gk20a/sw_ctx.bin nvidia/gk20a/sw_nonctx.bin nvidia/gk20a/gpccs_inst.bin nvidia/gk20a/sw_bundle_init.bin nvidia/gm206/gr/gpccs_sig.bin nvidia/gm206/gr/fecs_data.bin nvidia/gm206/gr/fecs_sig.bin nvidia/gm206/gr/gpccs_data.bin nvidia/gm206/acr/ucode_unload.bin nvidia/gm206/acr/ucode_load.bin i915/skl_dmc_ver1_23.bin i915/skl_guc_ver4.bin i915/skl_huc_ver01_07_1398.bin 
i915/bxt_dmc_ver1_07.bin i915/skl_guc_ver6_1.bin i915/skl_dmc_ver1_26.bin i915/bxt_guc_ver8_7.bin i915/kbl_dmc_ver1_01.bin i915/kbl_guc_ver9_14.bin i915/bxt_huc_ver01_07_1398.bin i915/kbl_huc_ver02_00_1810.bin i915/skl_guc_ver1.bin radeon/R200_cp.bin radeon/KABINI_mec.bin radeon/VERDE_pfp.bin radeon/REDWOOD_me.bin radeon/hainan_me.bin radeon/SUMO_uvd.bin radeon/CAICOS_mc.bin radeon/TAHITI_mc2.bin radeon/kabini_rlc.bin radeon/KAVERI_pfp.bin radeon/CAICOS_me.bin radeon/hainan_pfp.bin radeon/hainan_k_smc.bin radeon/hawaii_vce.bin radeon/HAINAN_mc.bin radeon/RV730_pfp.bin radeon/KAVERI_rlc.bin radeon/PALM_me.bin radeon/verde_ce.bin radeon/RV740_smc.bin radeon/KAVERI_me.bin radeon/CAYMAN_pfp.bin radeon/REDWOOD_smc.bin radeon/pitcairn_mc.bin radeon/PALM_pfp.bin radeon/R100_cp.bin radeon/VERDE_mc.bin radeon/JUNIPER_smc.bin radeon/SUMO2_me.bin radeon/RS600_cp.bin radeon/kabini_uvd.bin radeon/BARTS_pfp.bin radeon/KABINI_rlc.bin radeon/VERDE_me.bin radeon/R420_cp.bin radeon/R600_uvd.bin radeon/pitcairn_k_smc.bin radeon/verde_rlc.bin radeon/mullins_me.bin radeon/PITCAIRN_me.bin radeon/hawaii_me.bin radeon/tahiti_me.bin radeon/CAYMAN_rlc.bin radeon/RV610_me.bin radeon/hawaii_smc.bin radeon/JUNIPER_pfp.bin radeon/TAHITI_rlc.bin radeon/kabini_pfp.bin radeon/SUMO_pfp.bin radeon/oland_rlc.bin radeon/oland_k_smc.bin radeon/BONAIRE_smc.bin radeon/TURKS_smc.bin radeon/TAHITI_me.bin radeon/kaveri_vce.bin radeon/hainan_mc.bin radeon/BONAIRE_rlc.bin radeon/VERDE_smc.bin radeon/OLAND_pfp.bin radeon/RV630_pfp.bin radeon/pitcairn_me.bin radeon/kabini_ce.bin radeon/oland_ce.bin radeon/kaveri_ce.bin radeon/BARTS_me.bin radeon/mullins_sdma.bin radeon/bonaire_uvd.bin radeon/verde_mc.bin radeon/KABINI_me.bin radeon/KABINI_sdma.bin radeon/RV770_me.bin radeon/HAWAII_ce.bin radeon/mullins_vce.bin radeon/tahiti_pfp.bin radeon/MULLINS_pfp.bin radeon/pitcairn_ce.bin radeon/hawaii_rlc.bin radeon/TAHITI_mc.bin radeon/pitcairn_rlc.bin radeon/bonaire_me.bin radeon/R600_rlc.bin radeon/kaveri_sdma.bin 
radeon/kaveri_mec.bin radeon/TAHITI_uvd.bin radeon/tahiti_k_smc.bin radeon/tahiti_ce.bin radeon/HAINAN_mc2.bin radeon/hawaii_sdma1.bin radeon/bonaire_smc.bin radeon/REDWOOD_rlc.bin radeon/TAHITI_ce.bin radeon/RV610_pfp.bin radeon/hawaii_sdma.bin radeon/RV620_me.bin radeon/KABINI_pfp.bin radeon/CEDAR_pfp.bin radeon/RV770_pfp.bin radeon/hawaii_k_smc.bin radeon/MULLINS_rlc.bin radeon/hainan_rlc.bin radeon/CAICOS_pfp.bin radeon/VERDE_ce.bin radeon/kaveri_uvd.bin radeon/OLAND_mc2.bin radeon/mullins_uvd.bin radeon/PITCAIRN_mc.bin radeon/CYPRESS_uvd.bin radeon/RV670_pfp.bin radeon/PITCAIRN_pfp.bin radeon/RV630_me.bin radeon/CAYMAN_me.bin radeon/bonaire_mec.bin radeon/bonaire_sdma.bin radeon/bonaire_ce.bin radeon/kaveri_rlc.bin radeon/RS780_pfp.bin radeon/bonaire_rlc.bin radeon/BONAIRE_mec.bin radeon/HAINAN_smc.bin radeon/kabini_sdma.bin radeon/BONAIRE_sdma.bin radeon/RV635_me.bin radeon/SUMO_me.bin radeon/SUMO_rlc.bin radeon/CAYMAN_mc.bin radeon/CAYMAN_smc.bin radeon/oland_pfp.bin radeon/HAWAII_sdma.bin radeon/BONAIRE_vce.bin radeon/hawaii_uvd.bin radeon/mullins_rlc.bin radeon/bonaire_k_smc.bin radeon/oland_me.bin radeon/verde_me.bin radeon/kaveri_me.bin radeon/RV730_me.bin radeon/HAWAII_smc.bin radeon/ARUBA_me.bin radeon/CEDAR_smc.bin radeon/CAICOS_smc.bin radeon/BARTS_smc.bin radeon/JUNIPER_me.bin radeon/R300_cp.bin radeon/BONAIRE_pfp.bin radeon/oland_smc.bin radeon/KAVERI_mec.bin radeon/RS780_uvd.bin radeon/R700_rlc.bin radeon/bonaire_vce.bin radeon/PITCAIRN_mc2.bin radeon/kabini_vce.bin radeon/kabini_sdma1.bin radeon/MULLINS_sdma.bin radeon/RV710_pfp.bin radeon/BARTS_mc.bin radeon/kabini_me.bin radeon/mullins_mec.bin radeon/R520_cp.bin radeon/CYPRESS_pfp.bin radeon/pitcairn_pfp.bin radeon/si58_mc.bin radeon/RV730_smc.bin radeon/PITCAIRN_rlc.bin radeon/KAVERI_sdma.bin radeon/CYPRESS_smc.bin radeon/TAHITI_smc.bin radeon/hainan_ce.bin radeon/BONAIRE_ce.bin radeon/CYPRESS_me.bin radeon/mullins_pfp.bin radeon/OLAND_ce.bin radeon/PITCAIRN_ce.bin radeon/RV670_me.bin 
radeon/TURKS_pfp.bin radeon/HAINAN_pfp.bin radeon/VERDE_rlc.bin radeon/kaveri_mec2.bin radeon/banks_k_2_smc.bin radeon/mullins_ce.bin radeon/CEDAR_rlc.bin radeon/RV620_pfp.bin radeon/BONAIRE_mc2.bin radeon/RV770_uvd.bin radeon/tahiti_mc.bin radeon/TAHITI_vce.bin radeon/bonaire_pfp.bin radeon/HAWAII_mec.bin radeon/TURKS_me.bin radeon/ARUBA_pfp.bin radeon/BONAIRE_me.bin radeon/BONAIRE_mc.bin radeon/RV770_smc.bin radeon/RV635_pfp.bin radeon/PITCAIRN_smc.bin radeon/KAVERI_ce.bin radeon/REDWOOD_pfp.bin radeon/ARUBA_rlc.bin radeon/RS780_me.bin radeon/kaveri_pfp.bin radeon/KABINI_ce.bin radeon/verde_smc.bin radeon/JUNIPER_rlc.bin radeon/HAWAII_mc2.bin radeon/BTC_rlc.bin radeon/VERDE_mc2.bin radeon/CYPRESS_rlc.bin radeon/RV710_uvd.bin radeon/kaveri_sdma1.bin radeon/R600_pfp.bin radeon/bonaire_sdma1.bin radeon/HAINAN_me.bin radeon/HAWAII_pfp.bin radeon/TURKS_mc.bin radeon/MULLINS_ce.bin radeon/HAINAN_rlc.bin radeon/RV710_me.bin radeon/verde_pfp.bin radeon/oland_mc.bin radeon/BONAIRE_uvd.bin radeon/RV710_smc.bin radeon/hawaii_pfp.bin radeon/CEDAR_me.bin radeon/HAINAN_ce.bin radeon/OLAND_mc.bin radeon/OLAND_smc.bin radeon/OLAND_me.bin radeon/verde_k_smc.bin radeon/tahiti_smc.bin radeon/hawaii_ce.bin radeon/pitcairn_smc.bin radeon/MULLINS_me.bin radeon/MULLINS_mec.bin radeon/hawaii_mec.bin radeon/hainan_smc.bin radeon/kabini_mec.bin radeon/hawaii_mc.bin radeon/tahiti_rlc.bin radeon/R600_me.bin radeon/HAWAII_mc.bin radeon/HAWAII_me.bin radeon/HAWAII_rlc.bin radeon/mullins_sdma1.bin radeon/RS690_cp.bin radeon/SUMO2_pfp.bin radeon/bonaire_mc.bin radeon/OLAND_rlc.bin radeon/TAHITI_pfp.bin"
+CONFIG_EXTRA_FIRMWARE="amdgpu/polaris11_pfp.bin amdgpu/topaz_ce.bin amdgpu/polaris10_me.bin amdgpu/polaris11_me.bin amdgpu/topaz_me.bin amdgpu/topaz_mc.bin amdgpu/polaris10_rlc.bin amdgpu/tonga_k_smc.bin amdgpu/polaris12_ce.bin amdgpu/tonga_sdma.bin amdgpu/polaris11_mec.bin amdgpu/polaris12_sdma.bin amdgpu/fiji_me.bin amdgpu/polaris10_sdma1.bin amdgpu/fiji_sdma1.bin amdgpu/carrizo_sdma.bin amdgpu/stoney_me.bin amdgpu/polaris12_smc.bin amdgpu/fiji_smc.bin amdgpu/tonga_sdma1.bin amdgpu/polaris12_sdma1.bin amdgpu/carrizo_uvd.bin amdgpu/polaris11_smc.bin amdgpu/polaris12_vce.bin amdgpu/polaris11_mec2.bin amdgpu/carrizo_sdma1.bin amdgpu/topaz_mec.bin amdgpu/carrizo_mec.bin amdgpu/polaris12_mc.bin amdgpu/polaris10_ce.bin amdgpu/tonga_uvd.bin amdgpu/fiji_rlc.bin amdgpu/fiji_mec.bin amdgpu/polaris12_pfp.bin amdgpu/polaris11_sdma.bin amdgpu/polaris11_rlc.bin amdgpu/polaris10_mec2.bin amdgpu/polaris10_mec.bin amdgpu/fiji_pfp.bin amdgpu/polaris12_me.bin amdgpu/polaris11_ce.bin amdgpu/topaz_rlc.bin amdgpu/tonga_mec2.bin amdgpu/tonga_me.bin amdgpu/stoney_uvd.bin amdgpu/tonga_ce.bin amdgpu/polaris12_mec2.bin amdgpu/stoney_sdma.bin amdgpu/fiji_vce.bin amdgpu/polaris11_mc.bin amdgpu/stoney_rlc.bin amdgpu/polaris10_smc_sk.bin amdgpu/polaris11_sdma1.bin amdgpu/fiji_mc.bin amdgpu/fiji_sdma.bin amdgpu/topaz_smc.bin amdgpu/tonga_mc.bin amdgpu/tonga_pfp.bin amdgpu/stoney_pfp.bin amdgpu/polaris11_vce.bin amdgpu/fiji_uvd.bin amdgpu/polaris11_uvd.bin amdgpu/polaris10_vce.bin amdgpu/polaris10_sdma.bin amdgpu/topaz_sdma1.bin amdgpu/topaz_pfp.bin amdgpu/polaris10_pfp.bin amdgpu/polaris10_mc.bin amdgpu/carrizo_mec2.bin amdgpu/tonga_vce.bin amdgpu/tonga_rlc.bin amdgpu/carrizo_rlc.bin amdgpu/polaris12_mec.bin amdgpu/tonga_smc.bin amdgpu/topaz_mec2.bin amdgpu/stoney_vce.bin amdgpu/polaris12_uvd.bin amdgpu/topaz_k_smc.bin amdgpu/carrizo_ce.bin amdgpu/polaris12_rlc.bin amdgpu/polaris11_k_smc.bin amdgpu/tonga_mec.bin amdgpu/polaris10_k_smc.bin amdgpu/polaris11_smc_sk.bin amdgpu/carrizo_vce.bin 
amdgpu/carrizo_me.bin amdgpu/carrizo_pfp.bin amdgpu/stoney_mec.bin amdgpu/polaris10_uvd.bin amdgpu/fiji_mec2.bin amdgpu/fiji_ce.bin amdgpu/polaris10_smc.bin amdgpu/stoney_ce.bin amdgpu/topaz_sdma.bin rtl_nic/rtl8168h-1.fw rtl_nic/rtl8402-1.fw rtl_nic/rtl8168f-2.fw rtl_nic/rtl8411-2.fw rtl_nic/rtl8105e-1.fw rtl_nic/rtl8107e-2.fw rtl_nic/rtl8168e-2.fw rtl_nic/rtl8168f-1.fw rtl_nic/rtl8168h-2.fw rtl_nic/rtl8168d-1.fw rtl_nic/rtl8168g-3.fw rtl_nic/rtl8411-1.fw rtl_nic/rtl8168e-1.fw rtl_nic/rtl8107e-1.fw rtl_nic/rtl8168g-2.fw rtl_nic/rtl8168g-1.fw rtl_nic/rtl8168e-3.fw rtl_nic/rtl8106e-1.fw rtl_nic/rtl8106e-2.fw rtl_nic/rtl8168d-2.fw nvidia/gm204/gr/gpccs_sig.bin nvidia/gm204/gr/fecs_data.bin nvidia/gm204/gr/fecs_sig.bin nvidia/gm204/gr/gpccs_data.bin nvidia/gp100/gr/gpccs_sig.bin nvidia/gp100/gr/sw_method_init.bin nvidia/gp100/gr/fecs_data.bin nvidia/gp100/gr/fecs_inst.bin nvidia/gp100/gr/fecs_sig.bin nvidia/gp100/gr/gpccs_data.bin nvidia/gp100/gr/sw_ctx.bin nvidia/gp100/gr/sw_nonctx.bin nvidia/gp100/gr/gpccs_inst.bin nvidia/gp100/gr/sw_bundle_init.bin nvidia/gp100/acr/bl.bin nvidia/gp100/acr/ucode_unload.bin nvidia/gp100/acr/ucode_load.bin nvidia/gm200/gr/gpccs_bl.bin nvidia/gm200/gr/gpccs_sig.bin nvidia/gm200/gr/sw_method_init.bin nvidia/gm200/gr/fecs_data.bin nvidia/gm200/gr/fecs_inst.bin nvidia/gm200/gr/fecs_sig.bin nvidia/gm200/gr/gpccs_data.bin nvidia/gm200/gr/sw_ctx.bin nvidia/gm200/gr/sw_nonctx.bin nvidia/gm200/gr/gpccs_inst.bin nvidia/gm200/gr/fecs_bl.bin nvidia/gm200/gr/sw_bundle_init.bin nvidia/gm200/acr/bl.bin nvidia/gm200/acr/ucode_unload.bin nvidia/gm200/acr/ucode_load.bin nvidia/gm20b/gr/fecs_data.bin nvidia/gm20b/gr/fecs_inst.bin nvidia/gm20b/gr/fecs_sig.bin nvidia/gm20b/gr/gpccs_data.bin nvidia/gm20b/gr/sw_ctx.bin nvidia/gm20b/gr/sw_nonctx.bin nvidia/gm20b/gr/gpccs_inst.bin nvidia/gm20b/gr/fecs_bl.bin nvidia/gm20b/gr/sw_bundle_init.bin nvidia/gm20b/acr/bl.bin nvidia/gm20b/acr/ucode_load.bin nvidia/gk20a/sw_method_init.bin nvidia/gk20a/fecs_data.bin 
nvidia/gk20a/fecs_inst.bin nvidia/gk20a/gpccs_data.bin nvidia/gk20a/sw_ctx.bin nvidia/gk20a/sw_nonctx.bin nvidia/gk20a/gpccs_inst.bin nvidia/gk20a/sw_bundle_init.bin nvidia/gm206/gr/gpccs_sig.bin nvidia/gm206/gr/fecs_data.bin nvidia/gm206/gr/fecs_sig.bin nvidia/gm206/gr/gpccs_data.bin nvidia/gm206/acr/ucode_unload.bin nvidia/gm206/acr/ucode_load.bin i915/skl_dmc_ver1_23.bin i915/skl_guc_ver4.bin i915/skl_huc_ver01_07_1398.bin i915/bxt_dmc_ver1_07.bin i915/skl_guc_ver6_1.bin i915/skl_dmc_ver1_26.bin i915/bxt_guc_ver8_7.bin i915/kbl_dmc_ver1_01.bin i915/kbl_guc_ver9_14.bin i915/bxt_huc_ver01_07_1398.bin i915/kbl_huc_ver02_00_1810.bin i915/skl_guc_ver1.bin radeon/R200_cp.bin radeon/KABINI_mec.bin radeon/VERDE_pfp.bin radeon/REDWOOD_me.bin radeon/hainan_me.bin radeon/SUMO_uvd.bin radeon/CAICOS_mc.bin radeon/TAHITI_mc2.bin radeon/kabini_rlc.bin radeon/KAVERI_pfp.bin radeon/CAICOS_me.bin radeon/hainan_pfp.bin radeon/hainan_k_smc.bin radeon/hawaii_vce.bin radeon/HAINAN_mc.bin radeon/RV730_pfp.bin radeon/KAVERI_rlc.bin radeon/PALM_me.bin radeon/verde_ce.bin radeon/RV740_smc.bin radeon/KAVERI_me.bin radeon/CAYMAN_pfp.bin radeon/REDWOOD_smc.bin radeon/pitcairn_mc.bin radeon/PALM_pfp.bin radeon/R100_cp.bin radeon/VERDE_mc.bin radeon/JUNIPER_smc.bin radeon/SUMO2_me.bin radeon/RS600_cp.bin radeon/kabini_uvd.bin radeon/BARTS_pfp.bin radeon/KABINI_rlc.bin radeon/VERDE_me.bin radeon/R420_cp.bin radeon/R600_uvd.bin radeon/pitcairn_k_smc.bin radeon/verde_rlc.bin radeon/mullins_me.bin radeon/PITCAIRN_me.bin radeon/hawaii_me.bin radeon/tahiti_me.bin radeon/CAYMAN_rlc.bin radeon/RV610_me.bin radeon/hawaii_smc.bin radeon/JUNIPER_pfp.bin radeon/TAHITI_rlc.bin radeon/kabini_pfp.bin radeon/SUMO_pfp.bin radeon/oland_rlc.bin radeon/oland_k_smc.bin radeon/BONAIRE_smc.bin radeon/TURKS_smc.bin radeon/TAHITI_me.bin radeon/kaveri_vce.bin radeon/hainan_mc.bin radeon/BONAIRE_rlc.bin radeon/VERDE_smc.bin radeon/OLAND_pfp.bin radeon/RV630_pfp.bin radeon/pitcairn_me.bin radeon/kabini_ce.bin 
radeon/oland_ce.bin radeon/kaveri_ce.bin radeon/BARTS_me.bin radeon/mullins_sdma.bin radeon/bonaire_uvd.bin radeon/verde_mc.bin radeon/KABINI_me.bin radeon/KABINI_sdma.bin radeon/RV770_me.bin radeon/HAWAII_ce.bin radeon/mullins_vce.bin radeon/tahiti_pfp.bin radeon/MULLINS_pfp.bin radeon/pitcairn_ce.bin radeon/hawaii_rlc.bin radeon/TAHITI_mc.bin radeon/pitcairn_rlc.bin radeon/bonaire_me.bin radeon/R600_rlc.bin radeon/kaveri_sdma.bin radeon/kaveri_mec.bin radeon/TAHITI_uvd.bin radeon/tahiti_k_smc.bin radeon/tahiti_ce.bin radeon/HAINAN_mc2.bin radeon/hawaii_sdma1.bin radeon/bonaire_smc.bin radeon/REDWOOD_rlc.bin radeon/TAHITI_ce.bin radeon/RV610_pfp.bin radeon/hawaii_sdma.bin radeon/RV620_me.bin radeon/KABINI_pfp.bin radeon/CEDAR_pfp.bin radeon/RV770_pfp.bin radeon/hawaii_k_smc.bin radeon/MULLINS_rlc.bin radeon/hainan_rlc.bin radeon/CAICOS_pfp.bin radeon/VERDE_ce.bin radeon/kaveri_uvd.bin radeon/OLAND_mc2.bin radeon/mullins_uvd.bin radeon/PITCAIRN_mc.bin radeon/CYPRESS_uvd.bin radeon/RV670_pfp.bin radeon/PITCAIRN_pfp.bin radeon/RV630_me.bin radeon/CAYMAN_me.bin radeon/bonaire_mec.bin radeon/bonaire_sdma.bin radeon/bonaire_ce.bin radeon/kaveri_rlc.bin radeon/RS780_pfp.bin radeon/bonaire_rlc.bin radeon/BONAIRE_mec.bin radeon/HAINAN_smc.bin radeon/kabini_sdma.bin radeon/BONAIRE_sdma.bin radeon/RV635_me.bin radeon/SUMO_me.bin radeon/SUMO_rlc.bin radeon/CAYMAN_mc.bin radeon/CAYMAN_smc.bin radeon/oland_pfp.bin radeon/HAWAII_sdma.bin radeon/BONAIRE_vce.bin radeon/hawaii_uvd.bin radeon/mullins_rlc.bin radeon/bonaire_k_smc.bin radeon/oland_me.bin radeon/verde_me.bin radeon/kaveri_me.bin radeon/RV730_me.bin radeon/HAWAII_smc.bin radeon/ARUBA_me.bin radeon/CEDAR_smc.bin radeon/CAICOS_smc.bin radeon/BARTS_smc.bin radeon/JUNIPER_me.bin radeon/R300_cp.bin radeon/BONAIRE_pfp.bin radeon/oland_smc.bin radeon/KAVERI_mec.bin radeon/RS780_uvd.bin radeon/R700_rlc.bin radeon/bonaire_vce.bin radeon/PITCAIRN_mc2.bin radeon/kabini_vce.bin radeon/kabini_sdma1.bin radeon/MULLINS_sdma.bin 
radeon/RV710_pfp.bin radeon/BARTS_mc.bin radeon/kabini_me.bin radeon/mullins_mec.bin radeon/R520_cp.bin radeon/CYPRESS_pfp.bin radeon/pitcairn_pfp.bin radeon/si58_mc.bin radeon/RV730_smc.bin radeon/PITCAIRN_rlc.bin radeon/KAVERI_sdma.bin radeon/CYPRESS_smc.bin radeon/TAHITI_smc.bin radeon/hainan_ce.bin radeon/BONAIRE_ce.bin radeon/CYPRESS_me.bin radeon/mullins_pfp.bin radeon/OLAND_ce.bin radeon/PITCAIRN_ce.bin radeon/RV670_me.bin radeon/TURKS_pfp.bin radeon/HAINAN_pfp.bin radeon/VERDE_rlc.bin radeon/kaveri_mec2.bin radeon/banks_k_2_smc.bin radeon/mullins_ce.bin radeon/CEDAR_rlc.bin radeon/RV620_pfp.bin radeon/BONAIRE_mc2.bin radeon/RV770_uvd.bin radeon/tahiti_mc.bin radeon/TAHITI_vce.bin radeon/bonaire_pfp.bin radeon/HAWAII_mec.bin radeon/TURKS_me.bin radeon/ARUBA_pfp.bin radeon/BONAIRE_me.bin radeon/BONAIRE_mc.bin radeon/RV770_smc.bin radeon/RV635_pfp.bin radeon/PITCAIRN_smc.bin radeon/KAVERI_ce.bin radeon/REDWOOD_pfp.bin radeon/ARUBA_rlc.bin radeon/RS780_me.bin radeon/kaveri_pfp.bin radeon/KABINI_ce.bin radeon/verde_smc.bin radeon/JUNIPER_rlc.bin radeon/HAWAII_mc2.bin radeon/BTC_rlc.bin radeon/VERDE_mc2.bin radeon/CYPRESS_rlc.bin radeon/RV710_uvd.bin radeon/kaveri_sdma1.bin radeon/R600_pfp.bin radeon/bonaire_sdma1.bin radeon/HAINAN_me.bin radeon/HAWAII_pfp.bin radeon/TURKS_mc.bin radeon/MULLINS_ce.bin radeon/HAINAN_rlc.bin radeon/RV710_me.bin radeon/verde_pfp.bin radeon/oland_mc.bin radeon/BONAIRE_uvd.bin radeon/RV710_smc.bin radeon/hawaii_pfp.bin radeon/CEDAR_me.bin radeon/HAINAN_ce.bin radeon/OLAND_mc.bin radeon/OLAND_smc.bin radeon/OLAND_me.bin radeon/verde_k_smc.bin radeon/tahiti_smc.bin radeon/hawaii_ce.bin radeon/pitcairn_smc.bin radeon/MULLINS_me.bin radeon/MULLINS_mec.bin radeon/hawaii_mec.bin radeon/hainan_smc.bin radeon/kabini_mec.bin radeon/hawaii_mc.bin radeon/tahiti_rlc.bin radeon/R600_me.bin radeon/HAWAII_mc.bin radeon/HAWAII_me.bin radeon/HAWAII_rlc.bin radeon/mullins_sdma1.bin radeon/RS690_cp.bin radeon/SUMO2_pfp.bin radeon/bonaire_mc.bin 
radeon/OLAND_rlc.bin radeon/TAHITI_pfp.bin"
 CONFIG_EXTRA_FIRMWARE_DIR="firmware"
 # CONFIG_FW_LOADER_USER_HELPER_FALLBACK is not set
 CONFIG_ALLOW_DEV_COREDUMP=y
@@ -1187,6 +1187,7 @@ CONFIG_POWER_SUPPLY=y
 # CONFIG_CHARGER_MAX8903 is not set
 # CONFIG_CHARGER_LP8727 is not set
 # CONFIG_CHARGER_GPIO is not set
+# CONFIG_CHARGER_MANAGER is not set
 # CONFIG_CHARGER_BQ2415X is not set
 # CONFIG_CHARGER_BQ24190 is not set
 # CONFIG_CHARGER_BQ24257 is not set
@@ -1463,7 +1464,38 @@ CONFIG_MFD_RTSX_PCI=y
 # CONFIG_MFD_WM831X_I2C is not set
 # CONFIG_MFD_WM8350_I2C is not set
 # CONFIG_MFD_WM8994 is not set
-# CONFIG_REGULATOR is not set
+CONFIG_REGULATOR=y
+# CONFIG_REGULATOR_DEBUG is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
+# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
+# CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
+# CONFIG_REGULATOR_ACT8865 is not set
+# CONFIG_REGULATOR_AD5398 is not set
+# CONFIG_REGULATOR_DA9210 is not set
+# CONFIG_REGULATOR_DA9211 is not set
+# CONFIG_REGULATOR_FAN53555 is not set
+# CONFIG_REGULATOR_GPIO is not set
+# CONFIG_REGULATOR_ISL9305 is not set
+# CONFIG_REGULATOR_ISL6271A is not set
+# CONFIG_REGULATOR_LP3971 is not set
+# CONFIG_REGULATOR_LP3972 is not set
+# CONFIG_REGULATOR_LP872X is not set
+# CONFIG_REGULATOR_LP8755 is not set
+# CONFIG_REGULATOR_LTC3589 is not set
+# CONFIG_REGULATOR_LTC3676 is not set
+# CONFIG_REGULATOR_MAX1586 is not set
+# CONFIG_REGULATOR_MAX8649 is not set
+# CONFIG_REGULATOR_MAX8660 is not set
+# CONFIG_REGULATOR_MAX8952 is not set
+# CONFIG_REGULATOR_MT6311 is not set
+# CONFIG_REGULATOR_PFUZE100 is not set
+# CONFIG_REGULATOR_PV88060 is not set
+# CONFIG_REGULATOR_PV88080 is not set
+# CONFIG_REGULATOR_PV88090 is not set
+# CONFIG_REGULATOR_TPS51632 is not set
+# CONFIG_REGULATOR_TPS62360 is not set
+# CONFIG_REGULATOR_TPS65023 is not set
+# CONFIG_REGULATOR_TPS6507X is not set
 # CONFIG_MEDIA_SUPPORT is not set
 
 #
@@ -1873,7 +1905,17 @@ CONFIG_CLKBLD_I8253=y
 # CONFIG_SUNXI_SRAM is not set
 # CONFIG_SOC_TI is not set
 # CONFIG_PM_DEVFREQ is not set
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_GPIO is not set
+# CONFIG_EXTCON_MAX3355 is not set
+# CONFIG_EXTCON_QCOM_SPMI_MISC is not set
+# CONFIG_EXTCON_RT8973A is not set
+# CONFIG_EXTCON_SM5502 is not set
+# CONFIG_EXTCON_USB_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_NTB is not set
diff --git a/projects/Generic/options b/projects/Generic/options
index ca3a13a598e..5bf185b0e8b 100644
--- a/projects/Generic/options
+++ b/projects/Generic/options
@@ -79,6 +79,11 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="x11"
 
+  # Xorg graphics drivers to use (all / i915,i965,r200,r300,r600,radeonsi,nvidia,nvidia-legacy)
+  # Space separated list is supported,
+  # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
+    GRAPHIC_DRIVERS="r200 r300 r600 radeonsi i915 i965 nvidia nvidia-legacy"
+
   # KODI Player implementation to use (default / bcm2835-firmware / libfslvpuwrap)
     KODIPLAYER_DRIVER="default"
 
diff --git a/projects/RPi/linux/extra-fix-config.config b/projects/RPi/linux/extra-fix-config.config
new file mode 100644
index 00000000000..5600a5b4cd4
--- /dev/null
+++ b/projects/RPi/linux/extra-fix-config.config
@@ -0,0 +1,4 @@
+### fix default config because drivers are broken ###
+
+# see https://github.com/raspberrypi/linux/issues/875
+# CONFIG_USB_UAS is not set
diff --git a/projects/RPi/linux/linux.arm.conf b/projects/RPi/linux/linux.arm.conf
index 6c005f66972..9489a6495f7 100644
--- a/projects/RPi/linux/linux.arm.conf
+++ b/projects/RPi/linux/linux.arm.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm 4.9.13 Kernel Configuration
+# Linux/arm 4.9.28 Kernel Configuration
 #
 CONFIG_ARM=y
 CONFIG_ARM_HAS_SG_CHAIN=y
@@ -870,7 +870,7 @@ CONFIG_I2C_BCM2708_BAUDRATE=100000
 #
 # I2C system bus drivers (mostly embedded / system-on-chip)
 #
-# CONFIG_I2C_BCM2835 is not set
+CONFIG_I2C_BCM2835=m
 # CONFIG_I2C_CBUS_GPIO is not set
 # CONFIG_I2C_DESIGNWARE_PLATFORM is not set
 # CONFIG_I2C_EMEV2 is not set
@@ -1062,6 +1062,7 @@ CONFIG_POWER_SUPPLY=y
 # CONFIG_CHARGER_MAX8903 is not set
 # CONFIG_CHARGER_LP8727 is not set
 # CONFIG_CHARGER_GPIO is not set
+# CONFIG_CHARGER_MANAGER is not set
 # CONFIG_CHARGER_BQ2415X is not set
 # CONFIG_CHARGER_BQ24190 is not set
 # CONFIG_CHARGER_BQ24257 is not set
@@ -1358,7 +1359,40 @@ CONFIG_MFD_WM5102=y
 # CONFIG_MFD_WM831X_SPI is not set
 # CONFIG_MFD_WM8350_I2C is not set
 # CONFIG_MFD_WM8994 is not set
-# CONFIG_REGULATOR is not set
+CONFIG_REGULATOR=y
+# CONFIG_REGULATOR_DEBUG is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
+# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
+# CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
+# CONFIG_REGULATOR_ACT8865 is not set
+# CONFIG_REGULATOR_AD5398 is not set
+# CONFIG_REGULATOR_DA9210 is not set
+# CONFIG_REGULATOR_DA9211 is not set
+# CONFIG_REGULATOR_FAN53555 is not set
+# CONFIG_REGULATOR_GPIO is not set
+# CONFIG_REGULATOR_ISL9305 is not set
+# CONFIG_REGULATOR_ISL6271A is not set
+# CONFIG_REGULATOR_LP3971 is not set
+# CONFIG_REGULATOR_LP3972 is not set
+# CONFIG_REGULATOR_LP872X is not set
+# CONFIG_REGULATOR_LP8755 is not set
+# CONFIG_REGULATOR_LTC3589 is not set
+# CONFIG_REGULATOR_LTC3676 is not set
+# CONFIG_REGULATOR_MAX1586 is not set
+# CONFIG_REGULATOR_MAX8649 is not set
+# CONFIG_REGULATOR_MAX8660 is not set
+# CONFIG_REGULATOR_MAX8952 is not set
+# CONFIG_REGULATOR_MAX8973 is not set
+# CONFIG_REGULATOR_MT6311 is not set
+# CONFIG_REGULATOR_PFUZE100 is not set
+# CONFIG_REGULATOR_PV88060 is not set
+# CONFIG_REGULATOR_PV88080 is not set
+# CONFIG_REGULATOR_PV88090 is not set
+# CONFIG_REGULATOR_TPS51632 is not set
+# CONFIG_REGULATOR_TPS62360 is not set
+# CONFIG_REGULATOR_TPS65023 is not set
+# CONFIG_REGULATOR_TPS6507X is not set
+# CONFIG_REGULATOR_TPS6524X is not set
 # CONFIG_MEDIA_SUPPORT is not set
 
 #
@@ -1500,7 +1534,7 @@ CONFIG_RTC_DRV_DS1307_CENTURY=y
 # CONFIG_RTC_DRV_X1205 is not set
 CONFIG_RTC_DRV_PCF8523=m
 # CONFIG_RTC_DRV_PCF85063 is not set
-# CONFIG_RTC_DRV_PCF8563 is not set
+CONFIG_RTC_DRV_PCF8563=m
 # CONFIG_RTC_DRV_PCF8583 is not set
 # CONFIG_RTC_DRV_M41T80 is not set
 # CONFIG_RTC_DRV_BQ32K is not set
@@ -1702,7 +1736,17 @@ CONFIG_RASPBERRYPI_POWER=y
 # CONFIG_SUNXI_SRAM is not set
 # CONFIG_SOC_TI is not set
 # CONFIG_PM_DEVFREQ is not set
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_GPIO is not set
+# CONFIG_EXTCON_MAX3355 is not set
+# CONFIG_EXTCON_QCOM_SPMI_MISC is not set
+# CONFIG_EXTCON_RT8973A is not set
+# CONFIG_EXTCON_SM5502 is not set
+# CONFIG_EXTCON_USB_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_PWM is not set
diff --git a/projects/RPi/options b/projects/RPi/options
index 7ebef41220c..7cdf413e9d7 100644
--- a/projects/RPi/options
+++ b/projects/RPi/options
@@ -104,6 +104,11 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
+  # Xorg graphics drivers to use (all / i915,i965,r200,r300,r600,radeonsi,nvidia,nvidia-legacy)
+  # Space separated list is supported,
+  # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
+    GRAPHIC_DRIVERS=""
+
   # Windowmanager to use (ratpoison / fluxbox / none)
     WINDOWMANAGER="none"
 
@@ -118,14 +123,18 @@
   # Modules to install in initramfs for early boot
     INITRAMFS_MODULES=""
 
+  # Install extra kernel drivers; see the config files in packages/linux/config/*.config,
+  # projects/*/linux/*.config or distributions/*/linux/*.config.
+  # List every config file to include without its "extra-" prefix and ".config" suffix
+  # (e.g. extra-fix-config.config is listed as fix-config),
+  # except the default-*.config files and those already included in packages/linux/package.mk.
+    KERNEL_EXTRA_CONFIG+=" fix-config"
+
   # additional Firmware to use (dvb-firmware, misc-firmware, wlan-firmware)
   # Space separated list is supported,
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
     FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
-
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
 
diff --git a/projects/RPi2/linux/extra-fix-config.config b/projects/RPi2/linux/extra-fix-config.config
new file mode 100644
index 00000000000..5600a5b4cd4
--- /dev/null
+++ b/projects/RPi2/linux/extra-fix-config.config
@@ -0,0 +1,4 @@
+### fix default config because drivers are broken ###
+
+# see https://github.com/raspberrypi/linux/issues/875
+# CONFIG_USB_UAS is not set
diff --git a/projects/RPi2/linux/linux.arm.conf b/projects/RPi2/linux/linux.arm.conf
index a21ae7bec21..c72751f893b 100644
--- a/projects/RPi2/linux/linux.arm.conf
+++ b/projects/RPi2/linux/linux.arm.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm 4.9.13 Kernel Configuration
+# Linux/arm 4.9.28 Kernel Configuration
 #
 CONFIG_ARM=y
 CONFIG_ARM_HAS_SG_CHAIN=y
@@ -582,9 +582,9 @@ CONFIG_CPU_FREQ_GOV_COMMON=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=y
 # CONFIG_CPU_FREQ_GOV_USERSPACE is not set
@@ -624,7 +624,7 @@ CONFIG_CPU_IDLE_GOV_MENU=y
 CONFIG_VFP=y
 CONFIG_VFPv3=y
 CONFIG_NEON=y
-# CONFIG_KERNEL_MODE_NEON is not set
+CONFIG_KERNEL_MODE_NEON=y
 
 #
 # Userspace binary formats
@@ -954,7 +954,7 @@ CONFIG_I2C_BCM2708_BAUDRATE=100000
 #
 # I2C system bus drivers (mostly embedded / system-on-chip)
 #
-# CONFIG_I2C_BCM2835 is not set
+CONFIG_I2C_BCM2835=m
 # CONFIG_I2C_CBUS_GPIO is not set
 # CONFIG_I2C_DESIGNWARE_PLATFORM is not set
 # CONFIG_I2C_EMEV2 is not set
@@ -1150,6 +1150,7 @@ CONFIG_POWER_SUPPLY=y
 # CONFIG_CHARGER_MAX8903 is not set
 # CONFIG_CHARGER_LP8727 is not set
 # CONFIG_CHARGER_GPIO is not set
+# CONFIG_CHARGER_MANAGER is not set
 # CONFIG_CHARGER_BQ2415X is not set
 # CONFIG_CHARGER_BQ24190 is not set
 # CONFIG_CHARGER_BQ24257 is not set
@@ -1446,7 +1447,41 @@ CONFIG_MFD_WM5102=y
 # CONFIG_MFD_WM831X_SPI is not set
 # CONFIG_MFD_WM8350_I2C is not set
 # CONFIG_MFD_WM8994 is not set
-# CONFIG_REGULATOR is not set
+CONFIG_REGULATOR=y
+# CONFIG_REGULATOR_DEBUG is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
+# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
+# CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
+# CONFIG_REGULATOR_ACT8865 is not set
+# CONFIG_REGULATOR_AD5398 is not set
+# CONFIG_REGULATOR_ANATOP is not set
+# CONFIG_REGULATOR_DA9210 is not set
+# CONFIG_REGULATOR_DA9211 is not set
+# CONFIG_REGULATOR_FAN53555 is not set
+# CONFIG_REGULATOR_GPIO is not set
+# CONFIG_REGULATOR_ISL9305 is not set
+# CONFIG_REGULATOR_ISL6271A is not set
+# CONFIG_REGULATOR_LP3971 is not set
+# CONFIG_REGULATOR_LP3972 is not set
+# CONFIG_REGULATOR_LP872X is not set
+# CONFIG_REGULATOR_LP8755 is not set
+# CONFIG_REGULATOR_LTC3589 is not set
+# CONFIG_REGULATOR_LTC3676 is not set
+# CONFIG_REGULATOR_MAX1586 is not set
+# CONFIG_REGULATOR_MAX8649 is not set
+# CONFIG_REGULATOR_MAX8660 is not set
+# CONFIG_REGULATOR_MAX8952 is not set
+# CONFIG_REGULATOR_MAX8973 is not set
+# CONFIG_REGULATOR_MT6311 is not set
+# CONFIG_REGULATOR_PFUZE100 is not set
+# CONFIG_REGULATOR_PV88060 is not set
+# CONFIG_REGULATOR_PV88080 is not set
+# CONFIG_REGULATOR_PV88090 is not set
+# CONFIG_REGULATOR_TPS51632 is not set
+# CONFIG_REGULATOR_TPS62360 is not set
+# CONFIG_REGULATOR_TPS65023 is not set
+# CONFIG_REGULATOR_TPS6507X is not set
+# CONFIG_REGULATOR_TPS6524X is not set
 # CONFIG_MEDIA_SUPPORT is not set
 
 #
@@ -1588,7 +1623,7 @@ CONFIG_RTC_DRV_DS1307_CENTURY=y
 # CONFIG_RTC_DRV_X1205 is not set
 CONFIG_RTC_DRV_PCF8523=m
 # CONFIG_RTC_DRV_PCF85063 is not set
-# CONFIG_RTC_DRV_PCF8563 is not set
+CONFIG_RTC_DRV_PCF8563=m
 # CONFIG_RTC_DRV_PCF8583 is not set
 # CONFIG_RTC_DRV_M41T80 is not set
 # CONFIG_RTC_DRV_BQ32K is not set
@@ -1792,7 +1827,17 @@ CONFIG_RASPBERRYPI_POWER=y
 # CONFIG_SUNXI_SRAM is not set
 # CONFIG_SOC_TI is not set
 # CONFIG_PM_DEVFREQ is not set
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_GPIO is not set
+# CONFIG_EXTCON_MAX3355 is not set
+# CONFIG_EXTCON_QCOM_SPMI_MISC is not set
+# CONFIG_EXTCON_RT8973A is not set
+# CONFIG_EXTCON_SM5502 is not set
+# CONFIG_EXTCON_USB_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_PWM is not set
@@ -2267,9 +2312,15 @@ CONFIG_CRYPTO_JITTERENTROPY=y
 #
 CONFIG_ARM_CRYPTO=y
 CONFIG_CRYPTO_SHA1_ARM=m
+# CONFIG_CRYPTO_SHA1_ARM_NEON is not set
+# CONFIG_CRYPTO_SHA1_ARM_CE is not set
+# CONFIG_CRYPTO_SHA2_ARM_CE is not set
 CONFIG_CRYPTO_SHA256_ARM=m
 # CONFIG_CRYPTO_SHA512_ARM is not set
 CONFIG_CRYPTO_AES_ARM=m
+# CONFIG_CRYPTO_AES_ARM_BS is not set
+# CONFIG_CRYPTO_AES_ARM_CE is not set
+# CONFIG_CRYPTO_GHASH_ARM_CE is not set
 # CONFIG_BINARY_PRINTF is not set
 
 #
diff --git a/projects/RPi2/options b/projects/RPi2/options
index a269ed081bb..c9679e1aacc 100644
--- a/projects/RPi2/options
+++ b/projects/RPi2/options
@@ -104,6 +104,11 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
+  # Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
+  # Space separated list is supported,
+  # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
+    GRAPHIC_DRIVERS=""
+
   # Windowmanager to use (ratpoison / fluxbox / none)
     WINDOWMANAGER="none"
 
@@ -118,14 +123,18 @@
   # Modules to install in initramfs for early boot
     INITRAMFS_MODULES=""
 
+  # install extra kernel drivers, see configfiles in packages/linux/config/*.config
+  # or projects/*/linux/*.config or distributions/*/linux/*.config
+  # please add all configfiles which should be included without the suffix and prefix
+  # except the default-*.config and the ones included in packages/linux/package.mk
+  # Space separated list is supported,
+    KERNEL_EXTRA_CONFIG+=" fix-config"
+
   # additional Firmware to use (dvb-firmware, misc-firmware, wlan-firmware)
   # Space separated list is supported,
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
     FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
-
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
 
diff --git a/projects/WeTek_Core/linux/default-lan-phy.config b/projects/WeTek_Core/linux/default-lan-phy.config
index 71398cf0efd..a5fef78c6f6 100644
--- a/projects/WeTek_Core/linux/default-lan-phy.config
+++ b/projects/WeTek_Core/linux/default-lan-phy.config
@@ -8,6 +8,6 @@ CONFIG_OF_MDIO=y
 
 CONFIG_AML_PHY=y
 CONFIG_AML_LAN8720=y
-CONFIG_AML_IP101_PHY=m
-CONFIG_AML_KSZ8091=m
-CONFIG_AML_RTL8211F=m
+CONFIG_AML_IP101_PHY=y
+CONFIG_AML_KSZ8091=y
+CONFIG_AML_RTL8211F=y
diff --git a/projects/WeTek_Core/linux/linux.arm.conf b/projects/WeTek_Core/linux/linux.arm.conf
index 45fa0f8ed65..76dfb45ee60 100644
--- a/projects/WeTek_Core/linux/linux.arm.conf
+++ b/projects/WeTek_Core/linux/linux.arm.conf
@@ -503,8 +503,8 @@ CONFIG_CPU_FREQ_STAT=y
 # CONFIG_CPU_FREQ_STAT_DETAILS is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
@@ -1837,7 +1837,7 @@ CONFIG_BCMA_POSSIBLE=y
 CONFIG_REGULATOR=y
 # CONFIG_REGULATOR_DEBUG is not set
 # CONFIG_REGULATOR_DUMMY is not set
-# CONFIG_REGULATOR_FIXED_VOLTAGE is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
 # CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
 # CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
 # CONFIG_REGULATOR_GPIO is not set
@@ -2187,7 +2187,12 @@ CONFIG_OF_IOMMU=y
 # Rpmsg drivers
 #
 # CONFIG_PM_DEVFREQ is not set
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_PWM is not set
diff --git a/projects/WeTek_Core/options b/projects/WeTek_Core/options
index 9f5db594f1b..a224cc416c2 100644
--- a/projects/WeTek_Core/options
+++ b/projects/WeTek_Core/options
@@ -105,14 +105,14 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
-  # Windowmanager to use (ratpoison / fluxbox / none)
-    WINDOWMANAGER="none"
-
   # Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
   # Space separated list is supported,
   # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
     GRAPHIC_DRIVERS=""
 
+  # Windowmanager to use (ratpoison / fluxbox / none)
+    WINDOWMANAGER="none"
+
   # KODI Player implementation to use (default / bcm2835-firmware / libfslvpuwrap / libamcodec)
     KODIPLAYER_DRIVER="libamcodec"
 
@@ -123,7 +123,7 @@
   # for a list of additinoal drivers see packages/linux-drivers
   # Space separated list is supported,
   # e.g. ADDITIONAL_DRIVERS="DRIVER1 DRIVER2"
-    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU" # todo: dvbhdhomerun
+    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU libhdhomerun"
 
   # install extra kernel drivers, see configfiles in packages/linux/config/*.config
   # or projects/*/linux/*.config or distributions/*/linux/*.config
@@ -137,9 +137,6 @@
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
     FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
-
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
 
diff --git a/projects/WeTek_Hub/linux/linux.aarch64.conf b/projects/WeTek_Hub/linux/linux.aarch64.conf
index 3a673f39897..cafa5087d37 100644
--- a/projects/WeTek_Hub/linux/linux.aarch64.conf
+++ b/projects/WeTek_Hub/linux/linux.aarch64.conf
@@ -407,8 +407,8 @@ CONFIG_CPU_FREQ_STAT=y
 # CONFIG_CPU_FREQ_STAT_DETAILS is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
@@ -1467,7 +1467,7 @@ CONFIG_BCMA_POSSIBLE=y
 CONFIG_VEXPRESS_CONFIG=y
 CONFIG_REGULATOR=y
 # CONFIG_REGULATOR_DEBUG is not set
-# CONFIG_REGULATOR_FIXED_VOLTAGE is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
 # CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
 # CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
 # CONFIG_REGULATOR_ACT8865 is not set
@@ -1906,7 +1906,13 @@ CONFIG_PM_DEVFREQ=y
 #
 # DEVFREQ Drivers
 #
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+CONFIG_OF_EXTCON=m
+# CONFIG_EXTCON_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_PWM is not set
diff --git a/projects/WeTek_Hub/options b/projects/WeTek_Hub/options
index 9844eb3cef0..33ba5c341ed 100644
--- a/projects/WeTek_Hub/options
+++ b/projects/WeTek_Hub/options
@@ -93,14 +93,14 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
-  # Windowmanager to use (ratpoison / fluxbox / none)
-    WINDOWMANAGER="none"
-
   # Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
   # Space separated list is supported,
   # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
     GRAPHIC_DRIVERS=""
 
+  # Windowmanager to use (ratpoison / fluxbox / none)
+    WINDOWMANAGER="none"
+
   # KODI Player implementation to use (default / bcm2835-firmware / libfslvpuwrap / libamcodec)
     KODIPLAYER_DRIVER="libamcodec"
 
@@ -111,7 +111,7 @@
   # for a list of additinoal drivers see packages/linux-drivers
   # Space separated list is supported,
   # e.g. ADDITIONAL_DRIVERS="DRIVER1 DRIVER2"
-    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU gpu-aml brcmap6xxx-aml" # todo: dvbhdhomerun
+    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU gpu-aml brcmap6xxx-aml libhdhomerun"
 
   # install extra kernel drivers, see configfiles in packages/linux/config/*.config
   # or projects/*/linux/*.config or distributions/*/linux/*.config
@@ -125,9 +125,6 @@
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
     FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
-
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
 
diff --git a/projects/WeTek_Play/linux/default-lan-phy.config b/projects/WeTek_Play/linux/default-lan-phy.config
index bb786d7cdba..a5fef78c6f6 100644
--- a/projects/WeTek_Play/linux/default-lan-phy.config
+++ b/projects/WeTek_Play/linux/default-lan-phy.config
@@ -8,7 +8,6 @@ CONFIG_OF_MDIO=y
 
 CONFIG_AML_PHY=y
 CONFIG_AML_LAN8720=y
-CONFIG_AML_IP101_PHY=m
-CONFIG_AML_KSZ8091=m
-CONFIG_AML_RTL8211F=m
-
+CONFIG_AML_IP101_PHY=y
+CONFIG_AML_KSZ8091=y
+CONFIG_AML_RTL8211F=y
diff --git a/projects/WeTek_Play/linux/linux.arm.conf b/projects/WeTek_Play/linux/linux.arm.conf
index e16357e9612..e94a66e0647 100644
--- a/projects/WeTek_Play/linux/linux.arm.conf
+++ b/projects/WeTek_Play/linux/linux.arm.conf
@@ -515,8 +515,8 @@ CONFIG_CPU_FREQ_STAT=y
 # CONFIG_CPU_FREQ_STAT_DETAILS is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
@@ -1852,7 +1852,7 @@ CONFIG_BCMA_POSSIBLE=y
 CONFIG_REGULATOR=y
 # CONFIG_REGULATOR_DEBUG is not set
 # CONFIG_REGULATOR_DUMMY is not set
-# CONFIG_REGULATOR_FIXED_VOLTAGE is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
 # CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
 # CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
 # CONFIG_REGULATOR_GPIO is not set
@@ -2242,7 +2242,12 @@ CONFIG_OF_IOMMU=y
 # Rpmsg drivers
 #
 # CONFIG_PM_DEVFREQ is not set
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_PWM is not set
diff --git a/projects/WeTek_Play/options b/projects/WeTek_Play/options
index 0b0ab0eeafc..0e496b3a345 100644
--- a/projects/WeTek_Play/options
+++ b/projects/WeTek_Play/options
@@ -105,14 +105,14 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
-  # Windowmanager to use (ratpoison / fluxbox / none)
-    WINDOWMANAGER="none"
-
   # Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
   # Space separated list is supported,
   # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
     GRAPHIC_DRIVERS=""
 
+  # Windowmanager to use (ratpoison / fluxbox / none)
+    WINDOWMANAGER="none"
+
   # KODI Player implementation to use (default / bcm2835-firmware / libfslvpuwrap / libamcodec)
     KODIPLAYER_DRIVER="libamcodec"
 
@@ -123,7 +123,7 @@
   # for a list of additinoal drivers see packages/linux-drivers
   # Space separated list is supported,
   # e.g. ADDITIONAL_DRIVERS="DRIVER1 DRIVER2"
-    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU wetekdvb" # todo: dvbhdhomerun
+    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU wetekdvb libhdhomerun"
 
   # install extra kernel drivers, see configfiles in packages/linux/config/*.config
   # or projects/*/linux/*.config or distributions/*/linux/*.config
@@ -136,9 +136,6 @@
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
     FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
-
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
 
diff --git a/projects/WeTek_Play2/linux/linux.aarch64.conf b/projects/WeTek_Play2/linux/linux.aarch64.conf
index 3a673f39897..cafa5087d37 100644
--- a/projects/WeTek_Play2/linux/linux.aarch64.conf
+++ b/projects/WeTek_Play2/linux/linux.aarch64.conf
@@ -407,8 +407,8 @@ CONFIG_CPU_FREQ_STAT=y
 # CONFIG_CPU_FREQ_STAT_DETAILS is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_HOTPLUG=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
@@ -1467,7 +1467,7 @@ CONFIG_BCMA_POSSIBLE=y
 CONFIG_VEXPRESS_CONFIG=y
 CONFIG_REGULATOR=y
 # CONFIG_REGULATOR_DEBUG is not set
-# CONFIG_REGULATOR_FIXED_VOLTAGE is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
 # CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
 # CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
 # CONFIG_REGULATOR_ACT8865 is not set
@@ -1906,7 +1906,13 @@ CONFIG_PM_DEVFREQ=y
 #
 # DEVFREQ Drivers
 #
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+CONFIG_OF_EXTCON=m
+# CONFIG_EXTCON_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_PWM is not set
diff --git a/projects/WeTek_Play2/options b/projects/WeTek_Play2/options
index b17e4109952..ed0990bd486 100644
--- a/projects/WeTek_Play2/options
+++ b/projects/WeTek_Play2/options
@@ -93,14 +93,14 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
-  # Windowmanager to use (ratpoison / fluxbox / none)
-    WINDOWMANAGER="none"
-
   # Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
   # Space separated list is supported,
   # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
     GRAPHIC_DRIVERS=""
 
+  # Windowmanager to use (ratpoison / fluxbox / none)
+    WINDOWMANAGER="none"
+
   # KODI Player implementation to use (default / bcm2835-firmware / libfslvpuwrap / libamcodec)
     KODIPLAYER_DRIVER="libamcodec"
 
@@ -111,7 +111,7 @@
   # for a list of additinoal drivers see packages/linux-drivers
   # Space separated list is supported,
   # e.g. ADDITIONAL_DRIVERS="DRIVER1 DRIVER2"
-    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU gpu-aml brcmap6xxx-aml wetekdvb" # todo: dvbhdhomerun
+    ADDITIONAL_DRIVERS="RTL8192DU RTL8192EU RTL8812AU gpu-aml brcmap6xxx-aml wetekdvb libhdhomerun"
 
   # install extra kernel drivers, see configfiles in packages/linux/config/*.config
   # or projects/*/linux/*.config or distributions/*/linux/*.config
@@ -125,9 +125,6 @@
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
     FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
-
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
 
diff --git a/projects/imx6/linux/default-lan-imx.config b/projects/imx6/linux/default-lan-imx.config
index 9b92c97d59b..e0106c20aae 100644
--- a/projects/imx6/linux/default-lan-imx.config
+++ b/projects/imx6/linux/default-lan-imx.config
@@ -5,7 +5,7 @@ CONFIG_NETDEVICES=y
 CONFIG_ETHERNET=y
 
 CONFIG_NET_VENDOR_FREESCALE=y
-CONFIG_FEC=m
+CONFIG_FEC=y
 CONFIG_FSL_PQ_MDIO=y
 # CONFIG_FSL_XGMAC_MDIO is not set
 CONFIG_GIANFAR=y
diff --git a/projects/imx6/linux/linux.arm.conf b/projects/imx6/linux/linux.arm.conf
index 03cba696e6a..525c070fd12 100644
--- a/projects/imx6/linux/linux.arm.conf
+++ b/projects/imx6/linux/linux.arm.conf
@@ -632,9 +632,9 @@ CONFIG_CPU_FREQ_STAT=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=y
 # CONFIG_CPU_FREQ_GOV_USERSPACE is not set
@@ -1791,7 +1791,6 @@ CONFIG_FB_MODE_HELPERS=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
-# CONFIG_FB_MX3 is not set
 # CONFIG_FB_BROADSHEET is not set
 # CONFIG_FB_AUO_K190X is not set
 # CONFIG_FB_MXS is not set
@@ -1961,8 +1960,7 @@ CONFIG_FSL_EDMA=y
 CONFIG_IMX_SDMA=y
 # CONFIG_INTEL_IDMA64 is not set
 CONFIG_MXS_DMA=y
-CONFIG_MX3_IPU=y
-CONFIG_MX3_IPU_IRQS=4
+# CONFIG_MX3_IPU is not set
 CONFIG_MXC_PXP_V2=y
 CONFIG_MXC_PXP_CLIENT_DEVICE=y
 # CONFIG_NBPFAXI_DMA is not set
@@ -2081,7 +2079,16 @@ CONFIG_CLKSRC_IMX_GPT=y
 # CONFIG_SUNXI_SRAM is not set
 # CONFIG_SOC_TI is not set
 # CONFIG_PM_DEVFREQ is not set
-# CONFIG_EXTCON is not set
+CONFIG_EXTCON=m
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_GPIO is not set
+# CONFIG_EXTCON_MAX3355 is not set
+# CONFIG_EXTCON_RT8973A is not set
+# CONFIG_EXTCON_SM5502 is not set
+# CONFIG_EXTCON_USB_GPIO is not set
 # CONFIG_MEMORY is not set
 # CONFIG_IIO is not set
 # CONFIG_NTB is not set
diff --git a/projects/imx6/options b/projects/imx6/options
index 357c4d9949e..99012f974ff 100644
--- a/projects/imx6/options
+++ b/projects/imx6/options
@@ -106,14 +106,14 @@
   # Displayserver to use (x11 / no)
     DISPLAYSERVER="no"
 
-  # Windowmanager to use (ratpoison / fluxbox / none)
-    WINDOWMANAGER="none"
-
   # Xorg Graphic drivers to use (all / i915,i965,r200,r300,r600,nvidia)
   # Space separated list is supported,
   # e.g. GRAPHIC_DRIVERS="i915 i965 r300 r600 radeonsi nvidia"
     GRAPHIC_DRIVERS=""
 
+  # Windowmanager to use (ratpoison / fluxbox / none)
+    WINDOWMANAGER="none"
+
   # KODI Player implementation to use (default / bcm2835-firmware / libfslvpuwrap)
     KODIPLAYER_DRIVER="libfslvpuwrap"
 
@@ -130,10 +130,7 @@
   # additional Firmware to use (dvb-firmware, misc-firmware, wlan-firmware)
   # Space separated list is supported,
   # e.g. FIRMWARE="dvb-firmware misc-firmware wlan-firmware"
-    FIRMWARE="misc-firmware wlan-firmware iwlwifi-firmware dvb-firmware"
-
-  # build and install ATV IR remote support (yes / no)
-    ATVCLIENT_SUPPORT="no"
+    FIRMWARE="misc-firmware wlan-firmware dvb-firmware"
 
   # build and install IRServer IR/LCD support (yes / no)
     IRSERVER_SUPPORT="no"
diff --git a/scripts/build b/scripts/build
index 13db1f7384b..52887e03ebb 100755
--- a/scripts/build
+++ b/scripts/build
@@ -337,7 +337,7 @@ if [ ! -f $STAMP ]; then
             $STRIP `find $INSTALL -name "*.so.[0-9]*" 2>/dev/null` 2>/dev/null || :
 
             # strip kernel modules
-            for MOD in `find $INSTALL -type f -name *.ko`; do
+            for MOD in `find $INSTALL/lib $INSTALL/usr/lib -type f -name *.ko`; do
               $STRIP --strip-debug $MOD
             done
           fi
diff --git a/scripts/create_addon b/scripts/create_addon
index 333a844adbd..ba97b86c011 100755
--- a/scripts/create_addon
+++ b/scripts/create_addon
@@ -186,17 +186,11 @@ if [ "$PKG_IS_ADDON" = "yes" ] ; then
 
   rm -rf $ADDON_BUILD
 
-
   if [ -d $PKG_BUILD/.install_pkg/usr/share/kodi/addons/$PKG_NAME ]; then
     mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/
       cp -PR $PKG_BUILD/.install_pkg/usr/share/kodi/addons/$PKG_NAME/* $ADDON_BUILD/$PKG_ADDON_ID/
       if [ -d $PKG_BUILD/.install_pkg/usr/lib/kodi/addons/$PKG_NAME ]; then
-        if [  -f $ADDON_BUILD/$PKG_ADDON_ID/addon.xml ]; then
-          ADDONSO=$(xmlstarlet sel -t -v "/addon/extension/@library_linux" $ADDON_BUILD/$PKG_ADDON_ID/addon.xml)
-          cp -PL $PKG_BUILD/.install_pkg/usr/lib/$MEDIACENTER/addons/$PKG_NAME/$ADDONSO $ADDON_BUILD/$PKG_ADDON_ID/
-        else
-          cp -PL $PKG_BUILD/.install_pkg/usr/lib/$MEDIACENTER/addons/$PKG_NAME/*.so* $ADDON_BUILD/$PKG_ADDON_ID/
-        fi
+        cp -PL $PKG_BUILD/.install_pkg/usr/lib/$MEDIACENTER/addons/$PKG_NAME/*.so* $ADDON_BUILD/$PKG_ADDON_ID/
       fi
     if [ "$(type -t addon)" = "function" ]; then
       addon