diff --git a/miniz_oxide/src/deflate/buffer.rs b/miniz_oxide/src/deflate/buffer.rs index d36dc4a..ec4e499 100644 --- a/miniz_oxide/src/deflate/buffer.rs +++ b/miniz_oxide/src/deflate/buffer.rs @@ -25,6 +25,7 @@ pub const fn update_hash(current_hash: u16, byte: u8) -> u16 { ((current_hash << LZ_HASH_SHIFT) ^ byte as u16) & (LZ_HASH_SIZE as u16 - 1) } +#[derive(Clone)] pub struct HashBuffers { pub dict: Box<[u8; LZ_DICT_FULL_SIZE]>, pub next: Box<[u16; LZ_DICT_SIZE]>, @@ -53,6 +54,7 @@ impl Default for HashBuffers { } } +#[derive(Clone)] pub struct LocalBuf { pub b: [u8; OUT_BUF_SIZE], } diff --git a/miniz_oxide/src/deflate/core.rs b/miniz_oxide/src/deflate/core.rs index 3a5dfd2..9871439 100644 --- a/miniz_oxide/src/deflate/core.rs +++ b/miniz_oxide/src/deflate/core.rs @@ -223,10 +223,12 @@ pub enum TDEFLFlush { /// Compress as much as there is space for, and then return waiting for more input. None = 0, - /// Try to flush all the current data and output an empty fixed block. + /// Try to flush all the current data and output an empty fixed + /// block (10 bits) to synchronize the stream. Partial = 1, - /// Try to flush all the current data and output an empty raw block. + /// Try to flush all the current data and output an empty raw + /// block (3-10 bits + 32 bits) to synchronize the stream. Sync = 2, /// Same as [`Sync`][Self::Sync], but reset the dictionary so that the following data does not @@ -237,6 +239,23 @@ /// /// On success this will yield a [`TDEFLStatus::Done`] return status. Finish = 4, + + /// Try to flush all the current data and, if data is unaligned, + /// output an empty fixed block (10 bits) to synchronize the + /// stream. + PartialOpt = 5, + + /// Try to flush all the current data and, if data is unaligned, + /// output an empty raw block (3-10 bits + 32 bits) to synchronize + /// the stream. + SyncOpt = 6, + + /// Try to flush the current data but without any sync, which may + /// leave up to 7 bits of data not output. 
You can use + /// `TDEFLFlush::PartialOpt` or `TDEFLFlush::SyncOpt` to add a + /// sync sequence on a future call if you later decide that you + /// have space downstream to forward that final byte. + NoSync = 7, } impl From<MZFlush> for TDEFLFlush { @@ -260,6 +279,9 @@ 2 => Ok(TDEFLFlush::Sync), 3 => Ok(TDEFLFlush::Full), 4 => Ok(TDEFLFlush::Finish), + 5 => Ok(TDEFLFlush::PartialOpt), + 6 => Ok(TDEFLFlush::SyncOpt), + 7 => Ok(TDEFLFlush::NoSync), _ => Err(MZError::Param), } } @@ -327,6 +349,7 @@ const fn read_u16_le<const N: usize>(slice: &[u8; N], pos: usize) -> u16 { } /// Main compression struct. +#[derive(Clone)] pub struct CompressorOxide { pub(crate) lz: LZOxide, pub(crate) params: ParamsOxide, @@ -428,6 +451,13 @@ impl CompressorOxide { self.params.update_flags(flags); self.dict.update_flags(flags); } + + /// Check the number of unwritten bits after the last flush. + /// After a `NoSync` flush it can be used to test whether the + /// stream is aligned with a byte boundary. + pub fn unwritten_bit_count(&self) -> u32 { + self.params.saved_bits_in + } } impl Default for CompressorOxide { @@ -649,6 +679,13 @@ impl OutputBufferOxide<'_> { } } + /// Test whether the output is currently on a byte boundary, + /// i.e. all current data has been output + #[inline] + fn is_byte_aligned(&self) -> bool { + self.bits_in == 0 + } + #[inline] fn write_bytes(&mut self, bytes: &[u8]) { debug_assert_eq!(self.bits_in, 0); @@ -698,6 +735,7 @@ impl BitBuffer { /// NOTE: Only the literal/lengths have enough symbols to actually use /// the full array. It's unclear why it's defined like this in miniz, /// it could be for cache/alignment reasons. +#[derive(Clone)] pub(crate) struct HuffmanOxide { /// Number of occurrences of each symbol. pub count: [[u16; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], @@ -1131,6 +1169,7 @@ impl HuffmanOxide { } } +#[derive(Clone)] pub(crate) struct DictOxide { /// The maximum number of checks in the hash chain, for the initial, /// and the lazy match respectively. 
@@ -1344,6 +1383,7 @@ impl DictOxide { } } +#[derive(Clone)] pub(crate) struct ParamsOxide { pub flags: u32, pub greedy_parsing: bool, @@ -1419,6 +1459,7 @@ impl ParamsOxide { } } +#[derive(Clone)] pub(crate) struct LZOxide { pub codes: [u8; LZ_CODE_BUF_SIZE], pub code_position: usize, @@ -1610,85 +1651,88 @@ pub(crate) fn flush_block( output.bit_buffer = d.params.saved_bit_buffer; output.bits_in = d.params.saved_bits_in; - // TODO: Don't think this second condition should be here but need to verify. - let use_raw_block = (d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0) - && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos) <= d.dict.size; - debug_assert_eq!( - use_raw_block, - d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0 - ); + if d.lz.total_bytes > 0 || flush == TDEFLFlush::Finish { + // TODO: Don't think this second condition should be here but need to verify. + let use_raw_block = (d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0) + && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos) <= d.dict.size; + debug_assert_eq!( + use_raw_block, + d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0 + ); - assert!(d.params.flush_remaining == 0); - d.params.flush_ofs = 0; - d.params.flush_remaining = 0; + assert!(d.params.flush_remaining == 0); + d.params.flush_ofs = 0; + d.params.flush_remaining = 0; - d.lz.init_flag(); + d.lz.init_flag(); - // If we are at the start of the stream, write the zlib header if requested. - if d.params.flags & TDEFL_WRITE_ZLIB_HEADER != 0 && d.params.block_index == 0 { - let header = zlib::header_from_flags(d.params.flags); - output.put_bits_no_flush(header[0].into(), 8); - output.put_bits(header[1].into(), 8); - } + // If we are at the start of the stream, write the zlib header if requested. 
+ if d.params.flags & TDEFL_WRITE_ZLIB_HEADER != 0 && d.params.block_index == 0 { + let header = zlib::header_from_flags(d.params.flags); + output.put_bits_no_flush(header[0].into(), 8); + output.put_bits(header[1].into(), 8); + } - // Output the block header. - output.put_bits((flush == TDEFLFlush::Finish) as u32, 1); + // Output the block header. + output.put_bits((flush == TDEFLFlush::Finish) as u32, 1); - saved_buffer = output.save(); + saved_buffer = output.save(); - let comp_success = if !use_raw_block { - let use_static = - (d.params.flags & TDEFL_FORCE_ALL_STATIC_BLOCKS != 0) || (d.lz.total_bytes < 48); - compress_block(&mut d.huff, &mut output, &d.lz, use_static)? - } else { - false - }; + let comp_success = if !use_raw_block { + let use_static = (d.params.flags & TDEFL_FORCE_ALL_STATIC_BLOCKS != 0) + || (d.lz.total_bytes < 48); + compress_block(&mut d.huff, &mut output, &d.lz, use_static)? + } else { + false + }; + + // If we failed to compress anything and the output would take up more space than the output + // data, output a stored block instead, which has at most 5 bytes of overhead. + // We only use some simple heuristics for now. + // A stored block will have an overhead of at least 4 bytes containing the block length + // but usually more due to the length parameters having to start at a byte boundary and thus + // requiring up to 5 bytes of padding. + // As a static block will have an overhead of at most 1 bit per byte + // (as literals are either 8 or 9 bytes), a raw block will + // never take up less space if the number of input bytes are less than 32. + let expanded = (d.lz.total_bytes > 32) + && (output.inner_pos - saved_buffer.pos + 1 >= (d.lz.total_bytes as usize)) + && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos <= d.dict.size); + + if use_raw_block || expanded { + output.load(saved_buffer); + + // Block header. + output.put_bits(0, 2); + + // Block length has to start on a byte boundary, so pad. 
+ output.pad_to_bytes(); - // If we failed to compress anything and the output would take up more space than the output - // data, output a stored block instead, which has at most 5 bytes of overhead. - // We only use some simple heuristics for now. - // A stored block will have an overhead of at least 4 bytes containing the block length - // but usually more due to the length parameters having to start at a byte boundary and thus - // requiring up to 5 bytes of padding. - // As a static block will have an overhead of at most 1 bit per byte - // (as literals are either 8 or 9 bytes), a raw block will - // never take up less space if the number of input bytes are less than 32. - let expanded = (d.lz.total_bytes > 32) - && (output.inner_pos - saved_buffer.pos + 1 >= (d.lz.total_bytes as usize)) - && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos <= d.dict.size); - - if use_raw_block || expanded { - output.load(saved_buffer); - - // Block header. - output.put_bits(0, 2); - - // Block length has to start on a byte boundary, so pad. - output.pad_to_bytes(); - - // Block length and ones complement of block length. - output.put_bits(d.lz.total_bytes & 0xFFFF, 16); - output.put_bits(!d.lz.total_bytes & 0xFFFF, 16); - - // Write the actual bytes. - let start = d.dict.code_buf_dict_pos & LZ_DICT_SIZE_MASK; - let end = (d.dict.code_buf_dict_pos + d.lz.total_bytes as usize) & LZ_DICT_SIZE_MASK; - let dict = &mut d.dict.b.dict; - if start < end { - // The data does not wrap around. - output.write_bytes(&dict[start..end]); - } else if d.lz.total_bytes > 0 { - // The data wraps around and the input was not 0 bytes. - output.write_bytes(&dict[start..LZ_DICT_SIZE]); - output.write_bytes(&dict[..end]); + // Block length and ones complement of block length. + output.put_bits(d.lz.total_bytes & 0xFFFF, 16); + output.put_bits(!d.lz.total_bytes & 0xFFFF, 16); + + // Write the actual bytes. 
+ let start = d.dict.code_buf_dict_pos & LZ_DICT_SIZE_MASK; + let end = + (d.dict.code_buf_dict_pos + d.lz.total_bytes as usize) & LZ_DICT_SIZE_MASK; + let dict = &mut d.dict.b.dict; + if start < end { + // The data does not wrap around. + output.write_bytes(&dict[start..end]); + } else if d.lz.total_bytes > 0 { + // The data wraps around and the input was not 0 bytes. + output.write_bytes(&dict[start..LZ_DICT_SIZE]); + output.write_bytes(&dict[..end]); + } + } else if !comp_success { + output.load(saved_buffer); + compress_block(&mut d.huff, &mut output, &d.lz, true)?; } - } else if !comp_success { - output.load(saved_buffer); - compress_block(&mut d.huff, &mut output, &d.lz, true)?; } - if flush != TDEFLFlush::None { - if flush == TDEFLFlush::Finish { + match flush { + TDEFLFlush::Finish => { output.pad_to_bytes(); if d.params.flags & TDEFL_WRITE_ZLIB_HEADER != 0 { let mut adler = d.params.adler32; @@ -1697,16 +1741,31 @@ pub(crate) fn flush_block( adler <<= 8; } } - } else if flush == TDEFLFlush::Partial { + } + TDEFLFlush::Partial => { output.put_bits(2, 10); - } else { - // Sync or Full flush. + } + TDEFLFlush::PartialOpt => { + if !output.is_byte_aligned() { + output.put_bits(2, 10); + } + } + TDEFLFlush::Sync | TDEFLFlush::Full => { // Output an empty raw block. 
output.put_bits(0, 3); output.pad_to_bytes(); output.put_bits(0, 16); output.put_bits(0xFFFF, 16); } + TDEFLFlush::SyncOpt => { + if !output.is_byte_aligned() { + output.put_bits(0, 3); + output.pad_to_bytes(); + output.put_bits(0, 16); + output.put_bits(0xFFFF, 16); + } + } + TDEFLFlush::None | TDEFLFlush::NoSync => (), } d.huff.count[0][..MAX_HUFF_SYMBOLS_0].fill(0); diff --git a/miniz_oxide/tests/flush.rs b/miniz_oxide/tests/flush.rs new file mode 100644 index 0000000..187903a --- /dev/null +++ b/miniz_oxide/tests/flush.rs @@ -0,0 +1,212 @@ +#![cfg(feature = "with-alloc")] + +use miniz_oxide::deflate::core::{compress_to_output, CompressorOxide, TDEFLFlush, TDEFLStatus}; +use miniz_oxide::deflate::CompressionLevel; +use miniz_oxide::inflate::core::decompress; +use miniz_oxide::inflate::core::inflate_flags::*; +use miniz_oxide::inflate::core::DecompressorOxide; +use miniz_oxide::inflate::TINFLStatus; +use miniz_oxide::DataFormat; + +/// Looks for byte-strings which when compressed result in each of the +/// final number of bits 0..=7, and then tests each of the flush modes +/// on that byte-string to see that it gives the correct result, +/// i.e. right number of bits added, and successful sync of the +/// stream. +#[test] +fn test_flush() { + let mut found = 0; + let mut n = 1; + while found != 255 { + let data = Rng::new(987654321).octal(n); + n += 1; + + let base = compress(&data, &[TDEFLFlush::NoSync]); + + let mask = 1 << (base & 7); + if (found & mask) != 0 { + continue; + } + found |= mask; + + for nosync_first in [false, true] { + for mode in [ + TDEFLFlush::Partial, + TDEFLFlush::Sync, + TDEFLFlush::Full, + TDEFLFlush::Finish, + TDEFLFlush::PartialOpt, + TDEFLFlush::SyncOpt, + ] { + if nosync_first && mode == TDEFLFlush::Finish { + // `Finish` has to output a block even if empty to + // pass the finish flag, so skip as NoSync would + // be expected to change the output length in this + // case. 
For all other cases doing a NoSync first + // should make no difference to the output. + continue; + } + let bits = if nosync_first { + compress(&data, &[TDEFLFlush::NoSync, mode]) + } else { + compress(&data, &[mode]) + }; + let expected = match mode { + TDEFLFlush::Partial => base + 10, + TDEFLFlush::Sync | TDEFLFlush::Full => { + // 3 bits, pad-to-byte, 16+16-bit length + ((base + 3 - 1) | 7) + 1 + 32 + } + TDEFLFlush::Finish => { + // Pad-to-byte, Zlib trailer + ((base - 1) | 7) + 1 + 32 + } + TDEFLFlush::PartialOpt => { + if (base & 7) != 0 { + base + 10 + } else { + base + } + } + TDEFLFlush::SyncOpt => { + if (base & 7) != 0 { + ((base + 3 - 1) | 7) + 1 + 32 + } else { + base + } + } + _ => panic!(), + }; + + assert_eq!( + bits, + expected, + "Unexpected flush behaviour for unwritten_bits={} \ + mode={mode:?} nosync_first={nosync_first}: \ + expecting {base} -> {expected}, but got {bits}", + base & 7 + ); + } + } + } +} + +// Low-quality RNG, copied from test.rs +struct Rng(u64); + +impl Rng { + fn new(seed: u32) -> Self { + Self(((seed as u64) << 16) | 0x330E) + } + fn octal(&mut self, n: usize) -> Vec<u8> { + self.map(|x| ((x & 7) + 48) as u8).take(n).collect() + } +} + +impl Iterator for Rng { + type Item = u32; + fn next(&mut self) -> Option<u32> { + self.0 = self.0.wrapping_mul(0x5DEECE66D).wrapping_add(0xB); + Some((self.0 >> 16) as u32) + } +} + +/// Compress data then apply the given flush modes in sequence and +/// return the number of output bits that result. Also checks that +/// the decompression matches if the flush modes used are expected to +/// sync the stream. 
+fn compress(mut data: &[u8], modes: &[TDEFLFlush]) -> usize { + let save_data = data; + let mut compressor = CompressorOxide::new(0); + compressor.set_format_and_level(DataFormat::Zlib, 0); + compressor.set_compression_level(CompressionLevel::BestCompression); + + let mut out = Vec::new(); + loop { + let mut ocount = 0; + let (status, icount) = + compress_to_output(&mut compressor, data, TDEFLFlush::None, |data| { + ocount += data.len(); + out.extend_from_slice(data); + true // Success + }); + assert!(!matches!( + status, + TDEFLStatus::BadParam | TDEFLStatus::PutBufFailed + )); + data = &data[icount..]; + if icount == 0 && ocount == 0 { + break; + } + } + + let mut check = false; + for &mode in modes { + let (status, _) = compress_to_output(&mut compressor, b"", mode, |data| { + out.extend_from_slice(data); + true // Success + }); + assert!(!matches!( + status, + TDEFLStatus::BadParam | TDEFLStatus::PutBufFailed + )); + if !matches!(mode, TDEFLFlush::NoSync | TDEFLFlush::None) { + check = true; + } + } + + if check { + // Check that sync really does what it is supposed to be + // doing, i.e. syncing the stream in the byte-stream output + check_partial_inflate(&out, save_data); + } + + out.len() * 8 + compressor.unwritten_bit_count() as usize +} + +/// Check that an unterminated Zlib stream matches the given +/// uncompressed data, i.e. 
that it has synced correctly +fn check_partial_inflate(compressed: &[u8], uncompressed: &[u8]) { + let mut out = Vec::new(); + let mut decompressor = DecompressorOxide::new(); + const DECODE_BUF_LEN: usize = 65536; + let mut obuf = vec![0; DECODE_BUF_LEN]; + let mut iread = 0; + let mut opos = 0; + loop { + let (status, icount, mut ocount) = decompress( + &mut decompressor, + &compressed[iread..], + &mut obuf[..], + opos, + TINFL_FLAG_HAS_MORE_INPUT | TINFL_FLAG_PARSE_ZLIB_HEADER, + ); + + assert!(!matches!( + status, + TINFLStatus::FailedCannotMakeProgress + | TINFLStatus::BadParam + | TINFLStatus::Adler32Mismatch + | TINFLStatus::Failed + )); + + if icount == 0 && ocount == 0 { + break; + } + + iread += icount; + + while ocount > 0 { + let count = ocount.min(obuf.len() - opos); + out.extend_from_slice(&obuf[opos..opos + count]); + opos = (opos + count) & (DECODE_BUF_LEN - 1); + ocount -= count; + } + } + + assert_eq!( + uncompressed, + out.as_slice(), + "Byte-stream doesn't decompress to the expected data" + ); +}