From b53dbae27d87f5f1bfac7f07a7c97da73950620c Mon Sep 17 00:00:00 2001 From: Jon Crussell Date: Fri, 22 Aug 2025 09:28:46 -0700 Subject: [PATCH 1/4] WIP: add support for MSI Basic scaffolding, running into a few issues. --- python/unblob/handlers/__init__.py | 2 + python/unblob/handlers/archive/msi.py | 71 +++++++++++++++++++++++++++ python/unblob/processing.py | 4 +- 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 python/unblob/handlers/archive/msi.py diff --git a/python/unblob/handlers/__init__.py b/python/unblob/handlers/__init__.py index 98a0accba6..ab37cf859b 100644 --- a/python/unblob/handlers/__init__.py +++ b/python/unblob/handlers/__init__.py @@ -6,6 +6,7 @@ cab, cpio, dmg, + msi, partclone, rar, sevenzip, @@ -88,6 +89,7 @@ arc.ARCHandler, arj.ARJHandler, cab.CABHandler, + msi.MsiHandler, tar.TarUstarHandler, tar.TarUnixHandler, cpio.PortableASCIIHandler, diff --git a/python/unblob/handlers/archive/msi.py b/python/unblob/handlers/archive/msi.py new file mode 100644 index 0000000000..d5b5c6ac5a --- /dev/null +++ b/python/unblob/handlers/archive/msi.py @@ -0,0 +1,71 @@ +"""MSI Handler + +Extracts uses 7z for now. Could migrate to fully implementation: + + https://github.com/nightlark/pymsi +""" + +from typing import Optional +import io + +import pymsi +from structlog import get_logger + +from unblob.extractors import Command + +from ...models import ( + File, + Handler, + HandlerDoc, + HandlerType, + HexString, + Reference, + ValidChunk, +) + +logger = get_logger() + + +class MsiHandler(Handler): + NAME = "msi" + + PATTERNS = [ + HexString("D0 CF 11 E0 A1 B1 1A E1") + ] + EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}") + + DOC = HandlerDoc( + name="MSI", + description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.", + handler_type=HandlerType.ARCHIVE, + vendor="Microsoft", + references=[ + Reference( + title="MSI File Format Documentation", + url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer", + ) + ], + limitations=[], + ) + + def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]: + file.seek(start_offset, io.SEEK_SET) + + try: + # TODO: pymsi wants a path or BytesIO + buf = io.BytesIO() + buf.write(file[:]) + buf.seek(0) + + package = pymsi.Package(buf) + msi = pymsi.Msi(package, True) + except Exception: + return None + + # MSI moves the file pointer + msi_end_offset = buf.tell() + + return ValidChunk( + start_offset = start_offset, + end_offset = msi_end_offset, + ) diff --git a/python/unblob/processing.py b/python/unblob/processing.py index 64ebf210fd..4c21cfeddd 100644 --- a/python/unblob/processing.py +++ b/python/unblob/processing.py @@ -54,7 +54,9 @@ DEFAULT_PROCESS_NUM = multiprocessing.cpu_count() DEFAULT_SKIP_MAGIC = ( "BFLT", - "Composite Document File V2 Document", + # TODO: Need to disable this for MSI but does it need to be enabled for + # other types of Composite Documents? + #"Composite Document File V2 Document", "Erlang BEAM file", "GIF", "GNU message catalog", From 297d798dae0d0fe0f4c38acbe8b922923dd0b1f1 Mon Sep 17 00:00:00 2001 From: Jon Crussell Date: Thu, 28 Aug 2025 20:51:07 -0700 Subject: [PATCH 2/4] chore(deps): add python-msi dependency --- pyproject.toml | 3 ++- uv.lock | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b31a179c3e..56f5c2ed78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,13 +13,14 @@ dependencies = [ "jefferson>=0.4.5", "lark>=1.1.8", "lief>=0.16.1", - "lz4>=4.3.2,!=4.4.3", # 4.4.3 doesn't have aarch64 wheels https://github.com/python-lz4/python-lz4/pull/298 + "lz4>=4.3.2,!=4.4.3", # 4.4.3 doesn't have aarch64 wheels https://github.com/python-lz4/python-lz4/pull/298 "plotext>=4.2.0,<6.0", "pluggy>=1.3.0", "pyfatfs>=1.0.5", "pymdown-extensions>=10.15", "pyperscan>=0.3.0", "python-magic>=0.4.27", + "python-msi>=0.0.0a2", "pyzstd", "rarfile>=4.1", "rich>=13.3.5", diff --git a/uv.lock b/uv.lock index e4fa3db8bc..e60933ea25 100644 --- a/uv.lock +++ b/uv.lock @@ -990,6 +990,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] +[[package]] +name = "olefile" +version = "0.47" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/1b/077b508e3e500e1629d366249c3ccb32f95e50258b231705c09e3c7a4366/olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c", size = 112240, upload-time = "2023-12-01T16:22:53.025Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/d3/b64c356a907242d719fc668b71befd73324e47ab46c8ebbbede252c154b2/olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f", size = 114565, upload-time = "2023-12-01T16:22:51.518Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -1296,6 +1305,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840, upload-time = "2022-06-07T20:16:57.763Z" }, ] +[[package]] +name = "python-msi" +version = "0.0.0a2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "olefile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/b1/3e84b65ab7d8586953b8f4677993786f9150a121fad3685d2cb192bd87b3/python_msi-0.0.0a2.tar.gz", hash = "sha256:232789fa8614627c7800cfd7aafc76381fd0c7d8e0809031531672ed655d807f", size = 55828, upload-time = "2025-06-21T03:12:37.441Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/f5/dc9ee69d4dfd7615624ebdba22d719db345a78454559d2a5374f40d16727/python_msi-0.0.0a2-py3-none-any.whl", hash = "sha256:73177bf2022014016b2dc89a5f07847f39cef9ca88787ceab37187e5a8e8eead", size = 47776, upload-time = "2025-06-21T03:12:35.949Z" }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -1677,6 +1698,7 @@ dependencies = [ { name = "pymdown-extensions" }, { name = "pyperscan" }, { name = "python-magic" }, + { name = "python-msi" }, { name = "pyzstd" }, { name = "rarfile" }, { name = "rich" }, @@ -1722,6 +1744,7 @@ requires-dist = [ { name = "pymdown-extensions", specifier = ">=10.15" }, { name = "pyperscan", specifier = ">=0.3.0" }, { name = "python-magic", specifier = ">=0.4.27" }, + { name = "python-msi", specifier = ">=0.0.0a2" }, { name = "pyzstd" }, { name = "rarfile", specifier = ">=4.1" }, { name = "rich", specifier = ">=13.3.5" }, From 4e0a7b3faae8694e06a70b48b31fabaf0b37a5e1 Mon Sep 17 00:00:00 2001 From: Jon Crussell Date: Thu, 28 Aug 2025 20:52:57 -0700 Subject: [PATCH 3/4] feat(handler): clean up msi handler Requires this PR to (almost) work properly: https://github.com/nightlark/pymsi/pull/81 --- python/unblob/handlers/archive/msi.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/python/unblob/handlers/archive/msi.py b/python/unblob/handlers/archive/msi.py index d5b5c6ac5a..9abc925698 100644 --- a/python/unblob/handlers/archive/msi.py +++ b/python/unblob/handlers/archive/msi.py @@ -1,6 +1,6 @@ """MSI Handler -Extracts uses 7z for now. Could migrate to fully implementation: +Extracts uses 7z for now. Could migrate to a fully Python-based implementation: https://github.com/nightlark/pymsi """ @@ -51,21 +51,13 @@ class MsiHandler(Handler): def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]: file.seek(start_offset, io.SEEK_SET) - try: - # TODO: pymsi wants a path or BytesIO - buf = io.BytesIO() - buf.write(file[:]) - buf.seek(0) - - package = pymsi.Package(buf) - msi = pymsi.Msi(package, True) - except Exception: - return None + package = pymsi.Package(file) + msi = pymsi.Msi(package, False) # MSI moves the file pointer - msi_end_offset = buf.tell() + msi_end_offset = file.tell() return ValidChunk( - start_offset = start_offset, - end_offset = msi_end_offset, + start_offset = start_offset, + end_offset = msi_end_offset, ) From ceb5e7415656c79278e4cccaffd6a506cd904ccd Mon Sep 17 00:00:00 2001 From: Jon Crussell Date: Tue, 2 Sep 2025 14:34:15 -0700 Subject: [PATCH 4/4] fix(bug): compute the msi length from header Don't assume that pymsi will actually read the entire file. --- python/unblob/handlers/archive/msi.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/unblob/handlers/archive/msi.py b/python/unblob/handlers/archive/msi.py index 9abc925698..792647b2a4 100644 --- a/python/unblob/handlers/archive/msi.py +++ b/python/unblob/handlers/archive/msi.py @@ -54,8 +54,10 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] package = pymsi.Package(file) msi = pymsi.Msi(package, False) - # MSI moves the file pointer - msi_end_offset = file.tell() + # multiply the number of sectors by the sector size, plus 512 for header + msi_size = (msi.package.ole.nb_sect * msi.package.ole.sector_size) + 512 + + msi_end_offset = start_offset + msi_size return ValidChunk( start_offset = start_offset,