Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2024 05 04 Experimental regex support (No rush to merge, proof of concept/feasibility discussion) #65

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions puremagic/magic_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,12 @@
["57505542", 10, ".mp3", "audio/mpeg", "MPEG-1 Audio Layer 3 (MP3) ID3v2.4.0 audio file"],
["57585858", 10, ".mp3", "audio/mpeg", "MPEG-1 Audio Layer 3 (MP3) ID3v2.4.0 audio file"],
["544147", -128, ".mp3", "audio/mpeg", "MPEG-1 Audio Layer 3 (MP3) ID3v2.4.0 audio file"]
]
],
"504b030414000600" : [
["776f72642f646f63756d656e742e786d6c", 3000, ".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "###REGEX### MS Office Open XML Format Word Document"],
["786c2f776f726b626f6f6b2e786d6c", 3000, ".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "###REGEX### Microsoft Office 2007+ Open XML Format Excel Document file"],
["786c2f76626150726f6a6563742e62696e", 0, ".xlsm", "application/vnd.ms-excel.sheet.macroEnabled.12","###REGEX### Microsoft Excel - Macro-Enabled Workbook"]
]
},
"footers": [
["54525545564953494f4e2d5846494c452e00", -18, ".tga", "image/tga", "Truevision Targa Graphic file"],
Expand Down Expand Up @@ -447,7 +452,7 @@
["464f524d", 0, ".aiff", "audio/aiff", "Audio Interchange File"],
["2e524d46", 0, ".rmvb", "", "RealMedia streaming media"],
[
"504b0304", 0, ".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"504b030414000600", 0, ".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"MS Office Open XML Format Document"
],
[
Expand Down Expand Up @@ -496,7 +501,7 @@
"Microsoft PowerPoint - Macro-Enabled Template File"
],
[
"504b0304", 0, ".xlsm", "application/vnd.ms-excel.sheet.macroEnabled.12",
"504b030414000600", 0, ".xlsm", "application/vnd.ms-excel.sheet.macroEnabled.12",
"Microsoft Excel - Macro-Enabled Workbook"
],
["7a626578", 0, ".info", "", "ZoomBrowser Image Index"],
Expand Down
52 changes: 35 additions & 17 deletions puremagic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,33 +159,51 @@ def _identify_all(header: bytes, footer: bytes, ext=None) -> List[PureMagicWithC
for matched in matches:
if matched.byte_match in multi_part_dict:
for magic_row in multi_part_dict[matched.byte_match]:
start = magic_row.offset
end = magic_row.offset + len(magic_row.byte_match)
if magic_row.offset < 0:
match_area = footer[start:end] if end != 0 else footer[start:]
if match_area == magic_row.byte_match:
if "###REGEX###" in magic_row.name:
import re

if not magic_row.offset == 0:
scan_bytes = header[0 : magic_row.offset]
else:
scan_bytes = header
Comment on lines +165 to +168
Copy link
Contributor

@cclauss cclauss May 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if not magic_row.offset == 0:
scan_bytes = header[0 : magic_row.offset]
else:
scan_bytes = header
scan_bytes = header[0 : magic_row.offset] if magic_row.offset else header

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for that, my Python skills are like me, ugly but functional. 🙂

This PR will likely be closed for the reasons discussed above, but chances are that some of it will come back in one form or another in V2.0.

if re.search(magic_row.byte_match, scan_bytes):
new_matches.add(
PureMagic(
byte_match=matched.byte_match + magic_row.byte_match,
offset=magic_row.offset,
extension=magic_row.extension,
mime_type=magic_row.mime_type,
name=magic_row.name,
name=magic_row.name.split("###REGEX### ")[1],
)
)
else:
if end > len(header):
continue
if header[start:end] == magic_row.byte_match:
new_matches.add(
PureMagic(
byte_match=header[matched.offset : end],
offset=magic_row.offset,
extension=magic_row.extension,
mime_type=magic_row.mime_type,
name=magic_row.name,
start = magic_row.offset
end = magic_row.offset + len(magic_row.byte_match)
if magic_row.offset < 0:
match_area = footer[start:end] if end != 0 else footer[start:]
if match_area == magic_row.byte_match:
new_matches.add(
PureMagic(
byte_match=matched.byte_match + magic_row.byte_match,
offset=magic_row.offset,
extension=magic_row.extension,
mime_type=magic_row.mime_type,
name=magic_row.name,
)
)
else:
if end > len(header):
continue
if header[start:end] == magic_row.byte_match:
new_matches.add(
PureMagic(
byte_match=header[matched.offset : end],
offset=magic_row.offset,
extension=magic_row.extension,
mime_type=magic_row.mime_type,
name=magic_row.name,
)
)
)

matches.extend(list(new_matches))
return _confidence(matches, ext)
Expand Down