
Commit

Removed globalization
pretty code is happy code. also completely broken.
brandonbunce committed Nov 7, 2023
1 parent 2101134 commit 2ff8154
Showing 1 changed file with 122 additions and 101 deletions.
223 changes: 122 additions & 101 deletions main.py
@@ -7,17 +7,121 @@
import shutil
import magic
from pymediainfo import MediaInfo
from tkinter.filedialog import askdirectory
import tkinter as tk
from tkinter import Tk
import tkinter.filedialog
import hashlib
from pathlib import Path

print("Starting InstagramDownloadMetadataFixer by Brandon Bunce")

# Specify the directory we want to search in.
Tk().withdraw()
search_directory = askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)")
def tk_update_status(string_input):
    tk_status = tk.Label(master=tk_root, text=string_input)
    tk_status.pack()
    tk_root.update()

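# Editorial sketch, not part of this commit: each call to tk_update_status
# packs a brand-new Label, so the window grows by one row per status update.
# Assuming tk_status stays a single module-level widget, reusing it avoids
# that:
#
#     def tk_update_status(string_input):
#         tk_status.config(text=string_input)
#         tk_root.update()
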
def idmf_check_working_directory():
    # Make sure our working directory is set up.
    if not os.path.exists("output/photos"):
        os.makedirs("output/photos")
    if not os.path.exists("output/video"):
        os.makedirs("output/video")
    if not os.path.exists("output/audio"):
        os.makedirs("output/audio")

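# Editorial sketch, not part of this commit: os.makedirs accepts
# exist_ok=True (Python 3.2+), which silently succeeds when the directory
# already exists and would fold the three checks above into one loop:
#
#     for sub in ("photos", "video", "audio"):
#         os.makedirs(os.path.join("output", sub), exist_ok=True)
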
def idmf_delete_duplicates(target_dir):
    # List out all the files.
    list_of_files = os.walk(target_dir)

    # In order to detect duplicate files, define an empty dictionary.
    unique_files = dict()

    for root, folders, files in list_of_files:
        for file in files:
            file_path = Path(os.path.join(root, file))
            # Convert the entire content of the file into an MD5 hash.
            Hash_file = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
            if Hash_file not in unique_files:
                unique_files[Hash_file] = file_path
            else:
                # If the hash already exists, compare file sizes to ensure they are identical.
                existing_file_path = unique_files[Hash_file]
                if os.path.getsize(file_path) == os.path.getsize(existing_file_path):
                    os.remove(file_path)
                    print(f"{file_path} has been deleted as a duplicate of {existing_file_path}")
                else:
                    print(f"{file_path} has the same hash but different size, not deleting.")

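# Editorial sketch: the duplicate check above keys a dict on the MD5 of the
# full file contents, and identical bytes always hash to the same digest.
# The follow-up size comparison is only a sanity check against the (remote)
# possibility of a hash collision. The path below is illustrative:
#
#     import hashlib
#     from pathlib import Path
#     digest = hashlib.md5(Path("output/photos/example.jpg").read_bytes()).hexdigest()
#     # same bytes -> same digest -> flagged as a duplicate
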
def idmf_save_from_source(html_file, target_dir, image_list, dates_list):
    print("Saving media from " + str(html_file))
    for i in range(len(image_list)):
        if os.path.exists("source/" + image_list[i]):
            # For whatever reason, Instagram will randomly store media without a file extension,
            # so we must determine the file type before we pass it to other libraries.
            file_type = magic.from_file("source/" + image_list[i], mime=True)
            #print(str(i) + " - " + str(file_type) + " - " + str(image_list[i]))
            if file_type == "image/jpeg":
                image = Image.open("source/" + image_list[i]).convert("RGB")
                image.save(str(target_dir) + "photos/" + dates_list[i] + ".jpg", "jpeg")
            if file_type == "video/mp4":
                # Maybe we can do some transcoding here at some point, but it is hardly needed
                # since Instagram compresses well.
                mp4info = MediaInfo.parse("source/" + image_list[i])
                hasVideo = False
                hasAudio = False
                for track in mp4info.tracks:
                    if track.track_type == "Video":
                        hasVideo = True
                    if track.track_type == "Audio":
                        hasAudio = True
                if hasAudio and hasVideo:
                    # The MP4 contains a video track.
                    shutil.copy("source/" + image_list[i], str(target_dir) + "video/" + dates_list[i] + ".mp4")
                elif hasAudio:
                    # The MP4 is just an audio clip.
                    shutil.copy("source/" + image_list[i], str(target_dir) + "audio/" + dates_list[i] + ".mp4")
        else:
            print("A reference exists to a file (" + str(image_list[i]) + ") that doesn't exist. Did you extract all media properly?")

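# Usage sketch (editorial; the reference names and dates below are
# illustrative, not from the commit). Instagram stores some media without a
# file extension, so the function above sniffs the MIME type with magic
# before routing each file to photos/, video/, or audio/:
#
#     refs = ["4f2a1b", "9c8d7e"]                   # extension-less files under source/
#     dates = ["2023-11-07_001", "2023-11-07_000"]  # one formatted date per reference
#     idmf_save_from_source("message_1.html", "output/", refs, dates)
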
def idmf_correct_image_dates(image_dates):
    date_counts = Counter(image_dates)
    corrected_image_dates = []

    for date in image_dates:
        if date_counts[date] > 1:
            # Append _00X based on the occurrence.
            occurrence = date_counts[date] - 1
            new_date = f"{date}_{occurrence:03d}"
            date_counts[date] -= 1
        else:
            new_date = date

        corrected_image_dates.append(new_date)
    return corrected_image_dates

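# Worked example (editorial): entries sharing one timestamp come back with
# distinct suffixes, counting down from the number of duplicates, so later
# files cannot overwrite earlier ones:
#
#     >>> idmf_correct_image_dates(["2023-11-07", "2023-11-07", "2023-11-07"])
#     ['2023-11-07_002', '2023-11-07_001', '2023-11-07']
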
def idmf_find_files_in_target(target_dir):
    # Recursively search for matching files.
    matching_files = []
    file_pattern = "message_*.html"
    for root, dirnames, filenames in os.walk(target_dir):
        for filename in fnmatch.filter(filenames, file_pattern):
            matching_files.append(os.path.join(root, filename))
    return matching_files

def idmf_parse_html_files(target_dir):
    matching_files = idmf_find_files_in_target(target_dir)
    for file_path in matching_files:
        # For every file that matches our set file pattern...
        with open(file_path, 'r', encoding='utf-8') as html_file:
            source_code = html_file.read()
            #print("Parsing: " + file_path)
            #tk_update_status(file_path)
            htmlParser.feed(source_code)

        # Make sure that repeated filenames are corrected so they will not cause issues with the filesystem.
        corrected_image_dates = idmf_correct_image_dates(imagedates)
        idmf_save_from_source(file_path, "output/", imagereferences, corrected_image_dates)
        print("Finished parsing file with (" + str(len(imagereferences)) + ") unique files.")

# Define what we will do when searching thru HTML files
class MyHTMLParser(HTMLParser):
@@ -66,27 +170,23 @@ def handle_data(self, data):
        if data == "Group photo":
            # If we are about to encounter the preview image for the group chat, ignore it in the start tags.
            isgroupphoto = True

htmlParser = MyHTMLParser()

# Define the pattern for matching HTML files with the name "message_x.html"
file_pattern = "message_*.html"

# Initialize a list to store the matching file paths
matching_files = []
# Specify the directory we want to search in.
tk_root = tk.Tk()
tk_root.title("InstagramDownloadMetadataFixer by Brandon Bunce")
tk_root.geometry("800x400")
tk_status = tk.Label(master=tk_root, text="")
tk_status.pack()
search_directory = tkinter.filedialog.askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)")
#tk_root.mainloop()
#tk_root.withdraw()

# Make sure our working directory is set up.
if not os.path.exists("output/photos"):
    os.makedirs("output/photos")
if not os.path.exists("output/video"):
    os.makedirs("output/video")
if not os.path.exists("output/audio"):
    os.makedirs("output/audio")

# Recursively search for matching files
for root, dirnames, filenames in os.walk(search_directory):
    for filename in fnmatch.filter(filenames, file_pattern):
        matching_files.append(os.path.join(root, filename))
idmf_check_working_directory()

# Parse all available HTML files
idmf_parse_html_files(search_directory)

# Initialize these lists here to collect data for each HTML file
imagereferences = []
@@ -96,86 +196,7 @@ def handle_data(self, data):
imageinmessagecount = 0
countdowntodate = 0

# Print the list of matching file paths
for file_path in matching_files:
    with open(file_path, 'r', encoding='utf-8') as html_file:
        source_code = html_file.read()
        print("Parsing: " + file_path)
        htmlParser.feed(source_code)

    # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem
    date_counts = Counter(imagedates)
    corrected_image_dates = []

    for date in imagedates:
        if date_counts[date] > 1:
            # Append 00X based on the occurrence
            occurrence = date_counts[date] - 1
            new_date = f"{date}_{occurrence:03d}"
            date_counts[date] -= 1
        else:
            new_date = date

        corrected_image_dates.append(new_date)

    # Open all the media and rename them to their corresponding formatted date in the array.
    print("Saving media...")
    for i in range(len(imagereferences)):
        if os.path.exists("source/" + imagereferences[i]):
            # For whatever reason, Instagram will randomly store media without a file extension,
            # so we must determine the file type before we pass it to other libraries.
            file_type = magic.from_file("source/" + imagereferences[i], mime=True)
            #print(str(i) + " - " + str(file_type) + " - " + str(imagereferences[i]))
            if file_type == "image/jpeg":
                image = Image.open("source/" + imagereferences[i]).convert("RGB")
                image.save("output/photos/" + corrected_image_dates[i] + ".jpg", "jpeg")
            if file_type == "video/mp4":
                # Maybe we can do some transcoding here at some point, but highly unneeded
                # since Instagram compresses well.
                mp4info = MediaInfo.parse("source/" + imagereferences[i])
                hasVideo = False
                hasAudio = False
                for track in mp4info.tracks:
                    if track.track_type == "Video":
                        hasVideo = True
                    if track.track_type == "Audio":
                        hasAudio = True
                if hasAudio and hasVideo:
                    # MP4 contains a video.
                    shutil.copy("source/" + imagereferences[i], "output/video/" + corrected_image_dates[i] + ".mp4")
                elif hasAudio:
                    # MP4 is just an audio clip.
                    shutil.copy("source/" + imagereferences[i], "output/audio/" + corrected_image_dates[i] + ".mp4")
        else:
            print("A reference exists to a file (" + str(imagereferences[i]) + ") that doesn't exist. Did you extract all media properly?")
    print("Finished parsing file with (" + str(len(imagereferences)) + ") unique files.")
    imagereferences.clear()
    imagedates.clear()
    corrected_image_dates.clear()
    checkingfordate = False
    countdowntodate = 0

delete_duplicates = input("Would you like to delete any duplicate files? (Spammed memes from your group chats)\nThis works by hashing files to determine if they are unique.\nPlease input (Y/N)\n")
if delete_duplicates.lower() == "y":
    print("Deleting duplicates...")
    # Listing out all the files
    list_of_files = os.walk("output")

    # In order to detect the duplicate files, define an empty dictionary.
    unique_files = dict()

    for root, folders, files in list_of_files:
        for file in files:
            file_path = Path(os.path.join(root, file))
            # Converting all the content of our file into an md5 hash.
            Hash_file = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
            if Hash_file not in unique_files:
                unique_files[Hash_file] = file_path
            else:
                # If the hash already exists, compare file sizes to ensure they are identical
                existing_file_path = unique_files[Hash_file]
                if os.path.getsize(file_path) == os.path.getsize(existing_file_path):
                    os.remove(file_path)
                    print(f"{file_path} has been deleted as a duplicate of {existing_file_path}")
                else:
                    print(f"{file_path} has the same hash but different size, not deleting.")
idmf_delete_duplicates("output")
