From 2ff815463d89da790a5e50218fd7b7c4c8389dfd Mon Sep 17 00:00:00 2001 From: Brandon Bunce <67491262+brandonbunce@users.noreply.github.com> Date: Tue, 7 Nov 2023 09:30:46 -0800 Subject: [PATCH] Removed globalization pretty code is happy code. also completely broken. --- main.py | 223 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 122 insertions(+), 101 deletions(-) diff --git a/main.py b/main.py index 4c0fa1f..9846394 100644 --- a/main.py +++ b/main.py @@ -7,17 +7,121 @@ import shutil import magic from pymediainfo import MediaInfo -from tkinter.filedialog import askdirectory import tkinter as tk from tkinter import Tk +import tkinter.filedialog import hashlib from pathlib import Path print("Starting InstagramDownloadMetadataFixer by Brandon Bunce") -# Specify the directory we want to search in. -Tk().withdraw() -search_directory = askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)") +def tk_update_status(string_input): + tk_status = tk.Label(master = tk_root, text = string_input) + tk_status.pack() + tk_root.update() + +def idmf_check_working_directory(): + # Make sure our working directory is set up. + if not os.path.exists("output/photos"): + os.makedirs("output/photos") + if not os.path.exists("output/video"): + os.makedirs("output/video") + if not os.path.exists("output/audio"): + os.makedirs("output/audio") + +def idmf_delete_duplicates(target_dir): + # Listing out all the files + list_of_files = os.walk(target_dir) + + # In order to detect the duplicate files, define an empty dictionary. + unique_files = dict() + + for root, folders, files in list_of_files: + for file in files: + file_path = Path(os.path.join(root, file)) + # Converting all the content of our file into an md5 hash. + Hash_file = hashlib.md5(open(file_path, 'rb').read()).hexdigest() + if Hash_file not in unique_files: + unique_files[Hash_file] = file_path + else: + # If the hash already exists, compare file sizes to ensure they are identical + existing_file_path = unique_files[Hash_file] + if os.path.getsize(file_path) == os.path.getsize(existing_file_path): + os.remove(file_path) + print(f"{file_path} has been deleted as a duplicate of {existing_file_path}") + else: + print(f"{file_path} has the same hash but different size, not deleting.") + +def idmf_save_from_source(html_file, target_dir, image_list, dates_list): + print("Saving media from "+str(html_file)) + for i in range(len(image_list)): + if os.path.exists("source/"+image_list[i]): + # For whatever reason, Instagram will randomly store media without a file extension, + # so, we must determine the file type before we pass it to other libraries. + file_type = magic.from_file("source/"+image_list[i], mime=True) + #print(str(i)+ " - " +str(file_type) +" - "+str(imagereferences[i])) + if file_type == "image/jpeg": + image = Image.open("source/"+image_list[i]).convert("RGB") + image.save(str(target_dir)+"photos/"+dates_list[i]+".jpg", "jpeg") + if file_type == "video/mp4": + # Maybe we can do some transcoding here at some point, but highly unneeded + # since Instagram compresses well. + mp4info = MediaInfo.parse("source/"+image_list[i]) + hasVideo = False + hasAudio = False + for track in mp4info.tracks: + if track.track_type == "Video": + hasVideo = True + if track.track_type == "Audio": + hasAudio = True + if hasAudio and hasVideo: + # MP4 contains a video. + shutil.copy("source/"+image_list[i], str(target_dir)+"video/"+dates_list[i]+".mp4") + elif hasAudio: + # MP4 is just an audio clip. + shutil.copy("source/"+imagereferences[i], str(target_dir)+"audio/"+dates_list[i]+".mp4") + else: + print("A reference exists to a file ("+str(imagereferences[i])+") that doesn't exist. Did you extract all media properly?") + +def idmf_correct_image_dates(image_dates): + date_counts = Counter(imagedates) + corrected_image_dates = [] + + for date in imagedates: + if date_counts[date] > 1: + #Append 00X based on the occurrence + occurrence = date_counts[date] - 1 + new_date = f"{date}_{occurrence:03d}" + date_counts[date] -= 1 + else: + new_date = date + + corrected_image_dates.append(new_date) + return corrected_image_dates + +def idmf_find_files_in_target(target_dir): + # Recursively search for matching files + matching_files = [] + file_pattern = "message_*.html" + for root, dirnames, filenames in os.walk(target_dir): + for filename in fnmatch.filter(filenames, file_pattern): + matching_files.append(os.path.join(root, filename)) + return matching_files + +def idmf_parse_html_files(target_dir): + matching_files = idmf_find_files_in_target(target_dir) + for file_path in matching_files: + # For every file that matches our set file pattern... + with open(file_path, 'r', encoding='utf-8') as html_file: + source_code = html_file.read() + #print("Parsing: "+file_path) + #tk_update_status(file_path) + htmlParser.feed(source_code) + + # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem + corrected_image_dates = idmf_correct_image_dates() + idmf_save_from_source(html_file, "output/", imagereferences, corrected_image_dates) + print("Finished parsing file with ("+str(len(imagereferences))+") unique files.") # Define what we will do when searching thru HTML files class MyHTMLParser(HTMLParser): @@ -66,27 +170,23 @@ def handle_data(self, data): if data == "Group photo": # If we are about to encounter the preview image for the group chat, ignore it in the start tags. isgroupphoto = True - htmlParser = MyHTMLParser() -# Define the pattern for matching HTML files with the name "message_x.html" -file_pattern = "message_*.html" - -# Initialize a list to store the matching file paths -matching_files = [] +# Specify the directory we want to search in. +tk_root = tk.Tk() +tk_root.title("InstagramDownloadMetadataFixer by Brandon Bunce") +tk_root.geometry("800x400") +tk_status = tk.Label(master = tk_root, text = "") +tk_status.pack() +search_directory = tkinter.filedialog.askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)") +#tk_root.mainloop() +#tk_root.withdraw() # Make sure our working directory is set up. -if not os.path.exists("output/photos"): - os.makedirs("output/photos") -if not os.path.exists("output/video"): - os.makedirs("output/video") -if not os.path.exists("output/audio"): - os.makedirs("output/audio") - -# Recursively search for matching files -for root, dirnames, filenames in os.walk(search_directory): - for filename in fnmatch.filter(filenames, file_pattern): - matching_files.append(os.path.join(root, filename)) +idmf_check_working_directory() + +# Parse all avaialbe html files +idmf_parse_html_files(search_directory) # Initialize these lists here to collect data for each HTML file imagereferences = [] @@ -96,86 +196,7 @@ def handle_data(self, data): imageinmessagecount = 0 countdowntodate = 0 -# Print the list of matching file paths -for file_path in matching_files: - with open(file_path, 'r', encoding='utf-8') as html_file: - source_code = html_file.read() - print("Parsing: "+file_path) - htmlParser.feed(source_code) - - # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem - date_counts = Counter(imagedates) - corrected_image_dates = [] - - for date in imagedates: - if date_counts[date] > 1: - #Append 00X based on the occurrence - occurrence = date_counts[date] - 1 - new_date = f"{date}_{occurrence:03d}" - date_counts[date] -= 1 - else: - new_date = date - - corrected_image_dates.append(new_date) - - # Open all the media and rename them to their corresponding formatted date in array. - print("Saving media...") - for i in range(len(imagereferences)): - if os.path.exists("source/"+imagereferences[i]): - # For whatever reason, Instagram will randomly store media without a file extension, - # so, we must determine the file type before we pass it to other libraries. - file_type = magic.from_file("source/"+imagereferences[i], mime=True) - #print(str(i)+ " - " +str(file_type) +" - "+str(imagereferences[i])) - if file_type == "image/jpeg": - image = Image.open("source/"+imagereferences[i]).convert("RGB") - image.save("output/photos/"+corrected_image_dates[i]+".jpg", "jpeg") - if file_type == "video/mp4": - # Maybe we can do some transcoding here at some point, but highly unneeded - # since Instagram compresses well. - mp4info = MediaInfo.parse("source/"+imagereferences[i]) - hasVideo = False - hasAudio = False - for track in mp4info.tracks: - if track.track_type == "Video": - hasVideo = True - if track.track_type == "Audio": - hasAudio = True - if hasAudio and hasVideo: - # MP4 contains a video. - shutil.copy("source/"+imagereferences[i], "output/video/"+corrected_image_dates[i]+".mp4") - elif hasAudio: - # MP4 is just an audio clip. - shutil.copy("source/"+imagereferences[i], "output/audio/"+corrected_image_dates[i]+".mp4") - else: - print("A reference exists to a file ("+str(imagereferences[i])+") that doesn't exist. Did you extract all media properly?") - print("Finished parsing file with ("+str(len(imagereferences))+") unique files.") - imagereferences.clear() - imagedates.clear() - corrected_image_dates.clear() - checkingfordate = False - countdowntodate = 0 - delete_duplicates = input("Would you like to delete any duplicate files? (Spammed memes from your group chats)\nThis works by hashing files to determine if they are unique.\nPlease input (Y/N)\n") if delete_duplicates.lower() == "y": print("Deleting duplicates...") - # Listing out all the files - list_of_files = os.walk("output") - - # In order to detect the duplicate files, define an empty dictionary. - unique_files = dict() - - for root, folders, files in list_of_files: - for file in files: - file_path = Path(os.path.join(root, file)) - # Converting all the content of our file into an md5 hash. - Hash_file = hashlib.md5(open(file_path, 'rb').read()).hexdigest() - if Hash_file not in unique_files: - unique_files[Hash_file] = file_path - else: - # If the hash already exists, compare file sizes to ensure they are identical - existing_file_path = unique_files[Hash_file] - if os.path.getsize(file_path) == os.path.getsize(existing_file_path): - os.remove(file_path) - print(f"{file_path} has been deleted as a duplicate of {existing_file_path}") - else: - print(f"{file_path} has the same hash but different size, not deleting.") \ No newline at end of file + idmf_delete_duplicates("output") \ No newline at end of file