From 4d11e6e24f5312dbac12248506298a8e552f882c Mon Sep 17 00:00:00 2001 From: Brandon Bunce <67491262+brandonbunce@users.noreply.github.com> Date: Wed, 8 Nov 2023 22:15:33 -0800 Subject: [PATCH] Update main.py Simplified functions, cleaned up variables, and added some stats for space savings when removing duplicates. --- main.py | 85 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/main.py b/main.py index 9846394..99baf96 100644 --- a/main.py +++ b/main.py @@ -8,7 +8,6 @@ import magic from pymediainfo import MediaInfo import tkinter as tk -from tkinter import Tk import tkinter.filedialog import hashlib from pathlib import Path @@ -35,6 +34,8 @@ def idmf_delete_duplicates(target_dir): # In order to detect the duplicate files, define an empty dictionary. unique_files = dict() + + total_space_saved_in_bytes = 0 for root, folders, files in list_of_files: for file in files: @@ -49,17 +50,21 @@ def idmf_delete_duplicates(target_dir): if os.path.getsize(file_path) == os.path.getsize(existing_file_path): os.remove(file_path) print(f"{file_path} has been deleted as a duplicate of {existing_file_path}") + duplicate_stats = os.stat(existing_file_path) + total_space_saved_in_bytes += duplicate_stats.st_size + else: print(f"{file_path} has the same hash but different size, not deleting.") + print("Successfully removed duplicates, saving "+str(total_space_saved_in_bytes)+" bytes of data.") -def idmf_save_from_source(html_file, target_dir, image_list, dates_list): - print("Saving media from "+str(html_file)) +def idmf_save_media(html_file_name, target_dir, image_list, dates_list): + print("Saving media from "+str(html_file_name)) for i in range(len(image_list)): if os.path.exists("source/"+image_list[i]): # For whatever reason, Instagram will randomly store media without a file extension, # so, we must determine the file type before we pass it to other libraries. file_type = magic.from_file("source/"+image_list[i], mime=True) - #print(str(i)+ " - " +str(file_type) +" - "+str(imagereferences[i])) + #print(str(i)+ " - " +str(file_type) +" - "+str(media_links[i])) if file_type == "image/jpeg": image = Image.open("source/"+image_list[i]).convert("RGB") image.save(str(target_dir)+"photos/"+dates_list[i]+".jpg", "jpeg") @@ -79,15 +84,17 @@ def idmf_save_from_source(html_file, target_dir, image_list, dates_list): shutil.copy("source/"+image_list[i], str(target_dir)+"video/"+dates_list[i]+".mp4") elif hasAudio: # MP4 is just an audio clip. - shutil.copy("source/"+imagereferences[i], str(target_dir)+"audio/"+dates_list[i]+".mp4") + shutil.copy("source/"+image_list[i], str(target_dir)+"audio/"+dates_list[i]+".mp4") else: - print("A reference exists to a file ("+str(imagereferences[i])+") that doesn't exist. Did you extract all media properly?") + print("A reference exists to a file ("+str(image_list[i])+") that doesn't exist. Did you extract all media properly?") -def idmf_correct_image_dates(image_dates): - date_counts = Counter(imagedates) - corrected_image_dates = [] +# In this function, we will transform the date from what it is on the html file into +# a proper format we can use for the name. +def idmf_correct_media_dates(media_dates_list): + date_counts = Counter(media_dates_list) + corrected_image_dates_list = [] - for date in imagedates: + for date in media_dates_list: if date_counts[date] > 1: #Append 00X based on the occurrence occurrence = date_counts[date] - 1 @@ -96,38 +103,48 @@ def idmf_correct_image_dates(image_dates): else: new_date = date - corrected_image_dates.append(new_date) - return corrected_image_dates + corrected_image_dates_list.append(new_date) + return corrected_image_dates_list + +def idmf_parse_html_files(target_dir): + global media_dates, media_links + -def idmf_find_files_in_target(target_dir): # Recursively search for matching files matching_files = [] file_pattern = "message_*.html" for root, dirnames, filenames in os.walk(target_dir): for filename in fnmatch.filter(filenames, file_pattern): matching_files.append(os.path.join(root, filename)) - return matching_files -def idmf_parse_html_files(target_dir): - matching_files = idmf_find_files_in_target(target_dir) for file_path in matching_files: # For every file that matches our set file pattern... with open(file_path, 'r', encoding='utf-8') as html_file: source_code = html_file.read() - #print("Parsing: "+file_path) + print("Parsing: "+file_path) #tk_update_status(file_path) htmlParser.feed(source_code) # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem - corrected_image_dates = idmf_correct_image_dates() - idmf_save_from_source(html_file, "output/", imagereferences, corrected_image_dates) - print("Finished parsing file with ("+str(len(imagereferences))+") unique files.") + corrected_image_dates = idmf_correct_media_dates(media_dates_list=media_dates) + idmf_save_media(file_path, "output/", media_links, corrected_image_dates) + print("Finished parsing file with ("+str(len(media_links))+") unique files.") + + +# Global variables we use to keep track of collected data. +media_links = [] +media_dates = [] +checkingfordate = False +ignore_because_group_photo = False +object_media_total = 0 +countdowntodate = 0 # Define what we will do when searching thru HTML files class MyHTMLParser(HTMLParser): + # When we observe a start tag (div) def handle_starttag(self, tag, attrs): - global checkingfordate, countdowntodate, imageinmessagecount, isgroupphoto + global checkingfordate, countdowntodate, object_media_total, ignore_because_group_photo #print("Encountered a start tag:", tag) for attr, value in attrs: #print(" attr:", attr) @@ -136,40 +153,40 @@ def handle_starttag(self, tag, attrs): if attr == "src" and value != "files/Instagram-Logo.png": #print("Extracted image: "+value) # Record image reference - if isgroupphoto: - isgroupphoto = False + if ignore_because_group_photo: + ignore_because_group_photo = False print("Ignoring "+value+" because it is the icon for the chat.") if value == "": print("A reference exists to nothing! Good job Instagram.") else: - imagereferences.append(value) + media_links.append(value) countdowntodate = 0 - imageinmessagecount += 1 + object_media_total += 1 checkingfordate = True if tag == "div" and checkingfordate: countdowntodate += 1 #print(countdowntodate) def handle_data(self, data): - global checkingfordate, countdowntodate, imageinmessagecount, isgroupphoto + global checkingfordate, countdowntodate, object_media_total, ignore_because_group_photo #print("Encountered some data :", data) if countdowntodate == 1 and checkingfordate: - for i in range(imageinmessagecount): + for i in range(object_media_total): # For every image we find in a message block, # convert the extracted date to the format we want # (eg. Sep 30, 2022, 9:32PM TO 20220930_213200) time_object = time.strptime(data, '%b %d, %Y, %I:%M %p') formatted_time_object = time.strftime('%Y%m%d_%H%M%S', time_object) - imagedates.append(formatted_time_object) + media_dates.append(formatted_time_object) #print("Added Timestamp: "+ formatted_time_object) # Because we found the date for the media, start looking for the next media. checkingfordate = False countdowntodate = 0 - imageinmessagecount = 0 + object_media_total = 0 if data == "Group photo": # If we are about to encounter the preview image for the group chat, ignore it in the start tags. - isgroupphoto = True + ignore_because_group_photo = True htmlParser = MyHTMLParser() # Specify the directory we want to search in. @@ -188,14 +205,6 @@ def handle_data(self, data): # Parse all avaialbe html files idmf_parse_html_files(search_directory) -# Initialize these lists here to collect data for each HTML file -imagereferences = [] -imagedates = [] -checkingfordate = False -isgroupphoto = False -imageinmessagecount = 0 -countdowntodate = 0 - delete_duplicates = input("Would you like to delete any duplicate files? (Spammed memes from your group chats)\nThis works by hashing files to determine if they are unique.\nPlease input (Y/N)\n") if delete_duplicates.lower() == "y": print("Deleting duplicates...")