From 2ff815463d89da790a5e50218fd7b7c4c8389dfd Mon Sep 17 00:00:00 2001
From: Brandon Bunce <67491262+brandonbunce@users.noreply.github.com>
Date: Tue, 7 Nov 2023 09:30:46 -0800
Subject: [PATCH] Removed globalization

pretty code is happy code. also completely broken.
---
 main.py | 223 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 122 insertions(+), 101 deletions(-)

diff --git a/main.py b/main.py
index 4c0fa1f..9846394 100644
--- a/main.py
+++ b/main.py
@@ -7,17 +7,121 @@
 import shutil
 import magic
 from pymediainfo import MediaInfo
-from tkinter.filedialog import askdirectory
 import tkinter as tk
 from tkinter import Tk
+import tkinter.filedialog
 import hashlib
 from pathlib import Path
 
 print("Starting InstagramDownloadMetadataFixer by Brandon Bunce")
 
-# Specify the directory we want to search in.
-Tk().withdraw()
-search_directory = askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)")
+def tk_update_status(string_input):
+    """Show string_input in the module-level tk_status label and redraw the window."""
+    tk_status.config(text = string_input)
+    tk_root.update()
+
+def idmf_check_working_directory():
+    """Make sure the output folders exist in the current working directory.
+
+    Creates output/photos, output/video and output/audio when missing.
+    """
+    for media_kind in ("photos", "video", "audio"):
+        # exist_ok replaces the separate os.path.exists() check before each makedirs().
+        os.makedirs("output/" + media_kind, exist_ok=True)
+
+def idmf_delete_duplicates(target_dir):
+    """Delete files under target_dir whose content duplicates an earlier file.
+
+    Files are compared by MD5 digest; the first path seen for a digest is kept.
+    """
+    unique_files = dict()  # content digest -> first path found with that content
+    for root, folders, files in os.walk(target_dir):
+        for file in files:
+            file_path = Path(os.path.join(root, file))
+            # Use a context manager so the handle is closed before we may delete the file.
+            with open(file_path, 'rb') as media_file:
+                Hash_file = hashlib.md5(media_file.read()).hexdigest()
+            if Hash_file not in unique_files:
+                unique_files[Hash_file] = file_path
+                continue
+            # Same digest: compare file sizes as an extra guard before deleting anything.
+            existing_file_path = unique_files[Hash_file]
+            if os.path.getsize(file_path) == os.path.getsize(existing_file_path):
+                os.remove(file_path)
+                print(f"{file_path} has been deleted as a duplicate of {existing_file_path}")
+            else:
+                print(f"{file_path} has the same hash but different size, not deleting.")
+
+def idmf_save_from_source(html_file, target_dir, image_list, dates_list):
+    """Copy each referenced media file from source/ into target_dir, named by date.
+
+    image_list[i] is saved under the name dates_list[i]. target_dir is used as a
+    plain string prefix, so it must end with a path separator (e.g. "output/").
+    """
+    print("Saving media from "+str(html_file))
+    for i, media_name in enumerate(image_list):
+        source_path = "source/"+media_name
+        if not os.path.exists(source_path):
+            print("A reference exists to a file ("+str(media_name)+") that doesn't exist. Did you extract all media properly?")
+            continue
+        # For whatever reason, Instagram will randomly store media without a file extension,
+        # so, we must determine the file type before we pass it to other libraries.
+        file_type = magic.from_file(source_path, mime=True)
+        if file_type == "image/jpeg":
+            image = Image.open(source_path).convert("RGB")
+            image.save(str(target_dir)+"photos/"+dates_list[i]+".jpg", "jpeg")
+        if file_type == "video/mp4":
+            # Maybe we can do some transcoding here at some point, but highly unneeded
+            # since Instagram compresses well.
+            track_types = {track.track_type for track in MediaInfo.parse(source_path).tracks}
+            hasVideo = "Video" in track_types
+            hasAudio = "Audio" in track_types
+            if hasAudio and hasVideo:
+                # MP4 contains a video.
+                shutil.copy(source_path, str(target_dir)+"video/"+dates_list[i]+".mp4")
+            elif hasAudio:
+                # MP4 is just an audio clip.
+                shutil.copy(source_path, str(target_dir)+"audio/"+dates_list[i]+".mp4")
+
+def idmf_correct_image_dates(image_dates):
+    """Return image_dates with a _NNN suffix added to repeated values.
+
+    The last occurrence of a repeated date keeps its plain name.
+    """
+    date_counts = Counter(image_dates)
+    corrected_image_dates = []
+    for date in image_dates:
+        if date_counts[date] > 1:
+            # Append 00X based on how many occurrences are still to come.
+            corrected_image_dates.append(f"{date}_{date_counts[date] - 1:03d}")
+            date_counts[date] -= 1
+        else:
+            corrected_image_dates.append(date)
+    return corrected_image_dates
+
+def idmf_find_files_in_target(target_dir):
+    """Return paths of every message_*.html file found anywhere under target_dir."""
+    file_pattern = "message_*.html"
+    return [
+        os.path.join(root, filename)
+        for root, dirnames, filenames in os.walk(target_dir)
+        for filename in fnmatch.filter(filenames, file_pattern)
+    ]
+
+def idmf_parse_html_files(target_dir):
+    """Parse every message_*.html under target_dir and save the media it references."""
+    for file_path in idmf_find_files_in_target(target_dir):
+        with open(file_path, 'r', encoding='utf-8') as html_file:
+            source_code = html_file.read()
+        htmlParser.feed(source_code)
+        # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem
+        corrected_image_dates = idmf_correct_image_dates(imagedates)
+        idmf_save_from_source(file_path, "output/", imagereferences, corrected_image_dates)
+        print("Finished parsing file with ("+str(len(imagereferences))+") unique files.")
+        # Reset the shared lists so the next file does not re-save this file's media.
+        imagereferences.clear()
+        imagedates.clear()
+        # NOTE(review): the pre-refactor loop also reset checkingfordate/countdowntodate here - confirm if still needed.
 
 # Define what we will do when searching thru HTML files
 class MyHTMLParser(HTMLParser):
@@ -66,27 +170,23 @@ def handle_data(self, data):
         if data == "Group photo":
             # If we are about to encounter the preview image for the group chat, ignore it in the start tags.
             isgroupphoto = True                
-
 htmlParser = MyHTMLParser()
 
-# Define the pattern for matching HTML files with the name "message_x.html"
-file_pattern = "message_*.html"
-
-# Initialize a list to store the matching file paths
-matching_files = []
+# Build the main window and its status label, then ask which directory we want to search in.
+tk_root = tk.Tk()
+tk_root.title("InstagramDownloadMetadataFixer by Brandon Bunce")
+tk_root.geometry("800x400")
+tk_status = tk.Label(master = tk_root, text = "")
+tk_status.pack()
+search_directory = tkinter.filedialog.askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)")
+#tk_root.mainloop()
+#tk_root.withdraw()
 
 # Make sure our working directory is set up.
-if not os.path.exists("output/photos"):
-    os.makedirs("output/photos")
-if not os.path.exists("output/video"):
-    os.makedirs("output/video")
-if not os.path.exists("output/audio"):
-    os.makedirs("output/audio")
-
-# Recursively search for matching files
-for root, dirnames, filenames in os.walk(search_directory):
-    for filename in fnmatch.filter(filenames, file_pattern):
-        matching_files.append(os.path.join(root, filename))
+idmf_check_working_directory()
+
+# Parse all available html files. NOTE(review): this runs before the imagereferences list below is initialized - confirm ordering.
+idmf_parse_html_files(search_directory)
 
 # Initialize these lists here to collect data for each HTML file
 imagereferences = []
@@ -96,86 +196,7 @@ def handle_data(self, data):
 imageinmessagecount = 0
 countdowntodate = 0
 
-# Print the list of matching file paths
-for file_path in matching_files:
-    with open(file_path, 'r', encoding='utf-8') as html_file:
-        source_code = html_file.read()
-        print("Parsing: "+file_path)
-        htmlParser.feed(source_code)
-
-        # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem
-        date_counts = Counter(imagedates)
-        corrected_image_dates = []
-
-        for date in imagedates:
-            if date_counts[date] > 1:
-                #Append 00X based on the occurrence
-                occurrence = date_counts[date] - 1
-                new_date = f"{date}_{occurrence:03d}"
-                date_counts[date] -= 1
-            else:
-                new_date = date
-
-            corrected_image_dates.append(new_date)
-
-        # Open all the media and rename them to their corresponding formatted date in array.
-        print("Saving media...")
-        for i in range(len(imagereferences)):
-            if os.path.exists("source/"+imagereferences[i]):
-                # For whatever reason, Instagram will randomly store media without a file extension,
-                # so, we must determine the file type before we pass it to other libraries.
-                file_type = magic.from_file("source/"+imagereferences[i], mime=True)
-                #print(str(i)+ " - " +str(file_type) +" - "+str(imagereferences[i]))
-                if file_type == "image/jpeg":
-                    image = Image.open("source/"+imagereferences[i]).convert("RGB")
-                    image.save("output/photos/"+corrected_image_dates[i]+".jpg", "jpeg")
-                if file_type == "video/mp4":
-                    # Maybe we can do some transcoding here at some point, but highly unneeded
-                    # since Instagram compresses well.
-                    mp4info = MediaInfo.parse("source/"+imagereferences[i])
-                    hasVideo = False
-                    hasAudio = False
-                    for track in mp4info.tracks:
-                        if track.track_type == "Video":
-                            hasVideo = True
-                        if track.track_type == "Audio":
-                            hasAudio = True
-                    if hasAudio and hasVideo:
-                        # MP4 contains a video.
-                        shutil.copy("source/"+imagereferences[i], "output/video/"+corrected_image_dates[i]+".mp4")
-                    elif hasAudio:
-                        # MP4 is just an audio clip.
-                        shutil.copy("source/"+imagereferences[i], "output/audio/"+corrected_image_dates[i]+".mp4")
-            else:
-                print("A reference exists to a file ("+str(imagereferences[i])+") that doesn't exist. Did you extract all media properly?")
-    print("Finished parsing file with ("+str(len(imagereferences))+") unique files.")
-    imagereferences.clear()
-    imagedates.clear()
-    corrected_image_dates.clear()
-    checkingfordate = False
-    countdowntodate = 0
-
 delete_duplicates = input("Would you like to delete any duplicate files? (Spammed memes from your group chats)\nThis works by hashing files to determine if they are unique.\nPlease input (Y/N)\n")
 if delete_duplicates.lower() == "y":
     print("Deleting duplicates...")
-    # Listing out all the files
-    list_of_files = os.walk("output")
-  
-    # In order to detect the duplicate files, define an empty dictionary.
-    unique_files = dict()
-  
-    for root, folders, files in list_of_files:
-        for file in files:
-            file_path = Path(os.path.join(root, file))
-            # Converting all the content of our file into an md5 hash.
-            Hash_file = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
-            if Hash_file not in unique_files:
-                unique_files[Hash_file] = file_path
-            else:
-                # If the hash already exists, compare file sizes to ensure they are identical
-                existing_file_path = unique_files[Hash_file]
-                if os.path.getsize(file_path) == os.path.getsize(existing_file_path):
-                    os.remove(file_path)
-                    print(f"{file_path} has been deleted as a duplicate of {existing_file_path}")
-                else:
-                    print(f"{file_path} has the same hash but different size, not deleting.")
\ No newline at end of file
+    idmf_delete_duplicates("output")
\ No newline at end of file