Update main.py

+ Added support for video and audio clips. + The Instagram root directory prompt is now GUI-based. + Now accounts for Instagram omitting file extensions from its exported media. + Sorts organized media into folders (video/audio/photos). + Lets you remove duplicate media using a hashing algorithm (MD5). + Accounts for group chat icon photos and for HTML files in which Instagram references empty (nonexistent) media.
brandonbunce · Nov 7, 2023 · 2101134 · 2101134
1 parent 0042d14
commit 2101134
Showing 1 changed file with 121 additions and 35 deletions.
diff --git a/main.py b/main.py
@@ -3,66 +3,86 @@
 import time
 import fnmatch
 import os
+from collections import Counter
+import shutil
+import magic
+from pymediainfo import MediaInfo
+from tkinter.filedialog import askdirectory
+import tkinter as tk
+from tkinter import Tk
+import hashlib
+from pathlib import Path
 
 print("Starting InstagramDownloadMetadataFixer by Brandon Bunce")
 
-val = input("Please input Instagram download root folder (should contain comments, files, message, and index.html\n")
-#im = Image.open(val).convert("RGB")
-#im.show()
-#im.save()
+# Specify the directory we want to search in.
+Tk().withdraw()
+search_directory = askdirectory(title="Select Instagram Root Directory (should contain comments/files/messages)")
 
 # Define what we will do when searching thru HTML files
 class MyHTMLParser(HTMLParser):
+    # When we observe a start tag (div)
     def handle_starttag(self, tag, attrs):
-        global checkingfordate, countdowntodate, imageinmessagecount
+        global checkingfordate, countdowntodate, imageinmessagecount, isgroupphoto
         #print("Encountered a start tag:", tag)
         for attr, value in attrs:
             #print("     attr:", attr)
             #print("         value:", value)
-            if tag == "img":
+            if tag == "img" or tag == "video" or tag == "audio":
                 if attr == "src" and value != "files/Instagram-Logo.png":
                     #print("Extracted image: "+value)
                     # Record image reference
-                    imagereferences.append(value)
-                    countdowntodate = 0
-                    imageinmessagecount += 1
-                    checkingfordate = True
+                    if isgroupphoto:
+                        isgroupphoto = False
+                        print("Ignoring "+value+" because it is the icon for the chat.")
+                    if value == "":
+                        print("A reference exists to nothing! Good job Instagram.")
+                    else:
+                        imagereferences.append(value)
+                        countdowntodate = 0
+                        imageinmessagecount += 1
+                        checkingfordate = True
             if tag == "div" and checkingfordate:
                 countdowntodate += 1
                 #print(countdowntodate)
 
     def handle_data(self, data):
-        global checkingfordate, countdowntodate, imageinmessagecount
+        global checkingfordate, countdowntodate, imageinmessagecount, isgroupphoto
         #print("Encountered some data  :", data)
         if countdowntodate == 1 and checkingfordate:
             for i in range(imageinmessagecount):
-                # Convert extracted date to the format we want (eg. Sep 30, 2022, 9:32PM TO 20220930_213200)
-                time_obj = time.strptime(data, '%b %d, %Y, %I:%M %p')
-                formatted_time_object = time.strftime('%Y%m%d_%H%M%S', time_obj)
-                if (imageinmessagecount > 1):
-                    imagedates.append(formatted_time_object +"_00"+str(i))
-                    # 20230918_101032_002
-                else:
-                    imagedates.append(formatted_time_object)
-
+                # For every image we find in a message block, 
+                # convert  the extracted date to the format we want 
+                # (eg. Sep 30, 2022, 9:32PM TO 20220930_213200)
+                time_object = time.strptime(data, '%b %d, %Y, %I:%M %p')
+                formatted_time_object = time.strftime('%Y%m%d_%H%M%S', time_object)
+                imagedates.append(formatted_time_object)
                 #print("Added Timestamp: "+ formatted_time_object)
+
+            # Because we found the date for the media, start looking for the next media.
             checkingfordate = False
             countdowntodate = 0
             imageinmessagecount = 0
-
+        if data == "Group photo":
+            # If we are about to encounter the preview image for the group chat, ignore it in the start tags.
+            isgroupphoto = True                
 
 htmlParser = MyHTMLParser()
 
-
-# Specify the directory you want to search in
-search_directory = r'C:\Users\Donut\Desktop\InstagramDownloadMetadataFixer\source'
-
 # Define the pattern for matching HTML files with the name "message_x.html"
 file_pattern = "message_*.html"
 
 # Initialize a list to store the matching file paths
 matching_files = []
 
+# Make sure our working directory is set up.
+if not os.path.exists("output/photos"):
+    os.makedirs("output/photos")
+if not os.path.exists("output/video"):
+    os.makedirs("output/video")
+if not os.path.exists("output/audio"):
+    os.makedirs("output/audio")
+
 # Recursively search for matching files
 for root, dirnames, filenames in os.walk(search_directory):
     for filename in fnmatch.filter(filenames, file_pattern):
@@ -72,6 +92,7 @@ def handle_data(self, data):
 imagereferences = []
 imagedates = []
 checkingfordate = False
+isgroupphoto = False
 imageinmessagecount = 0
 countdowntodate = 0
 
@@ -82,14 +103,79 @@ def handle_data(self, data):
         print("Parsing: "+file_path)
         htmlParser.feed(source_code)
 
-        # Open all the images and rename them to their corresponding formatted date in array.
-        print("Saving images...")
+        # Make sure that multiple instances of filenames are corrected so they will not cause issues with the filesystem
+        date_counts = Counter(imagedates)
+        corrected_image_dates = []
+
+        for date in imagedates:
+            if date_counts[date] > 1:
+                #Append 00X based on the occurrence
+                occurrence = date_counts[date] - 1
+                new_date = f"{date}_{occurrence:03d}"
+                date_counts[date] -= 1
+            else:
+                new_date = date
+
+            corrected_image_dates.append(new_date)
+
+        # Open all the media and rename them to their corresponding formatted date in array.
+        print("Saving media...")
         for i in range(len(imagereferences)):
-            im = Image.open("source/"+imagereferences[i]).convert("RGB")
-            im.save("output/"+imagedates[i]+".png", "png")
-
-        print("Finished parsing file with ("+str(len(imagereferences))+") unique image(s).")
-        imagereferences.clear()
-        imagedates.clear()
-        checkingfordate = False
-        countdowntodate = 0
+            if os.path.exists("source/"+imagereferences[i]):
+                # For whatever reason, Instagram will randomly store media without a file extension,
+                # so, we must determine the file type before we pass it to other libraries.
+                file_type = magic.from_file("source/"+imagereferences[i], mime=True)
+                #print(str(i)+ " - " +str(file_type) +" - "+str(imagereferences[i]))
+                if file_type == "image/jpeg":
+                    image = Image.open("source/"+imagereferences[i]).convert("RGB")
+                    image.save("output/photos/"+corrected_image_dates[i]+".jpg", "jpeg")
+                if file_type == "video/mp4":
+                    # Maybe we can do some transcoding here at some point, but highly unneeded
+                    # since Instagram compresses well.
+                    mp4info = MediaInfo.parse("source/"+imagereferences[i])
+                    hasVideo = False
+                    hasAudio = False
+                    for track in mp4info.tracks:
+                        if track.track_type == "Video":
+                            hasVideo = True
+                        if track.track_type == "Audio":
+                            hasAudio = True
+                    if hasAudio and hasVideo:
+                        # MP4 contains a video.
+                        shutil.copy("source/"+imagereferences[i], "output/video/"+corrected_image_dates[i]+".mp4")
+                    elif hasAudio:
+                        # MP4 is just an audio clip.
+                        shutil.copy("source/"+imagereferences[i], "output/audio/"+corrected_image_dates[i]+".mp4")
+            else:
+                print("A reference exists to a file ("+str(imagereferences[i])+") that doesn't exist. Did you extract all media properly?")
+    print("Finished parsing file with ("+str(len(imagereferences))+") unique files.")
+    imagereferences.clear()
+    imagedates.clear()
+    corrected_image_dates.clear()
+    checkingfordate = False
+    countdowntodate = 0
+
+delete_duplicates = input("Would you like to delete any duplicate files? (Spammed memes from your group chats)\nThis works by hashing files to determine if they are unique.\nPlease input (Y/N)\n")
+if delete_duplicates.lower() == "y":
+    print("Deleting duplicates...")
+    # Listing out all the files
+    list_of_files = os.walk("output")
+
+    # In order to detect the duplicate files, define an empty dictionary.
+    unique_files = dict()
+
+    for root, folders, files in list_of_files:
+        for file in files:
+            file_path = Path(os.path.join(root, file))
+            # Converting all the content of our file into an md5 hash.
+            Hash_file = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
+            if Hash_file not in unique_files:
+                unique_files[Hash_file] = file_path
+            else:
+                # If the hash already exists, compare file sizes to ensure they are identical
+                existing_file_path = unique_files[Hash_file]
+                if os.path.getsize(file_path) == os.path.getsize(existing_file_path):
+                    os.remove(file_path)
+                    print(f"{file_path} has been deleted as a duplicate of {existing_file_path}")
+                else:
+                    print(f"{file_path} has the same hash but different size, not deleting.")