Skip to content

Commit

Permalink
demozoo: last fixes and improvements to the scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
avivace committed Dec 25, 2024
1 parent 7c9dca0 commit 3730d3a
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 65 deletions.
3 changes: 2 additions & 1 deletion scrapers/py_importers/demozoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def scrape_page(slug, url, platform):
video,
date=release_date,
repository=source,
url=demozoo_url,
url=url,
url2=demozoo_url,
)


Expand Down
78 changes: 78 additions & 0 deletions scrapers/py_importers/dupe-check-against-dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import hashlib
import json
import argparse

"""
Run like this
python dupe-check-against-dict.py /home/avivace/<USER>/database/scrapers/py_importers/py_common/beta /home/<USER>/gbdev/database/scripts/hashes.json
This requires the 'hashes.json' file, which is generated by running scripts/dupe-finder.py once.
"""


def get_file_hash(filename, alg="md5", chunksize=131072):
    """Return the hexadecimal digest of a file's contents.

    The file is read in chunks of ``chunksize`` bytes so that large files
    are never loaded into memory all at once.

    Args:
        filename: Path of the file to hash.
        alg: Name of a hash algorithm accepted by ``hashlib.new`` (for
            example "md5", "sha1" or "sha256"). Variable-length digests
            such as the SHAKE family are not supported because
            ``hexdigest()`` is called without a length.
        chunksize: Number of bytes to read per iteration.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``alg`` is not an algorithm hashlib knows about.
        OSError: If the file cannot be opened or read.
    """
    # Fail early with a clear error for an unknown algorithm, rather than
    # reaching the read loop with no hash object bound.
    try:
        h = hashlib.new(alg)
    except ValueError as err:
        raise ValueError(f"Unsupported hash algorithm: {alg!r}") from err

    # buffering=0 gives a raw file object; chunking is done explicitly here.
    with open(filename, "rb", buffering=0) as f:
        # iter() with a sentinel stops at the first empty read (EOF).
        for b in iter(lambda: f.read(chunksize), b""):
            h.update(b)
    return h.hexdigest()


def find_rom_files(folder):
    """Collect the paths of all .gb and .gbc files found under ``folder``.

    The directory tree is traversed with ``os.walk``, so files directly in
    ``folder`` and in any nested subfolder are included. Suffix matching is
    case-sensitive.

    Args:
        folder: Root directory to search.

    Returns:
        A list of paths, each one the walked directory joined with the
        file name.
    """
    rom_suffixes = (".gb", ".gbc")
    return [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(folder)
        for name in filenames
        if name.endswith(rom_suffixes)
    ]


def check_md5_against_json(rom_files, json_path):
    """Report every ROM file whose MD5 checksum is a key of the JSON file.

    The JSON file is loaded and used as a mapping keyed by checksum. For
    each ROM whose checksum appears as a key, a "[FOUND]" line is printed
    with the ROM path and the value stored under that checksum.

    Args:
        rom_files: Iterable of paths to the ROM files to check.
        json_path: Path to the JSON file whose top-level keys are checksums.

    Returns:
        None. Results are written to standard output.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        md5_dict = json.load(f)

    for rom_file in rom_files:
        # get_file_hash defaults to MD5.
        md5_checksum = get_file_hash(rom_file)
        # Test membership on the dict itself: a constant-time hash lookup,
        # instead of scanning a list copy of the keys for every ROM.
        if md5_checksum in md5_dict:
            print(
                f"[FOUND] {rom_file} has a known MD5 checksum: {md5_dict[md5_checksum]}"
            )


if __name__ == "__main__":
    # Command-line entry point: takes a folder to scan and a JSON file of
    # known checksums, then reports the ROMs whose checksum is already known.
    cli = argparse.ArgumentParser(
        description="Check MD5 checksums of ROM files against a JSON file."
    )
    cli.add_argument(
        "folder_path",
        type=str,
        help="Path to folder A containing subfolders with ROM files.",
    )
    cli.add_argument(
        "json_file_path",
        type=str,
        help="Path to the JSON file containing MD5 checksums.",
    )
    options = cli.parse_args()

    # Gather candidate ROMs first; the JSON file is only read if any exist.
    discovered = find_rom_files(options.folder_path)

    if discovered:
        check_md5_against_json(discovered, options.json_file_path)
    else:
        print("No .gb or .gbc files found.")
28 changes: 25 additions & 3 deletions scrapers/py_importers/py_common/Production.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,28 @@
class Production:
def __init__(self, title, slug, developer, platform, typetag, screenshots, files,
lic="", assetLicense="", description="", video="", date="", tags=[], alias="", repository="", gameWebsite="", devWebsite="", onlineplay="",
wip="", url=""):
def __init__(
self,
title,
slug,
developer,
platform,
typetag,
screenshots,
files,
lic="",
assetLicense="",
description="",
video="",
date="",
tags=[],
alias="",
repository="",
gameWebsite="",
devWebsite="",
onlineplay="",
wip="",
url="",
url2="",
):
# mandatory fields
self.title = title
self.slug = slug
Expand All @@ -26,3 +47,4 @@ def __init__(self, title, slug, developer, platform, typetag, screenshots, files
self.wip = wip if wip else ""

self.url = url
self.url2 = url2
Loading

0 comments on commit 3730d3a

Please sign in to comment.