diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..16d4fc1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.pyc +*.pyd +*.temp +build/ +*.egg-info +temp_* +*.aprof diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..39068b2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,43 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Test: ACBEdit", + "type": "debugpy", + "request": "launch", + "module": "Tests.test_ACBEdit", + "justMyCode": false + }, + { + "name": "Test: USMBuild", + "type": "debugpy", + "request": "launch", + "module": "Tests.test_USMBuild", + "justMyCode": false + }, + { + "name": "Test: USMDecode", + "type": "debugpy", + "request": "launch", + "module": "Tests.test_USMDecode", + "justMyCode": false + }, + { + "name": "Test: CPK Unpack", + "type": "debugpy", + "request": "launch", + "module": "Tests.test_CPKUnpack", + "justMyCode": false + }, + { + "name": "Test: CPK Build", + "type": "debugpy", + "request": "launch", + "module": "Tests.test_CPKBuild", + "justMyCode": false + }, + ] +} \ No newline at end of file diff --git a/CriCodecs/hca.h b/CriCodecs/hca.h index efb59f2..45e648a 100644 --- a/CriCodecs/hca.h +++ b/CriCodecs/hca.h @@ -78,7 +78,7 @@ void clHCA_ReadSamples16(clHCA *, signed short *outSamples); /* Sets a 64 bit encryption key, to properly decode blocks. This may be called * multiple times to change the key, before or after clHCA_DecodeHeader. * Key is ignored if the file is not encrypted. */ -void setkey(clHCA *, unsigned long long keycode); +void clHCA_SetKey(clHCA *, unsigned long long keycode); /* Tests a single frame for validity, mainly to test if current key is correct. * Returns <0 on incorrect block (wrong key), 0 on silent block (not useful to determine) @@ -95,4 +95,4 @@ void clHCA_DecodeReset(clHCA * hca); } #endif -#endif \ No newline at end of file +#endif diff --git a/PyCriCodecs/__init__.py b/PyCriCodecs/__init__.py index 7379d95..90da358 100644 --- a/PyCriCodecs/__init__.py +++ b/PyCriCodecs/__init__.py @@ -3,7 +3,6 @@ from .chunk import * from .cpk import CPK, CPKBuilder from .usm import USM, USMBuilder -from .utf import UTF, UTFBuilder +from .utf import UTF, UTFBuilder, UTFViewer from .acb import ACB, ACBBuilder from .awb import AWB, AWBBuilder -from .ivf import IVF \ No newline at end of file diff --git a/PyCriCodecs/acb.py b/PyCriCodecs/acb.py index 8307729..0609b34 100644 --- a/PyCriCodecs/acb.py +++ b/PyCriCodecs/acb.py @@ -1,180 +1,100 @@ from struct import iter_unpack +from typing import BinaryIO, List +from io import BytesIO from .chunk import * -from .utf import UTF, UTFBuilder +from .utf import UTF, UTFBuilder, UTFViewer from .awb import AWB, AWBBuilder from .hca import HCA +from copy import deepcopy import os -# TODO revamp the whole ACB class. ACB is a lot more complex with those @UTF tables. -class ACB(UTF): - """ An ACB is basically a giant @UTF table. Use this class to extract any ACB. """ - __slots__ = ["filename", "payload", "filename", "awb"] - payload: list - filename: str - awb: AWB +# Credit: +# - github.com/vgmstream/vgmstream which is why this is possible at all +# - Original work by https://github.com/Youjose/PyCriCodecs +# See Research/ACBSchema.py for more details. 
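+#
+# The classes below are typed views over raw @UTF table rows: each annotated
+# attribute names a column of the corresponding table, and UTFViewer resolves
+# the attribute against the parsed row payload. A minimal sketch of the idea,
+# using a hypothetical table "Foo" with a single "Bar" column:
+#
+#     class FooTable(UTFViewer):
+#         Bar: int
+#
+#     row = FooTable(payload_dict)  # payload_dict: one parsed @UTF row
+#     print(row.Bar)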
+
+class CueNameTable(UTFViewer):
+    CueIndex: int
+    CueName: str
+
+
+class CueTable(UTFViewer):
+    CueId: int
+    ReferenceIndex: int
+    ReferenceType: int
+
+
+class SequenceTable(UTFViewer):
+    TrackIndex: bytes
+    Type: int
+
+
+class SynthTable(UTFViewer):
+    ReferenceItems: bytes
+
+
+class TrackEventTable(UTFViewer):
+    Command: bytes
+
+
+class TrackTable(UTFViewer):
+    EventIndex: int
+
+
+class WaveformTable(UTFViewer):
+    EncodeType: int
+    MemoryAwbId: int
+    NumChannels: int
+    NumSamples: int
+    SamplingRate: int
+    Streaming: int
+
+
+class ACBTable(UTFViewer):
+    AcbGuid: bytes
+    Name: str
+    Version: int
+    VersionString: str
+
+    AwbFile: bytes
+    CueNameTable: List[CueNameTable]
+    CueTable: List[CueTable]
+    SequenceTable: List[SequenceTable]
+    SynthTable: List[SynthTable]
+    TrackEventTable: List[TrackEventTable]
+    TrackTable: List[TrackTable]
+    WaveformTable: List[WaveformTable]
+
+
+class ACB(UTF):
+    """An ACB is basically a giant @UTF table. Use this class to extract any ACB, and potentially modify it in place."""
     def __init__(self, filename) -> None:
-        self.payload = UTF(filename).get_payload()
-        self.filename = filename
-        self.acbparse(self.payload)
-        # TODO check on ACB version.
-
-    def acbparse(self, payload: list) -> None:
-        """ Recursively parse the payload. """
-        for dict in range(len(payload)):
-            for k, v in payload[dict].items():
-                if v[0] == UTFTypeValues.bytes:
-                    if v[1].startswith(UTFType.UTF.value): #or v[1].startswith(UTFType.EUTF.value): # ACB's never gets encrypted?
-                        par = UTF(v[1]).get_payload()
-                        payload[dict][k] = par
-                        self.acbparse(par)
-        self.load_awb()
-
-    def load_awb(self) -> None:
-        # There are two types of ACB's, one that has an AWB file inside it,
-        # and one with an AWB pair.
-        if self.payload[0]['AwbFile'][1] == b'':
-            if type(self.filename) == str:
-                awbObj = AWB(os.path.join(os.path.dirname(self.filename), self.payload[0]['Name'][1]+".awb"))
-            else:
-                awbObj = AWB(self.payload[0]['Name'][1]+".awb")
-        else:
-            awbObj = AWB(self.payload[0]['AwbFile'][1])
-        self.awb = awbObj
-
-    # revamping...
-    def exp_extract(self, decode: bool = False, key = 0):
-        # There are two types of ACB's, one that has an AWB file inside it,
-        # and one with an AWB pair. Or multiple AWB's.
-
-        # TODO Add multiple AWB loading.
-        if self.payload[0]['AwbFile'][1] == b'':
-            if type(self.filename) == str:
-                awbObj = AWB(os.path.join(os.path.dirname(self.filename), self.payload[0]['Name'][1]+".awb"))
-            else:
-                awbObj = AWB(self.payload[0]['Name'][1]+".awb")
-        else:
-            awbObj = AWB(self.payload[0]['AwbFile'][1])
-
-        pl = self.payload[0]
-        names = [] # Where all filenames will end up in.
-        # cuename > cue > block > sequence > track > track_event > command > synth > waveform
-        # seems to be the general way to do it, some may repeat, and some may go back to other tables.
-        # I will try to make this code go through all of them in advance.
-
-        """ Load Cue names and indexes. """
-        cue_names_and_indexes: list = []
-        for i in pl["CueNameTable"]:
-            cue_names_and_indexes.append((i["CueIndex"], i["CueName"]))
-        srt_names = sorted(cue_names_and_indexes, key=lambda x: x[0])
-
-        """ Go through all cues and match wavforms or names. """
-        for i in cue_names_and_indexes:
-
-            cue_Info = pl["CueTable"][i[0]]
-            ref_type = cue_Info["ReferenceType"][1]
-            wavform = pl["WaveformTable"][i[0]]
-
-            if ref_type == 1:
-                usememory: bool = wavform['Streaming'][1] == 0
-
-                if "Id" in wavform:
-                    wavform["MemoryAwbId"] = wavform["Id"] # Old ACB's use "Id", so we default it to the new MemoryAwbId slot.
-
-                if usememory:
-                    assert len(wavform['MemoryAwbId']) == len(srt_names) # Will error if not so. TODO add extracting without filenames references.
-                    names = [y[1][1] for _,y in sorted(zip([x[1] for x in pl["WaveformTable"]], srt_names), key=lambda z: z[0])]
-                    break # We break, since we did everything in the line above. I don't think ref_type changes between cues.
-
-                else:
-                    # TODO
-                    raise NotImplementedError("ACB needs multiple AWB's, not unsupported yet.")
-
-            elif ref_type == 2:
-                # TODO
-                raise NotImplementedError("Unsupported ReferenceType.")
-
-            elif ref_type == 3:
-                sequence = pl['SequenceTable'][i[0]]
-                track_type = sequence['Type'][1] # Unused but will leave it here if needed.
-                for tr_idx in iter_unpack(">H", sequence['TrackIndex'][1]):
-                    # TODO I am here currently.
-                    pass
-
-            elif ref_type == 8:
-                # TODO
-                raise NotImplementedError("Unsupported ReferenceType.")
-
-            else:
-                raise NotImplementedError("Unknown ReferenceType inside ACB.")
-
-    def parse_type1(self):
-        pass
-
-    def parse_type2(self):
-        pass
-
-    def parse_type3(self):
-        pass
-
-    def parse_type8(self):
-        pass
-
-    def parse_cues(self):
-        pass
-
-    def parse_synth(self):
-        pass
-
-    def parse_wavform(self):
-        pass
-
-    def parse_tracktable(self):
-        pass
-
-    def parse_commands(self):
-        pass
-
-    def parse_sequence(self):
-        pass
-
-    def extract(self, decode: bool = False, key: int = 0, dirname: str = ""):
-        """ Extracts audio files in an AWB/ACB without preserving filenames. """
-        if dirname:
-            os.makedirs(dirname, exist_ok=True)
-        filename = 0
-        for i in self.awb.getfiles():
-            Extension: str = self.get_extension(self.payload[0]['WaveformTable'][filename]['EncodeType'][1])
-            if decode and Extension == ".hca":
-                hca = HCA(i, key=key, subkey=self.awb.subkey).decode()
-                open(os.path.join(dirname, str(filename)+".wav"), "wb").write(hca)
-                filename += 1
-            else:
-                open(os.path.join(dirname, f"{filename}{Extension}"), "wb").write(i)
-                filename += 1
-
-    def get_extension(self, EncodeType: int) -> str:
-        if EncodeType == 0 or EncodeType == 3:
-            return ".adx" # Maybe 0 is ahx?
-        elif EncodeType == 2 or EncodeType == 6:
-            return ".hca"
-        elif EncodeType == 7 or EncodeType == 10:
-            return ".vag"
-        elif EncodeType == 8:
-            return ".at3"
-        elif EncodeType == 9:
-            return ".bcwav"
-        elif EncodeType == 11 or EncodeType == 18:
-            return ".at9"
-        elif EncodeType == 12:
-            return ".xma"
-        elif EncodeType == 13 or EncodeType == 4 or EncodeType == 5:
-            return ".dsp"
-        elif EncodeType == 19:
-            return ".m4a"
-        else:
-            return ""
-
-# TODO Have to finish correct ACB extracting first.
-class ACBBuilder(UTFBuilder):
-    pass
\ No newline at end of file
+        super().__init__(filename, recursive=True)
+
+    @property
+    def payload(self) -> dict:
+        """Retrieves the only top-level UTF table dict within the ACB file."""
+        return self.dictarray[0]
+
+    @property
+    def view(self) -> ACBTable:
+        """Returns a view of the ACB file, with all known tables mapped to their respective classes."""
+        return ACBTable(self.payload)
+
+    # TODO: Extraction routines
+    # See Research/ACBSchema.py. vgmstream presented 4 possible permutations of subsong retrieval.
+
+
+class ACBBuilder:
+    acb: ACB
+
+    def __init__(self, acb: ACB) -> None:
+        self.acb = acb
+
+    def build(self) -> bytes:
+        """Builds an ACB binary blob from the current ACB object.
+
+        The object may be modified in place before building, which will be reflected in the output binary.
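+
+        Illustrative usage (file names here are examples, not shipped fixtures):
+
+            acb = ACB("bgm.acb")
+            print(acb.view.Name)  # inspect fields through the typed view
+            with open("bgm_rebuilt.acb", "wb") as f:
+                f.write(ACBBuilder(acb).build())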
+ """ + payload = deepcopy(self.acb.dictarray) + binary = UTFBuilder(payload, encoding=self.acb.encoding, table_name=self.acb.table_name) + return binary.bytes() diff --git a/PyCriCodecs/adx.py b/PyCriCodecs/adx.py index e162dee..bd12694 100644 --- a/PyCriCodecs/adx.py +++ b/PyCriCodecs/adx.py @@ -4,11 +4,13 @@ class ADX: """ADX Module for decoding and encoding ADX files, pass the either `adx file` or `wav file` in bytes to either `decode` or `encode` respectively.""" # Decodes ADX to WAV. + @staticmethod def decode(data: bytes) -> bytes: """ Decodes ADX to WAV. """ return CriCodecs.AdxDecode(bytes(data)) # Encodes WAV to ADX. + @staticmethod def encode(data: bytes, BitDepth = 0x4, Blocksize = 0x12, Encoding = 3, AdxVersion = 0x4, Highpass_Frequency = 0x1F4, Filter = 0, force_not_looping = False) -> bytes: """ Encodes WAV to ADX. """ return CriCodecs.AdxEncode(bytes(data), BitDepth, Blocksize, Encoding, Highpass_Frequency, Filter, AdxVersion, force_not_looping) \ No newline at end of file diff --git a/PyCriCodecs/awb.py b/PyCriCodecs/awb.py index cc2f4ed..15cb673 100644 --- a/PyCriCodecs/awb.py +++ b/PyCriCodecs/awb.py @@ -7,8 +7,7 @@ # for AFS2 only. class AWB: - """ Use this class to return any AWB data with the getfiles function. """ - __slots__ = ["stream", "numfiles", "align", "subkey", "version", "ids", "ofs", "filename", "headersize", "id_alignment"] + """ Use this class to return any AWB data with the getfiles function. """ stream: BinaryIO numfiles: int align: int @@ -31,7 +30,7 @@ def __init__(self, stream) -> None: def readheader(self): # Reads header. - magic, self.version, offset_intsize, id_intsize, self.numfiles, self.align, self.subkey = AWBChunkHeader.unpack( + magic, self.version, offset_intsize, self.id_intsize, self.numfiles, self.align, self.subkey = AWBChunkHeader.unpack( self.stream.read(AWBChunkHeader.size) ) if magic != b'AFS2': @@ -40,54 +39,25 @@ def readheader(self): # Reads data in the header. self.ids = list() self.ofs = list() - for i in iter_unpack(f"<{self.stringtypes(id_intsize)}", self.stream.read(id_intsize*self.numfiles)): + for i in iter_unpack(f"<{self.stringtypes(self.id_intsize)}", self.stream.read(self.id_intsize*self.numfiles)): self.ids.append(i[0]) for i in iter_unpack(f"<{self.stringtypes(offset_intsize)}", self.stream.read(offset_intsize*(self.numfiles+1))): self.ofs.append(i[0] if i[0] % self.align == 0 else (i[0] + (self.align - (i[0] % self.align)))) # Seeks to files offset. - self.headersize = 16 + (offset_intsize*(self.numfiles+1)) + (id_intsize*self.numfiles) + self.headersize = 16 + (offset_intsize*(self.numfiles+1)) + (self.id_intsize*self.numfiles) if self.headersize % self.align != 0: self.headersize = self.headersize + (self.align - (self.headersize % self.align)) self.stream.seek(self.headersize, 0) - def extract(self, decode=False, key=0): - """ Extracts the files. """ - count = 0 - for i in self.getfiles(): - # Apparently AWB's can have many types of files, focusing on HCA's here though. So TODO. - if self.filename: - if i.startswith(HCAType.HCA.value) or i.startswith(HCAType.EHCA.value): - if decode: - filename = self.filename.rsplit(".", 1)[0] + "_" + str(count) + ".wav" - else: - filename = self.filename.rsplit(".", 1)[0] + "_" + str(count) + ".hca" - else: - # Probably ADX. 
- filename = self.filename.rsplit(".", 1)[0] + "_" + str(count) + ".dat" - open(filename, "wb").write(i) - count += 1 - continue - open(filename, "wb").write(i if not decode else HCA(i, key=key, subkey=self.subkey).decode()) - count += 1 - else: - if i.startswith(HCAType.HCA.value) or i.startswith(HCAType.EHCA.value): - if decode: - open(str(count)+".wav", "wb").write(HCA(i, key=key, subkey=self.subkey).decode()) - else: - open(str(count)+".hca", "wb").write(i) - else: - open(str(count)+".dat", "wb").write(i) - count += 1 - - def getfiles(self): - """ Generator function to yield data from an AWB. """ + def get_files(self): + """ Generator function to yield all data blobs from an AWB. """ for i in range(1, len(self.ofs)): data = self.stream.read((self.ofs[i]-self.ofs[i-1])) self.stream.seek(self.ofs[i], 0) yield data - def getfile_atindex(self, index): + def get_file_at(self, index): """ Gets you a file at specific index. """ index += 1 self.stream.seek(self.ofs[index], 0) @@ -108,36 +78,35 @@ def stringtypes(self, intsize: int) -> str: raise ValueError("Unknown int size.") class AWBBuilder: - """ Use this class to build any AWB of any kind given a directory with files. """ - __slots__ = ["dirname", "outfile", "version", "align", "subkey", "id_intsize"] - - def __init__(self, dirname: list[str], subkey: int = 0, version: int = 2, id_intsize = 0x2, align: int = 0x20) -> None: - if dirname == "": - raise ValueError("Invalid directory.") - elif version == 1 and subkey != 0: + def __init__(self, infiles: list[bytes], subkey: int = 0, version: int = 2, id_intsize = 0x2, align: int = 0x20) -> None: + if version == 1 and subkey != 0: raise ValueError("Cannot have a subkey with AWB version of 1.") elif id_intsize not in [0x2, 0x4, 0x8]: raise ValueError("id_intsize must be either 2, 4 or 8.") - self.dirname = dirname + self.infiles = infiles self.version = version self.align = align self.subkey = subkey self.id_intsize = id_intsize - - def build(self, outfile): - if outfile == "": - raise ValueError("Invalid output file name.") - if type(self.dirname) == list: - self.build_files(outfile) + + def stringtypes(self, intsize: int) -> str: + if intsize == 1: + return "B" # Probably impossible. + elif intsize == 2: + return "H" + elif intsize == 4: + return "I" + elif intsize == 8: + return "Q" else: - self.build_dir(outfile) - - def build_files(self, outfile: str): + raise ValueError("Unknown int size.") + + def build(self) -> bytes: size = 0 ofs = [] numfiles = 0 - for file in self.dirname: - sz = os.stat(file).st_size + for file in self.infiles: + sz = len(file) ofs.append(size+sz) size += sz numfiles += 1 @@ -172,76 +141,11 @@ def build_files(self, outfile: str): if headersize % self.align != 0: header = header.ljust(headersize + (self.align - (headersize % self.align)), b"\x00") - out = open(outfile, "wb") - out.write(header) - for idx, file in enumerate(self.dirname): - fl = open(file, "rb").read() - if len(fl) % self.align != 0 and idx != len(self.dirname) - 1: + outfile = BytesIO() + outfile.write(header) + for idx, file in enumerate(self.infiles): + fl = file + if len(fl) % self.align != 0 and idx != len(self.infiles) - 1: fl = fl.ljust(len(fl) + (self.align - (len(fl) % self.align)), b"\x00") - out.write(fl) - out.close() - - def build_dir(self, outfile: str): - size = 0 - ofs = [] - numfiles = 0 - for r, d, f in os.walk(self.dirname): - for file in f: - sz = os.stat(os.path.join(r, file)).st_size - if sz % self.align != 0: # Doesn't always needs to be this way? 
- sz = sz + (self.align - sz % self.align) - ofs.append(size+sz) - size += sz - numfiles += 1 - - if size > 0xFFFFFFFF: - intsize = 8 # Unsigned long long. - strtype = " str: - if intsize == 1: - return "B" # Probably impossible. - elif intsize == 2: - return "H" - elif intsize == 4: - return "I" - elif intsize == 8: - return "Q" - else: - raise ValueError("Unknown int size.") \ No newline at end of file + outfile.write(fl) + return outfile.getvalue() \ No newline at end of file diff --git a/PyCriCodecs/chunk.py b/PyCriCodecs/chunk.py index aab0cf8..3ecaaa3 100644 --- a/PyCriCodecs/chunk.py +++ b/PyCriCodecs/chunk.py @@ -10,6 +10,8 @@ WavSmplHeaderStruct = Struct("<4sIIIIIIIIIIIIIIII") # Supports only 1 looping point. WavNoteHeaderStruct = Struct("<4sII") WavDataHeaderStruct = Struct("<4sI") +AdxHeaderStruct = Struct(">HHBBBBIIHBB") +AdxLoopHeaderStruct = Struct(">HHHHIIII") class USMChunckHeaderType(Enum): CRID = b"CRID" # Header. diff --git a/PyCriCodecs/cpk.py b/PyCriCodecs/cpk.py index 2ed2cbb..3d68980 100644 --- a/PyCriCodecs/cpk.py +++ b/PyCriCodecs/cpk.py @@ -1,12 +1,37 @@ +import os from typing import BinaryIO from io import BytesIO, FileIO -import os from .chunk import * from .utf import UTF, UTFBuilder +from dataclasses import dataclass +from concurrent.futures import ProcessPoolExecutor, as_completed +from tempfile import NamedTemporaryFile import CriCodecs -class TOC(): - __slots__ = ["magic", "encflag", "packet_size", "unk0C", "stream", "table"] +def worker_do_compression(src : str, dst: str): + with open(src, "rb") as fsrc, open(dst, "wb") as fdst: + data = fsrc.read() + compressed = CriCodecs.CriLaylaCompress(data) + fdst.write(compressed) +@dataclass +class _PackFile(): + stream: BinaryIO + path: str + offset: int + size : int + compressed : bool = False + + def get_bytes(self) -> bytes: + self.stream.seek(self.offset) + data = self.stream.read(self.size) + if self.compressed: + data = CriCodecs.CriLaylaDecompress(data) + return data + + def save(self, path : str): + with open(path, "wb") as f: + f.write(self.get_bytes()) +class _TOC(): magic: bytes encflag: int packet_size: int @@ -23,7 +48,6 @@ def __init__(self, stream: bytes) -> None: self.table = UTF(self.stream.read()).table class CPK: - __slots__ = ["magic", "encflag", "packet_size", "unk0C", "stream", "tables", "filename"] magic: bytes encflag: int packet_size: int @@ -44,18 +68,18 @@ def __init__(self, filename) -> None: if self.magic != CPKChunkHeaderType.CPK.value: raise ValueError("Invalid CPK file.") self.tables = dict(CPK = UTF(self.stream.read(0x800-CPKChunkHeader.size)).table) - self.checkTocs() + self._load_tocs() - def checkTocs(self) -> None: + def _load_tocs(self) -> None: for key, value in self.tables["CPK"].items(): if key == "TocOffset": if value[0]: self.stream.seek(value[0], 0) - self.tables["TOC"] = TOC(self.stream.read(self.tables['CPK']["TocSize"][0])).table + self.tables["TOC"] = _TOC(self.stream.read(self.tables['CPK']["TocSize"][0])).table elif key == "ItocOffset": if value[0]: self.stream.seek(value[0], 0) - self.tables["ITOC"] = TOC(self.stream.read(self.tables['CPK']["ItocSize"][0])).table + self.tables["ITOC"] = _TOC(self.stream.read(self.tables['CPK']["ItocSize"][0])).table if "DataL" in self.tables["ITOC"]: self.tables["ITOC"]['DataL'][0] = UTF(self.tables["ITOC"]['DataL'][0]).table if "DataH" in self.tables["ITOC"]: @@ -63,11 +87,11 @@ def checkTocs(self) -> None: elif key == "HtocOffset": if value[0]: self.stream.seek(value[0], 0) - self.tables["HTOC"] = 
TOC(self.stream.read(self.tables['CPK']["HtocSize"][0])).table
+                self.tables["HTOC"] = _TOC(self.stream.read(self.tables['CPK']["HtocSize"][0])).table
             elif key == "GtocOffset":
                 if value[0]:
                     self.stream.seek(value[0], 0)
-                    self.tables["GTOC"] = TOC(self.stream.read(self.tables['CPK']["GtocSize"][0])).table
+                    self.tables["GTOC"] = _TOC(self.stream.read(self.tables['CPK']["GtocSize"][0])).table
                     if "AttrData" in self.tables["GTOC"]:
                         self.tables["GTOC"]['AttrData'][0] = UTF(self.tables["GTOC"]['AttrData'][0]).table
                     if "Fdata" in self.tables["GTOC"]:
@@ -77,32 +101,42 @@
             elif key == "HgtocOffset":
                 if value[0]:
                     self.stream.seek(value[0], 0)
-                    self.tables["HGTOC"] = TOC(self.stream.read(self.tables['CPK']["HgtocSize"][0])).table
+                    self.tables["HGTOC"] = _TOC(self.stream.read(self.tables['CPK']["HgtocSize"][0])).table
             elif key == "EtocOffset":
                 if value[0]:
                     self.stream.seek(value[0], 0)
-                    self.tables["ETOC"] = TOC(self.stream.read(self.tables['CPK']["EtocSize"][0])).table
+                    self.tables["ETOC"] = _TOC(self.stream.read(self.tables['CPK']["EtocSize"][0])).table
 
-    def extract(self):
+    @property
+    def mode(self):
+        TOC, ITOC, GTOC = 'TOC' in self.tables, 'ITOC' in self.tables, 'GTOC' in self.tables
+        # GTOC presence marks mode 3: the mode 3 builder below emits TOC + GTOC
+        # without an ITOC, so it must be checked before the TOC + ITOC case.
+        if TOC and GTOC:
+            return 3
+        elif TOC and ITOC:
+            return 2
+        elif TOC:
+            return 1
+        elif ITOC:
+            return 0
+        raise ValueError("Unknown CPK mode.")
+
+    @property
+    def files(self):
+        """Yields every file in the CPK archive as a _PackFile."""
        if "TOC" in self.tables:
             toctable = self.tables['TOC']
             rel_off = 0x800
             for i in range(len(toctable['FileName'])):
-                if toctable["DirName"][i%len(toctable["DirName"])] == '':
-                    dirname = self.filename.rsplit(".")[0]
-                else:
-                    dirname = os.path.join(self.filename.rsplit(".")[0], toctable["DirName"][i%len(toctable["DirName"])])
-                os.makedirs(dirname, exist_ok=True)
+                dirname = toctable["DirName"][i%len(toctable["DirName"])]
                 filename = toctable['FileName'][i]
                 if len(filename) >= 255:
                     filename = filename[:250] + "_" + str(i) # 250 because i might be 4 digits long.
                 if toctable['ExtractSize'][i] > toctable['FileSize'][i]:
                     self.stream.seek(rel_off+toctable["FileOffset"][i], 0)
-                    comp_data = self.stream.read(toctable['FileSize'][i])
-                    open(os.path.join(dirname, filename), "wb").write(CriCodecs.CriLaylaDecompress(comp_data))
+                    yield _PackFile(self.stream, os.path.join(dirname, filename), self.stream.tell(), toctable['FileSize'][i], compressed=True)
                 else:
                     self.stream.seek(rel_off+toctable["FileOffset"][i], 0)
-                    open(os.path.join(dirname, filename), "wb").write(self.stream.read(toctable['FileSize'][i]))
+                    yield _PackFile(self.stream, os.path.join(dirname, filename), self.stream.tell(), toctable['FileSize'][i])
         elif "ITOC" in self.tables:
             toctableL = self.tables["ITOC"]['DataL'][0]
             toctableH = self.tables["ITOC"]['DataH'][0]
@@ -110,109 +144,28 @@
             offset = self.tables["CPK"]["ContentOffset"][0]
             files = self.tables["CPK"]["Files"][0]
             self.stream.seek(offset, 0)
-            if self.filename:
-                dirname = self.filename.rsplit(".")[0]
-                os.makedirs(dirname, exist_ok=True)
-            else:
-                dirname = ""
             for i in sorted(toctableH['ID']+toctableL['ID']):
                 if i in toctableH['ID']:
                     idx = toctableH['ID'].index(i)
+                    pos = self.stream.tell()
                     if toctableH['ExtractSize'][idx] > toctableH['FileSize'][idx]:
-                        comp_data = self.stream.read(toctableH['FileSize'][idx])
-                        open(os.path.join(dirname, str(i)), "wb").write(CriCodecs.CriLaylaDecompress(comp_data))
+                        yield _PackFile(self.stream, str(i), pos, toctableH['FileSize'][idx], compressed=True)
                     else:
-                        open(os.path.join(dirname, str(i)), "wb").write(self.stream.read(toctableH['FileSize'][idx]))
+                        yield _PackFile(self.stream, str(i), pos, toctableH['FileSize'][idx])
+                    # Unlike the old read()-based loop, yielding does not advance the
+                    # stream (and the consumer may have moved it via get_bytes), so
+                    # re-anchor past this entry before applying the alignment skip.
+                    self.stream.seek(pos + toctableH['FileSize'][idx], 0)
                     if toctableH['FileSize'][idx] % align != 0:
                         seek_size = (align - toctableH['FileSize'][idx] % align)
                         self.stream.seek(seek_size, 1)
                 elif i in toctableL['ID']:
                     idx = toctableL['ID'].index(i)
+                    pos = self.stream.tell()
                     if toctableL['ExtractSize'][idx] > toctableL['FileSize'][idx]:
-                        comp_data = self.stream.read(toctableL['FileSize'][idx])
-                        open(os.path.join(dirname, str(i)), "wb").write(CriCodecs.CriLaylaDecompress(comp_data))
+                        yield _PackFile(self.stream, str(i), pos, toctableL['FileSize'][idx], compressed=True)
                     else:
-                        open(os.path.join(dirname, str(i)), "wb").write(self.stream.read(toctableL['FileSize'][idx]))
+                        yield _PackFile(self.stream, str(i), pos, toctableL['FileSize'][idx])
+                    self.stream.seek(pos + toctableL['FileSize'][idx], 0)
                     if toctableL['FileSize'][idx] % align != 0:
                         seek_size = (align - toctableL['FileSize'][idx] % align)
                         self.stream.seek(seek_size, 1)
-
-    def extract_file(self, filename):
-        if "TOC" in self.tables:
-            toctable = self.tables['TOC']
-            rel_off = 0x800
-            if toctable["DirName"][0] == '':
-                dirname = self.filename.rsplit(".")[0]
-            else:
-                dirname = os.path.join(self.filename.rsplit(".")[0], toctable["DirName"][0])
-            if self.filename:
-                os.makedirs(dirname, exist_ok=True)
-            if filename not in toctable['FileName']:
-                raise ValueError("Given filename does not exist inside the provided CPK.")
-            idx = toctable['FileName'].index(filename)
-            offset = rel_off+toctable["FileOffset"][idx]
-            size = toctable['FileSize'][idx]
-            self.stream.seek(offset, 0)
-            open(os.path.join(dirname, filename), "wb").write(self.stream.read(size))
-        elif "ITOC" in self.tables:
-            filename = int(filename)
-            toctableL = self.tables["ITOC"]['DataL'][0]
-            toctableH = self.tables["ITOC"]['DataH'][0]
-            alignmentsize = self.tables["CPK"]["Align"][0]
-            files = self.tables["CPK"]["Files"][0]
-            offset = self.tables["CPK"]["ContentOffset"][0]
-            if filename in toctableL['ID']:
-                idxg = toctableL['ID'].index(filename)
-            elif filename
 in toctableH['ID']:
-                idxg = toctableH['ID'].index(filename)
-            else:
-                raise ValueError("Given ID does not exist in the given CPK.")
-            self.stream.seek(offset, 0)
-            realOffset = offset
-            for i in sorted(toctableH['ID']+toctableL['ID']):
-                if i != filename:
-                    if i in toctableH["ID"]:
-                        idx = toctableH['ID'].index(i)
-                        realOffset += toctableH["FileSize"][idx]
-                        if toctableH["FileSize"][idx] % alignmentsize != 0:
-                            realOffset += (alignmentsize - toctableH["FileSize"][idx] % alignmentsize)
-                    elif i in toctableL["ID"]:
-                        idx = toctableL['ID'].index(i)
-                        realOffset += toctableH["FileSize"][idx]
-                        if toctableL["FileSize"][idx] % alignmentsize != 0:
-                            realOffset += (alignmentsize - toctableL["FileSize"][idx] % alignmentsize)
-                else:
-                    if self.filename:
-                        dirname = self.filename.rsplit(".")[0]
-                        os.makedirs(dirname)
-                    else:
-                        dirname = ""
-                    if filename in toctableH["ID"]:
-                        extractsz = toctableH['ExtractSize'][idxg]
-                        flsize = toctableH['FileSize'][idxg]
-                        if extractsz > flsize:
-                            self.stream.seek(realOffset)
-                            comp_data = self.stream.read(toctableH['FileSize'][idxg])
-                            open(os.path.join(dirname, str(filename)), "wb").write(CriCodecs.CriLaylaDecompress(comp_data))
-                        else:
-                            open(os.path.join(dirname, str(filename)), "wb").write(self.stream.read(toctableH['FileSize'][idxg]))
-                    else:
-                        extractsz = toctableL['ExtractSize'][idxg]
-                        flsize = toctableL['FileSize'][idxg]
-                        if extractsz > flsize:
-                            self.stream.seek(realOffset)
-                            comp_data = self.stream.read(toctableL['FileSize'][idxg])
-                            open(os.path.join(dirname, str(filename)), "wb").write(CriCodecs.CriLaylaDecompress(comp_data))
-                        else:
-                            open(os.path.join(dirname, str(filename)), "wb").write(self.stream.read(toctableL['FileSize'][idxg]))
-                    break
-
 class CPKBuilder:
     """ Use this class to build semi-custom CPK archives. """
-    __slots__ = ["CpkMode", "Tver", "dirname", "itoc_size", "encrypt", "encoding", "files", "fileslen",
-                "ITOCdata", "CPKdata", "ContentSize", "EnabledDataSize", "outfile", "TOCdata", "GTOCdata",
-                "ETOCdata", "compress", "EnabledPackedSize", "init_toc_len"]
-    CpkMode: int
+    mode: int
     # CPK mode dictates (at least from what I saw) the use of filenames in TOC or the use of
     # ITOC without any filenames (Use of ID's only, will be sorted).
     # CPK mode of 0 = Use of ITOC only, CPK mode = 1, use of TOC, ITOC and optionally ETOC?
@@ -224,7 +177,6 @@ class CPKBuilder:
     itoc_size: int
     encrypt: bool
    encoding: str
-    files: list
     fileslen: int
     ITOCdata: bytearray
     TOCdata: bytearray
@@ -233,109 +185,182 @@ class CPKBuilder:
     EnabledDataSize: int
     EnabledPackedSize: int
     outfile: str
-    compress: bool
     init_toc_len: int # This is a bit of a redundancy, but some CPK's need it.
-    def __init__(self, dirname: str, outfile: str, CpkMode: int = 1, Tver: str = False, encrypt: bool = False, encoding: str = "utf-8", compress: bool = False) -> None:
-        self.CpkMode = CpkMode
-        self.compress = False
+    in_files: list[tuple[str, str, bool]] # (source path, dest filename, compress or not)
+    os_files: list[tuple[str, bool]] # (os path, temp or not)
+    files: list[tuple[str, int, int]] # (filename, packed size as stored in the archive, full extracted size).
+
+    progress_cb: callable # Progress callback taking (task name, current, total)
+
+    def __init__(self, mode: int = 1, Tver: str = None, encrypt: bool = False, encoding: str = "utf-8", progress_cb: callable = None) -> None:
+        """Set up CPK file building.
+
+        Args:
+            mode (int, optional): CPK mode. 0: ID Only (ITOC), 1: Name Only (TOC), 2: Name + ID (ITOC + TOC), 3: Name + ID + GTOC (GTOC). Defaults to 1.
+            Tver (str, optional): CPK version.
Defaults to None. + encrypt (bool, optional): Enable encryption. Defaults to False. + encoding (str, optional): Filename encoding. Defaults to "utf-8". + progress_cb (callable, optional): Progress callback taking (task name, current, total). Defaults to None. + """ + self.progress_cb = progress_cb + if not self.progress_cb: + self.progress_cb = lambda task_name, current, total: None + self.mode = mode if not Tver: # Some default ones I found with the matching CpkMode, hope they are good enough for all cases. - if self.CpkMode == 0: + if self.mode == 0: self.Tver = 'CPKMC2.18.04, DLL2.78.04' - elif self.CpkMode == 1: + elif self.mode == 1: self.Tver = 'CPKMC2.45.00, DLL3.15.00' - elif self.CpkMode == 2: + elif self.mode == 2: self.Tver = 'CPKMC2.49.32, DLL3.24.00' - elif self.CpkMode == 3: + elif self.mode == 3: self.Tver = 'CPKFBSTD1.49.35, DLL3.24.00' else: raise ValueError("Unknown CpkMode.") else: self.Tver = Tver - if dirname == "": - raise ValueError("Invalid directory name/path.") - elif self.CpkMode not in [0, 1, 2, 3]: + if self.mode not in [0, 1, 2, 3]: raise ValueError("Unknown CpkMode.") - elif self.CpkMode == 0 and self.compress: - # CpkMode of 0 is a bit hard to do with compression, as I don't know where the actual data would be - # categorized (either H or L) after compression. Needs proper testing for me to implement. - raise NotImplementedError("CpkMode of 0 with compression is not supported yet.") - self.dirname = dirname + self.encrypt = encrypt self.encoding = encoding self.EnabledDataSize = 0 self.EnabledPackedSize = 0 self.ContentSize = 0 + self.in_files = [] + self.os_files = [] + + def add_file(self, src : str, dst : str = None, compress=False): + """Add a file to the bundle. + + Args: + src (str): The source file path. + dst (str): The destination full file name (containing directory). Can be None in ITOC Mode. Defaults to None. + compress (bool, optional): Whether to compress the file. Defaults to False. + + NOTE: + - In ITOC-related mode, the insertion order determines the final integer ID of the files. + - Compression can be VERY slow with high entropy files (e.g. encoded media). Use at discretion. 
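+
+        Illustrative usage (paths are placeholders):
+
+            builder = CPKBuilder(mode=1)
+            builder.add_file("src/bgm.awb", "sound/bgm.awb")
+            builder.add_file("src/movie.usm", "movies/op.usm", compress=True)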
+ """ + if not dst and self.mode != 0: + raise ValueError("Destination filename must be specified in non-ITOC mode.") + + self.in_files.append((src, dst, compress)) + + def _writetofile(self, header) -> None: + with open(self.outfile, "wb") as out: + out.write(header) + for i, ((path, _), (filename, file_size, pack_size)) in enumerate(zip(self.os_files, self.files)): + src = open(path, 'rb').read() + out.write(src) + out.write(bytes(0x800 - pack_size % 0x800)) + self.progress_cb("Write %s" % os.path.basename(filename), i + 1, len(self.files)) + + def _populate_files(self, parallel : bool): + self.files = [] + for src, dst, compress in self.in_files: + if compress: + tmp = NamedTemporaryFile(delete=False) + self.os_files.append((tmp.name, True)) + else: + self.os_files.append((src, False)) + if parallel: + with ProcessPoolExecutor() as exec: + futures = [] + for (src, _, _), (dst, compress) in zip(self.in_files,self.os_files): + if compress: + futures.append(exec.submit(worker_do_compression, src, dst)) + for i, fut in as_completed(futures): + try: + fut.result() + except: + pass + self.progress_cb("Compress %s" % os.path.basename(src), i + 1, len(futures)) + else: + for i, ((src, _, _), (dst, compress)) in enumerate(zip(self.in_files,self.os_files)): + if compress: + worker_do_compression(src, dst) + self.progress_cb("Compress %s" % os.path.basename(src), i + 1, len(self.in_files)) + for (src, filename, _) , (dst, _) in zip(self.in_files,self.os_files): + file_size = os.stat(src).st_size + pack_size = os.stat(dst).st_size + self.files.append((filename, file_size, pack_size)) + + def _cleanup_files(self): + self.files = [] + for path, is_temp in self.os_files: + if not is_temp: + continue + try: + os.unlink(path) + except: + pass + self.os_files = [] + + def save(self, outfile : str, parallel : bool = False): + """Build and save the bundle into a file + + + Args: + outfile (str): The output file path. + parallel (bool, optional): Whether to use parallel processing for file compression (if at all used). Defaults to False. + + NOTE: + - Temporary files may be created during the process if compression is used. + - parallel uses multiprocessing. Make sure your main function is guarded with `if __name__ == '__main__'` clause. 
+ """ + assert self.in_files, "cannot save empty bundle" self.outfile = outfile - self.compress = compress - self.generate_payload() - - def generate_payload(self): + self._populate_files(parallel) if self.encrypt: encflag = 0 else: encflag = 0xFF - if self.CpkMode == 3: - self.TOCdata = self.generate_TOC() + data = None + if self.mode == 3: + self.TOCdata = self._generate_TOC() self.TOCdata = bytearray(CPKChunkHeader.pack(b'TOC ', encflag, len(self.TOCdata), 0)) + self.TOCdata self.TOCdata = self.TOCdata.ljust(len(self.TOCdata) + (0x800 - len(self.TOCdata) % 0x800), b'\x00') assert self.init_toc_len == len(self.TOCdata) - self.GTOCdata = self.generate_GTOC() + self.GTOCdata = self._generate_GTOC() self.GTOCdata = bytearray(CPKChunkHeader.pack(b'GTOC', encflag, len(self.GTOCdata), 0)) + self.GTOCdata self.GTOCdata = self.GTOCdata.ljust(len(self.GTOCdata) + (0x800 - len(self.GTOCdata) % 0x800), b'\x00') - self.CPKdata = self.generate_CPK() + self.CPKdata = self._generate_CPK() self.CPKdata = bytearray(CPKChunkHeader.pack(b'CPK ', encflag, len(self.CPKdata), 0)) + self.CPKdata data = self.CPKdata.ljust(len(self.CPKdata) + (0x800 - len(self.CPKdata) % 0x800) - 6, b'\x00') + bytearray(b"(c)CRI") + self.TOCdata + self.GTOCdata - self.writetofile(data) - elif self.CpkMode == 2: - self.TOCdata = self.generate_TOC() + elif self.mode == 2: + self.TOCdata = self._generate_TOC() self.TOCdata = bytearray(CPKChunkHeader.pack(b'TOC ', encflag, len(self.TOCdata), 0)) + self.TOCdata self.TOCdata = self.TOCdata.ljust(len(self.TOCdata) + (0x800 - len(self.TOCdata) % 0x800), b'\x00') assert self.init_toc_len == len(self.TOCdata) - self.ITOCdata = self.generate_ITOC() + self.ITOCdata = self._generate_ITOC() self.ITOCdata = bytearray(CPKChunkHeader.pack(b'ITOC', encflag, len(self.ITOCdata), 0)) + self.ITOCdata self.ITOCdata = self.ITOCdata.ljust(len(self.ITOCdata) + (0x800 - len(self.ITOCdata) % 0x800), b'\x00') - self.CPKdata = self.generate_CPK() + self.CPKdata = self._generate_CPK() self.CPKdata = bytearray(CPKChunkHeader.pack(b'CPK ', encflag, len(self.CPKdata), 0)) + self.CPKdata data = self.CPKdata.ljust(len(self.CPKdata) + (0x800 - len(self.CPKdata) % 0x800) - 6, b'\x00') + bytearray(b"(c)CRI") + self.TOCdata + self.ITOCdata - self.writetofile(data) - elif self.CpkMode == 1: - self.TOCdata = self.generate_TOC() + elif self.mode == 1: + self.TOCdata = self._generate_TOC() self.TOCdata = bytearray(CPKChunkHeader.pack(b'TOC ', encflag, len(self.TOCdata), 0)) + self.TOCdata self.TOCdata = self.TOCdata.ljust(len(self.TOCdata) + (0x800 - len(self.TOCdata) % 0x800), b'\x00') assert self.init_toc_len == len(self.TOCdata) - self.CPKdata = self.generate_CPK() + self.CPKdata = self._generate_CPK() self.CPKdata = bytearray(CPKChunkHeader.pack(b'CPK ', encflag, len(self.CPKdata), 0)) + self.CPKdata data = self.CPKdata.ljust(len(self.CPKdata) + (0x800 - len(self.CPKdata) % 0x800) - 6, b'\x00') + bytearray(b"(c)CRI") + self.TOCdata - self.writetofile(data) - elif self.CpkMode == 0: - self.ITOCdata = self.generate_ITOC() + elif self.mode == 0: + self.ITOCdata = self._generate_ITOC() self.ITOCdata = bytearray(CPKChunkHeader.pack(b'ITOC', encflag, len(self.ITOCdata), 0)) + self.ITOCdata self.ITOCdata = self.ITOCdata.ljust(len(self.ITOCdata) + (0x800 - len(self.ITOCdata) % 0x800), b'\x00') - self.CPKdata = self.generate_CPK() + self.CPKdata = self._generate_CPK() self.CPKdata = bytearray(CPKChunkHeader.pack(b'CPK ', encflag, len(self.CPKdata), 0)) + self.CPKdata data = self.CPKdata.ljust(len(self.CPKdata) + (0x800 - 
len(self.CPKdata) % 0x800) - 6, b'\x00') + bytearray(b"(c)CRI") + self.ITOCdata - self.writetofile(data) - - def writetofile(self, data) -> None: - out = open(self.outfile, "wb") - out.write(data) - if self.compress: - for d in self.files: - if len(d) % 0x800 != 0: - d = d.ljust(len(d) + (0x800 - len(d) % 0x800), b"\x00") - out.write(d) - out.close() - else: - for i in self.files: - d = open(i, "rb").read() - if len(d) % 0x800 != 0: - d = d.ljust(len(d) + (0x800 - len(d) % 0x800), b"\x00") - out.write(d) - out.close() + self._writetofile(data) + self._cleanup_files() - def generate_GTOC(self) -> bytearray: + def _generate_GTOC(self) -> bytearray: + # NOTE: Practically useless # I have no idea why are those numbers here. Gdata = [ { @@ -382,48 +407,32 @@ def generate_GTOC(self) -> bytearray: "Glink": (UTFTypeValues.uint, 2), "Flink": (UTFTypeValues.uint, 3), "Attr" : (UTFTypeValues.uint, 1), - "Gdata": (UTFTypeValues.bytes, UTFBuilder(Gdata, encrypt=False, encoding=self.encoding, table_name="CpkGtocGlink").parse()), - "Fdata": (UTFTypeValues.bytes, UTFBuilder(Fdata, encrypt=False, encoding=self.encoding, table_name="CpkGtocFlink").parse()), - "Attrdata": (UTFTypeValues.bytes, UTFBuilder(Attrdata, encrypt=False, encoding=self.encoding, table_name="CpkGtocAttr").parse()), + "Gdata": (UTFTypeValues.bytes, UTFBuilder(Gdata, encrypt=False, encoding=self.encoding, table_name="CpkGtocGlink").bytes()), + "Fdata": (UTFTypeValues.bytes, UTFBuilder(Fdata, encrypt=False, encoding=self.encoding, table_name="CpkGtocFlink").bytes()), + "Attrdata": (UTFTypeValues.bytes, UTFBuilder(Attrdata, encrypt=False, encoding=self.encoding, table_name="CpkGtocAttr").bytes()), } ] - return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkGtocInfo").parse() + return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkGtocInfo").bytes() - def generate_ETOC(self) -> bytearray: - """ This is now unused, a CPK won't be unfuctional without it. I will leave it here for reference. """ - payload = [ - { - "UpdateDateTime": (UTFTypeValues.ullong, 0), - "LocalDir": (UTFTypeValues.string, self.dirname) - } - ] - return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkEtocInfo").parse() - - def generate_TOC(self) -> bytearray: - payload = [] - self.files = [] - temp = [] - self.get_files(sorted(os.listdir(self.dirname), key=lambda x: "".join([s if s != '_' else "~" for s in x]).lower()), self.dirname) + def _generate_TOC(self) -> bytearray: + payload = [] + temp = [] count = 0 lent = 0 switch = False sf = set() sd = set() - for i in self.files: + for filename, store_size, full_size in self.files: # Dirname management. - dirname = os.path.dirname(i.split(self.dirname)[1]) - if dirname.startswith(os.sep) or dirname.startswith("\\"): - dirname = dirname[1:] - if "\\" in dirname or os.sep in dirname: - dirname = dirname.replace("\\", "/") - dirname = dirname.replace(os.sep, "/") + # Must be POSIX path + dirname = os.path.dirname(filename) if dirname not in sd: switch = True lent += len(dirname) + 1 sd.update({dirname}) # Filename management. 
- flname = os.path.basename(i) + flname = os.path.basename(filename) if flname not in sf: lent += len(flname) + 1 sf.update({flname}) @@ -444,65 +453,93 @@ def generate_TOC(self) -> bytearray: self.fileslen = count count = 0 - for file in self.files: - sz = os.stat(file).st_size - fz = sz + for filename, store_size, full_size in self.files: + sz = store_size + fz = full_size if sz > 0xFFFFFFFF: raise OverflowError("4GBs is the max size of a single file that can be bundled in a CPK archive of mode 1.") - if self.compress: - self.EnabledPackedSize += sz - comp_data = CriCodecs.CriLaylaCompress(open(file, "rb").read()) - temp.append(comp_data) - fz = len(comp_data) - self.EnabledDataSize += fz - if fz % 0x800 != 0: - self.ContentSize += fz + (0x800 - fz % 0x800) - else: - self.ContentSize += fz + self.EnabledDataSize += fz + self.EnabledPackedSize += sz + if sz % 0x800 != 0: + self.ContentSize += sz + (0x800 - sz % 0x800) else: - self.EnabledDataSize += sz - self.EnabledPackedSize += sz - if sz % 0x800 != 0: - self.ContentSize += sz + (0x800 - sz % 0x800) - else: - self.ContentSize += sz - dirname = os.path.dirname(file.split(self.dirname)[1]) - if dirname.startswith(os.sep) or dirname.startswith("\\"): - dirname = dirname[1:] - if "\\" in dirname or os.sep in dirname: - dirname = dirname.replace("\\", "/") - dirname = dirname.replace(os.sep, "/") + self.ContentSize += sz + dirname = os.path.dirname(filename) payload.append( { "DirName": (UTFTypeValues.string, dirname), - "FileName": (UTFTypeValues.string, os.path.basename(file)), + "FileName": (UTFTypeValues.string, os.path.basename(filename)), "FileSize": (UTFTypeValues.uint, sz), - "ExtractSize": (UTFTypeValues.uint, (sz if not self.compress else fz)), + "ExtractSize": (UTFTypeValues.uint, fz), "FileOffset": (UTFTypeValues.ullong, lent), "ID": (UTFTypeValues.uint, count), "UserString": (UTFTypeValues.string, "") } ) count += 1 - sz = fz if sz % 0x800 != 0: lent += sz + (0x800 - sz % 0x800) else: lent += sz - if self.compress: - self.files = temp - return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkTocInfo").parse() - - def get_files(self, lyst, root): - for i in lyst: - name = os.path.join(root, i) - if os.path.isdir(name): - self.get_files(sorted(os.listdir(name), key=lambda x: "".join([s if s != '_' else "~" for s in x]).lower()), name) - else: - self.files.append(name) + return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkTocInfo").bytes() - def generate_CPK(self) -> bytearray: - if self.CpkMode == 3: + def _generate_ITOC(self) -> bytearray: + if self.mode == 2: + payload = [] + for i, (filename, store_size, full_size) in enumerate(self.files): + payload.append( + { + "ID": (UTFTypeValues.int, i), + "TocIndex": (UTFTypeValues.int, i) + } + ) + return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkExtendId").bytes() + else: + assert len(self.files) < 65535, "ITOC requires less than 65535 files." 
+ self.fileslen = len(self.files) + datal = [] + datah = [] + for i, (filename, store_size, full_size) in enumerate(self.files): + sz = store_size + fz = full_size + self.EnabledDataSize += fz + self.EnabledPackedSize += sz + if sz % 0x800 != 0: + self.ContentSize += sz + (0x800 - sz % 0x800) + else: + self.ContentSize += sz + if sz > 0xFFFF: + dicth = { + "ID": (UTFTypeValues.ushort, i), + "FileSize": (UTFTypeValues.uint, sz), + "ExtractSize": (UTFTypeValues.uint, sz) + } + datah.append(dicth) + else: + dictl = { + "ID": (UTFTypeValues.ushort, i), + "FileSize": (UTFTypeValues.ushort, sz), + "ExtractSize": (UTFTypeValues.ushort, sz) + } + datal.append(dictl) + datallen = len(datal) + datahlen = len(datah) + if len(datal) == 0: + datal.append({"ID": (UTFTypeValues.ushort, 0), "FileSize": (UTFTypeValues.ushort, 0), "ExtractSize": (UTFTypeValues.ushort, 0)}) + elif len(datah) == 0: + datah.append({"ID": (UTFTypeValues.uint, 0), "FileSize": (UTFTypeValues.uint, 0), "ExtractSize": (UTFTypeValues.uint, 0)}) + payload = [ + { + "FilesL" : (UTFTypeValues.uint, datallen), + "FilesH" : (UTFTypeValues.uint, datahlen), + "DataL" : (UTFTypeValues.bytes, UTFBuilder(datal, table_name="CpkItocL", encrypt=False, encoding=self.encoding).bytes()), + "DataH" : (UTFTypeValues.bytes, UTFBuilder(datah, table_name="CpkItocH", encrypt=False, encoding=self.encoding).bytes()) + } + ] + return UTFBuilder(payload, table_name="CpkItocInfo", encrypt=self.encrypt, encoding=self.encoding).bytes() + + def _generate_CPK(self) -> bytearray: + if self.mode == 3: ContentOffset = (0x800+len(self.TOCdata)+len(self.GTOCdata)) CpkHeader = [ { @@ -525,7 +562,7 @@ def generate_CPK(self) -> bytearray: "Align": (UTFTypeValues.ushort, 0x800), "Sorted": (UTFTypeValues.ushort, 1), "EnableFileName": (UTFTypeValues.ushort, 1), - "CpkMode": (UTFTypeValues.uint, self.CpkMode), + "CpkMode": (UTFTypeValues.uint, self.mode), "Tvers": (UTFTypeValues.string, self.Tver), "Codec": (UTFTypeValues.uint, 0), "DpkItoc": (UTFTypeValues.uint, 0), @@ -552,7 +589,7 @@ def generate_CPK(self) -> bytearray: "Comment": (UTFTypeValues.string, ''), } ] - elif self.CpkMode == 2: + elif self.mode == 2: ContentOffset = 0x800+len(self.TOCdata)+len(self.ITOCdata) CpkHeader = [ { @@ -576,7 +613,7 @@ def generate_CPK(self) -> bytearray: "Sorted": (UTFTypeValues.ushort, 1), "EnableFileName": (UTFTypeValues.ushort, 1), "EID": (UTFTypeValues.ushort, None), - "CpkMode": (UTFTypeValues.uint, self.CpkMode), + "CpkMode": (UTFTypeValues.uint, self.mode), "Tvers": (UTFTypeValues.string, self.Tver), "Codec": (UTFTypeValues.uint, 0), "DpkItoc": (UTFTypeValues.uint, 0), @@ -601,7 +638,7 @@ def generate_CPK(self) -> bytearray: "Comment": (UTFTypeValues.string, ''), } ] - elif self.CpkMode == 1: + elif self.mode == 1: ContentOffset = 0x800 + len(self.TOCdata) CpkHeader = [ { @@ -635,7 +672,7 @@ def generate_CPK(self) -> bytearray: "Align": (UTFTypeValues.ushort, 0x800), "Sorted": (UTFTypeValues.ushort, 1), "EID": (UTFTypeValues.ushort, None), - "CpkMode": (UTFTypeValues.uint, self.CpkMode), + "CpkMode": (UTFTypeValues.uint, self.mode), "Tvers": (UTFTypeValues.string, self.Tver), "Comment": (UTFTypeValues.string, ''), "Codec": (UTFTypeValues.uint, 0), @@ -651,7 +688,7 @@ def generate_CPK(self) -> bytearray: "HgtocSize": (UTFTypeValues.ullong, None), } ] - elif self.CpkMode == 0: + elif self.mode == 0: CpkHeader = [ { "UpdateDateTime": (UTFTypeValues.ullong, 0), @@ -669,7 +706,7 @@ def generate_CPK(self) -> bytearray: "Align": (UTFTypeValues.ushort, 0x800), "Sorted": 
(UTFTypeValues.ushort, 0), "EID": (UTFTypeValues.ushort, None), - "CpkMode": (UTFTypeValues.uint, self.CpkMode), + "CpkMode": (UTFTypeValues.uint, self.mode), "Tvers": (UTFTypeValues.string, self.Tver), "Codec": (UTFTypeValues.uint, 0), "DpkItoc": (UTFTypeValues.uint, 0), @@ -691,66 +728,5 @@ def generate_CPK(self) -> bytearray: "Comment": (UTFTypeValues.string, ''), } ] - return UTFBuilder(CpkHeader, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkHeader").parse() - - def generate_ITOC(self) -> bytearray: - if self.CpkMode == 2: - payload = [] - for i in range(len(self.files)): - payload.append( - { - "ID": (UTFTypeValues.int, i), - "TocIndex": (UTFTypeValues.int, i) - } - ) - return UTFBuilder(payload, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkExtendId").parse() - else: - try: - files = sorted(os.listdir(self.dirname), key=int) - except: - raise ValueError("CpkMode of 0 requires filenames to be integers.") - self.files = [os.path.join(self.dirname, i) for i in files] - if len(files) == 0: - raise ValueError("No files are present in the given directory.") - elif len(files) > 0xFFFF: - raise OverflowError("CpkMode of 0 can only contain 65535 files at max.") - self.fileslen = len(files) - datal = [] - datah = [] - for i in files: - sz = os.stat(os.path.join(self.dirname, i)).st_size - self.EnabledDataSize += sz - if sz % 0x800 != 0: - self.ContentSize += sz + (0x800 - sz % 0x800) - else: - self.ContentSize += sz - if sz > 0xFFFF: - dicth = { - "ID": (UTFTypeValues.ushort, int(i)), - "FileSize": (UTFTypeValues.uint, sz), - "ExtractSize": (UTFTypeValues.uint, sz) - } - datah.append(dicth) - else: - dictl = { - "ID": (UTFTypeValues.ushort, int(i)), - "FileSize": (UTFTypeValues.ushort, sz), - "ExtractSize": (UTFTypeValues.ushort, sz) - } - datal.append(dictl) - datallen = len(datal) - datahlen = len(datah) - self.EnabledPackedSize = self.EnabledDataSize - if len(datal) == 0: - datal.append({"ID": (UTFTypeValues.ushort, 0), "FileSize": (UTFTypeValues.ushort, 0), "ExtractSize": (UTFTypeValues.ushort, 0)}) - elif len(datah) == 0: - datah.append({"ID": (UTFTypeValues.uint, 0), "FileSize": (UTFTypeValues.uint, 0), "ExtractSize": (UTFTypeValues.uint, 0)}) - payload = [ - { - "FilesL" : (UTFTypeValues.uint, datallen), - "FilesH" : (UTFTypeValues.uint, datahlen), - "DataL" : (UTFTypeValues.bytes, UTFBuilder(datal, table_name="CpkItocL", encrypt=False, encoding=self.encoding).parse()), - "DataH" : (UTFTypeValues.bytes, UTFBuilder(datah, table_name="CpkItocH", encrypt=False, encoding=self.encoding).parse()) - } - ] - return UTFBuilder(payload, table_name="CpkItocInfo", encrypt=self.encrypt, encoding=self.encoding).parse() \ No newline at end of file + return UTFBuilder(CpkHeader, encrypt=self.encrypt, encoding=self.encoding, table_name="CpkHeader").bytes() + \ No newline at end of file diff --git a/PyCriCodecs/hca.py b/PyCriCodecs/hca.py index 21292fd..27f35b2 100644 --- a/PyCriCodecs/hca.py +++ b/PyCriCodecs/hca.py @@ -18,11 +18,6 @@ HcaRvaHeaderStruct = Struct(">4sf") class HCA: - __slots__ = ["stream", "HcaSig", "version", "header_size", "key", "subkey", "hca", "filetype", "wavbytes", - "riffSignature", "riffSize", "wave", "fmt", "fmtSize", "fmtType", "fmtChannelCount", "hcastream", - "fmtSamplingRate", "fmtSamplesPerSec", "fmtSamplingSize", "fmtBitCount", "dataSig", "dataSize", "Flags", - "AlignmentSamples", "LoopCount", "LoopNum", "LoopType", "LoopStartSample", "LoopStartByte", "LoopEndSample", - "LoopEndByte", "looping", "hcabytes", "encrypted", "enc_table", "table"] 
stream: BinaryIO hcastream: BinaryIO HcaSig: bytes diff --git a/PyCriCodecs/ivf.py b/PyCriCodecs/ivf.py deleted file mode 100644 index 2d5bb2a..0000000 --- a/PyCriCodecs/ivf.py +++ /dev/null @@ -1,62 +0,0 @@ -from io import FileIO, BytesIO -from typing import BinaryIO, Generator -from struct import Struct - -IvfChunkHeaderStruct = Struct("<4sHH4sHHIIII") -IvfFrameChunkHeaderStruct = Struct(" None: - """ Loads in the IVF file. """ - if type(ivffile) == str: - self.stream = FileIO(ivffile) - elif type(ivffile) == bytes or type(ivffile) == bytearray: - self.stream = BytesIO(ivffile) - else: - self.stream = ivffile - self.loadfile() - - def loadfile(self) -> None: - """ Cache data into class property. """ - header, version, header_len, codec, width, height, tbd, tbn, num_frames, reserved = IvfChunkHeaderStruct.unpack( - self.stream.read(IvfChunkHeaderStruct.size) - ) - - if header != b'DKIF' and codec != b"VP90": # Only VP9. - raise ValueError("Invalid or unsupported IVF file/codec.") - - self.ivf = dict( - Header = header, - Version = version, - HeaderSize = header_len, - Codec = codec, - Width = width, - Height = height, - time_base_denominator = tbd, - time_base_numerator = tbn, - FrameCount = num_frames, - Reserved = reserved - ) - self.stream.seek(header_len, 0) - - def get_frames(self) -> Generator: - """ Generator function to retrieve Frame size, Frame time, Frame number, Frame data, and Key Frame Flag. """ - for i in range(self.ivf['FrameCount']): - FrameSize, TimeStamp = IvfFrameChunkHeaderStruct.unpack( - self.stream.read(IvfFrameChunkHeaderStruct.size) - ) - self.stream.seek(-IvfFrameChunkHeaderStruct.size, 1) - FrameData = self.stream.read(FrameSize+IvfFrameChunkHeaderStruct.size) - Keyframe = False - if FrameData.startswith(KeyFrame_Flag): - Keyframe = True - yield (FrameSize+IvfFrameChunkHeaderStruct.size, TimeStamp, i, FrameData, Keyframe) # Basically, (len, time, framenum, data, Keyframeflag) - - def info(self) -> dict: - return self.ivf \ No newline at end of file diff --git a/PyCriCodecs/usm.py b/PyCriCodecs/usm.py index a077f78..07f1955 100644 --- a/PyCriCodecs/usm.py +++ b/PyCriCodecs/usm.py @@ -1,49 +1,28 @@ import os -from typing import BinaryIO +import itertools, shutil +from typing import BinaryIO, List from io import FileIO, BytesIO +from functools import cached_property + from .chunk import * from .utf import UTF, UTFBuilder -from .ivf import IVF from .adx import ADX from .hca import HCA +import ffmpeg, tempfile + # Big thanks and credit for k0lb3 and 9th helping me write this specific code. # Also credit for the original C++ code from Nyagamon/bnnm. # Apparently there is an older USM format called SofDec? This is for SofDec2 though. # Extraction working only for now, although check https://github.com/donmai-me/WannaCRI/ # code for a complete breakdown of the USM format. -class USM: - """ USM class for extracting infromation and data from a USM file. """ - __slots__ = ["filename", "videomask1", "videomask2", "audiomask", "decrypt", - "stream", "__fileinfo", "CRIDObj", "size", "output", "codec", "demuxed"] - filename: BinaryIO + +class USMCrypt: videomask1: bytearray videomask2: bytearray audiomask: bytearray - decrypt: bool - stream: BinaryIO - __fileinfo: list - CRIDObj: UTF - output: dict[str, bytes] - size: int - codec: int - demuxed: bool - - def __init__(self, filename, key: str = False): - """ - Sets the decryption status, if the key is not given, it will return the plain SFV data. 
- If the key is given the code will decrypt SFA data if it was ADX, otherwise return plain SFA data. - """ - self.filename = filename - self.decrypt = False - - if key and type(key) != bool: - self.decrypt = True - self.init_key(key) - self.load_file() - def init_key(self, key: str): if type(key) == str: if len(key) <= 16: @@ -56,7 +35,9 @@ def init_key(self, key: str): key1 = int.to_bytes(key & 0xFFFFFFFF, 4, "big") key2 = int.to_bytes(key >> 32, 4, "big") else: - raise ValueError("Invalid key format, must be either a string or an integer.") + raise ValueError( + "Invalid key format, must be either a string or an integer." + ) t = bytearray(0x20) t[0x00:0x09] = [ key1[3], @@ -82,7 +63,7 @@ def init_key(self, key: str): (t[0x0D] ^ 0xFF) % 0x100, (t[0x0A] - t[0x0B]) % 0x100, ] - t[0x10] = ((t[0x08] - t[0x0F]) % 0x100) + t[0x10] = (t[0x08] - t[0x0F]) % 0x100 t[0x11:0x17] = [ (t[0x10] ^ t[0x07]) % 0x100, (t[0x0F] ^ 0xFF) % 0x100, @@ -106,174 +87,15 @@ def init_key(self, key: str): (t[0x05] - t[0x16]) % 0x100, ] t[0x1F] = (t[0x1D] ^ t[0x13]) % 0x100 - t2=[b'U', b'R', b'U', b'C'] + t2 = [b"U", b"R", b"U", b"C"] self.videomask1 = t self.videomask2 = bytearray(map(lambda x: x ^ 0xFF, t)) self.audiomask = bytearray(0x20) for x in range(0x20): - if (x&1) == 1: - self.audiomask[x] = ord(t2[(x>>1)&3]) + if (x & 1) == 1: + self.audiomask[x] = ord(t2[(x >> 1) & 3]) else: self.audiomask[x] = self.videomask2[x] - - # Loads in the file and check if it's an USM file. - def load_file(self): - if type(self.filename) == str: - self.stream = FileIO(self.filename) - else: - self.stream = BytesIO(self.filename) - self.stream.seek(0, 2) - self.size = self.stream.tell() - self.stream.seek(0) - header = self.stream.read(4) - if header != USMChunckHeaderType.CRID.value: - raise NotImplementedError(f"Unsupported file type: {header}") - self.stream.seek(0) - self.demuxed = False - - # Demuxes the USM - def demux(self) -> None: - """ Gets data from USM chunks and assignes them to output. """ - self.stream.seek(0) - self.__fileinfo = list() # Prototype, should be improved. - header, chuncksize, unk08, offset, padding, chno, unk0D, unk0E, type, frametime, framerate, unk18, unk1C = USMChunkHeader.unpack( - self.stream.read(USMChunkHeader.size) - ) - chuncksize -= 0x18 - offset -= 0x18 - self.CRIDObj = UTF(self.stream.read(chuncksize)) - CRID_payload = self.CRIDObj.get_payload() - self.__fileinfo.append({self.CRIDObj.table_name: CRID_payload}) - headers = [(int.to_bytes(x['stmid'][1], 4, "big")).decode() for x in CRID_payload[1:]] - chnos = [x['chno'][1] for x in CRID_payload[1:]] - output = dict() - for i in range(len(headers)): - output[headers[i]+"_"+str(chnos[i])] = bytearray() - while self.stream.tell() < self.size: - header: bytes - header, chuncksize, unk08, offset, padding, chno, unk0D, unk0E, type, frametime, framerate, unk18, unk1C = USMChunkHeader.unpack( - self.stream.read(USMChunkHeader.size) - ) - chuncksize -= 0x18 - offset -= 0x18 - if header.decode() in headers: - if type == 0: - data = self.reader(chuncksize, offset, padding, header) - output[header.decode()+"_"+str(chno)].extend(data) - elif type == 1 or type == 3: - ChunkObj = UTF(self.stream.read(chuncksize)) - self.__fileinfo.append({ChunkObj.table_name: ChunkObj.get_payload()}) - if type == 1 and header == USMChunckHeaderType.SFA.value: - codec = ChunkObj.get_payload()[0] - self.codec = codec['audio_codec'][1] # So far, audio_codec of 2, means ADX, while audio_codec 4 means HCA. 
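+
+    # Illustrative key setup (the key below is a made-up example, not a real
+    # game key). init_key accepts a hex string of up to 16 digits or an int:
+    #
+    #     crypt = USMCrypt()
+    #     crypt.init_key("1234567890ABCDEF")
+    #     assert len(crypt.videomask1) == len(crypt.audiomask) == 0x20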
- else: - self.stream.seek(chuncksize, 1) - else: - # It is likely impossible for the code to reach here, since the code right now is suitable - # for any chunk type specified in the CRID header. - # But just incase somehow there's an extra chunk, this code might handle it. - if header in [chunk.value for chunk in USMChunckHeaderType]: - if type == 0: - output[header.decode()+"_0"] = bytearray() - data = self.reader(chuncksize, offset, padding, header) - output[header.decode()+"_0"].extend(data) # No channel number info, code here assumes it's a one channel data type. - elif type == 1 or type == 3: - ChunkObj = UTF(self.stream.read(chuncksize)) - self.__fileinfo.append({ChunkObj.table_name: ChunkObj.get_payload()}) - if type == 1 and header == USMChunckHeaderType.SFA.value: - codec = ChunkObj.get_payload()[0] - self.codec = codec['audio_codec'][1] - else: - self.stream.seek(chuncksize, 1) - else: - raise NotImplementedError(f"Unsupported chunk type: {header}") - self.output = output - self.demuxed = True - - def extract(self, dirname: str = ""): - """ Extracts all USM contents. """ - self.stream.seek(0) - if not self.demuxed: - self.demux() - table = self.CRIDObj.get_payload() - point = 0 # My method is not ideal here, but it'll hopefully work. - dirname = dirname # You can add a directory where all extracted data goes into. - filenames = [] - for i in table[1:]: # Skips the CRID table since it has no actual file. - filename: str = i['filename'][1] - - # Adjust filenames and/or paths to extract them into the current directory. - if ":\\" in filename: # Absolute paths. - filename = filename.split(":\\", 1)[1] - elif ":/" in filename: # Absolute paths. - filename = filename.split(":/", 1)[1] - elif ":"+os.sep in filename: # Absolute paths. - filename = filename.split(":"+os.sep, 1)[1] - elif ".."+os.sep in filename: # Relative paths. - filename = filename.rsplit(".."+os.sep, 1)[1] - elif "../" in filename: # Relative paths. - filename = filename.rsplit("../", 1)[1] - elif "..\\" in filename: # Relative paths. - filename = filename.rsplit("..\\", 1)[1] - filename = ''.join(x for x in filename if x not in ':?*<>|"') # removes illegal characters. - - filename = os.path.join(dirname, filename) # Preserves the path structure if there's one. - if filename not in filenames: - filenames.append(filename) - else: - if "." in filename: - fl = filename.rsplit(".", 1) - filenames.append(fl[0] + "_" + str(point) + "." + fl[1]) - point += 1 - else: - filenames.append(filename + "_" + str(point)) - point += 1 - - point = 0 - for chunk, data in self.output.items(): - chunk = chunk.rsplit("_", 1)[0] - if dirname or "\\" in filenames[point] or "/" in filenames[point] or os.sep in filenames[point]: - os.makedirs(os.path.dirname(filenames[point]), exist_ok=True) - if chunk == USMChunckHeaderType.SBT.value.decode(): - # Subtitle information. - texts = self.sbt_to_srt(data) - for i in range(len(texts)): - filename = filenames[point] - if "." in filename: - fl = filename.rsplit(".", 1) - filename = fl[0] + "_" + str(i) + ".srt" - else: - filename = filename + "_" + str(i) - open(filename, "w", encoding="utf-8").write(texts[i]) - else: - open(filenames[point], "wb").write(data) - point += 1 - elif chunk == USMChunckHeaderType.CUE.value.decode(): - # CUE chunks is actually just metadata. - # and can be accessed by get_metadata() function after demuxing or extracting. - point += 1 - elif data == bytearray(): - # This means it has no data, and just like the CUE, it might be just metadata. 
- point += 1 - elif filenames[point] == "": - # Rare case and might never happen unless the USM is artificially edited. - fl = table[0]["filename"][1].rsplit(".", 1)[0] + "_" + str(point) + ".bin" - open(fl, "wb").write(data) - point += 1 - else: - open(filenames[point], "wb").write(data) - point += 1 - - def reader(self, chuncksize, offset, padding, header) -> bytearray: - """ Chunks reader function, reads all data in a chunk and returns a bytearray. """ - data = bytearray(self.stream.read(chuncksize)[offset:]) - if header == USMChunckHeaderType.SFV.value or header == USMChunckHeaderType.ALP.value: - data = self.VideoMask(data) if self.decrypt else data - elif header == USMChunckHeaderType.SFA.value: - data = self.AudioMask(data) if (self.codec == 2 and self.decrypt) else data - if padding: - data = data[:-padding] - return data # Decrypt SFV chunks or ALP chunks, should only be used if the video data is encrypted. def VideoMask(self, memObj: bytearray) -> bytearray: @@ -282,7 +104,7 @@ def VideoMask(self, memObj: bytearray) -> bytearray: size = len(memObj) # memObj len is a cached property, very fast to lookup if size <= 0x200: - return (head + memObj) + return head + memObj data_view = memoryview(memObj).cast("Q") # mask 2 @@ -307,7 +129,7 @@ def VideoMask(self, memObj: bytearray) -> bytearray: data_view[i] ^= mask_view[mask_index] mask_index = (mask_index + 1) % 4 - return (head + memObj) + return head + memObj # Decrypts SFA chunks, should just be used with ADX files. def AudioMask(self, memObj: bytearray) -> bytearray: @@ -317,249 +139,399 @@ def AudioMask(self, memObj: bytearray) -> bytearray: data_view = memoryview(memObj).cast("Q") mask = bytearray(self.audiomask) mask_view = memoryview(mask).cast("Q") - for i in range(size//8): - data_view[i] ^= mask_view[i%4] - return (head + memObj) - - def sbt_to_srt(self, stream: bytearray) -> list: - """ Convert SBT chunks info to SRT. """ - # After searching, I found how the SBT format is actually made. - # But the use case for them is not ideal as they are proprietary. - # So I will just convert them to SRT. - size = len(stream) - stream: BytesIO = BytesIO(stream) - out = dict() - while stream.tell() < size: - langid, framerate, frametime, duration, data_size = SBTChunkHeader.unpack( - stream.read(SBTChunkHeader.size) - ) - # Language ID's are arbitrary, so they could be anything. - duration_in_ms = frametime - ms = duration_in_ms % framerate - sec = (duration_in_ms // framerate) % 60 - mins = (duration_in_ms // (framerate*60)) % 60 - hrs = (duration_in_ms // (framerate*60*60)) % 24 - start = f'{hrs:0>2.0f}:{mins:0>2.0f}:{sec:0>2.0f},{ms:0>3.0f}' - - duration_in_ms = frametime + duration - ms = duration_in_ms % framerate - sec = (duration_in_ms // framerate) % 60 - mins = (duration_in_ms // (framerate*60)) % 60 - hrs = (duration_in_ms // (framerate*60*60)) % 24 - end = f'{hrs:0>2.0f}:{mins:0>2.0f}:{sec:0>2.0f},{ms:0>3.0f}' - - text = stream.read(data_size) - if text.endswith(b"\x00\x00"): - text = text[:-2].decode("utf-8", errors="ignore") + "\n\n" - else: - text = text.decode("utf-8", errors="ignore") - if langid in out: - out[langid].append(str(int(out[langid][-1].split("\n", 1)[0]) + 1) + "\n" + start + " --> " + end + "\n" + text) - else: - out[langid] = [(str(1) + "\n" + start + " --> " + end + "\n" + text)] - out = ["".join(v) for k, v in out.items()] - return out - - def get_metadata(self): - """ Function to return USM metadata after demuxing. 
""" - return self.__fileinfo + for i in range(size // 8): + data_view[i] ^= mask_view[i % 4] + return head + memObj + # There are a lot of unknowns, minbuf(minimum buffer of what?) and avbps(average bitrate per second) # are still unknown how to derive them, at least video wise it is possible, no idea how it's calculated audio wise nor anything else # seems like it could be random values and the USM would still work. -class USMBuilder: - __slots__ = ["ivfObj", "videomask1", "videomask2", "audiomask", "encrypt", "audio_codec", - "streams", "encryptAudio", "SFA_chunk_size", "base_interval_per_SFA_chunk", - "video_codec", "SFV_interval_for_VP9", "audio", "video_filename", "minchk", - "audio_filenames", "minbuf", "avbps", "key", "usm"] - ivfObj: IVF - videomask1: bytearray - videomask2: bytearray - audiomask: bytearray - encrypt: bool - audio_codec: str - streams: list - encryptAudio: bool - SFA_chunk_size: list - base_interval_per_SFA_chunk: list - video_codec: str - SFV_interval_for_VP9: float - audio: bool - video_filename: str - audio_filenames: list +class FFmpegCodec: + filename: str + filesize: int + + info: dict + file: FileIO + minchk: int minbuf: int avbps: int - key: int - usm: bytes - def __init__(self, video, audio = False, key = False, audio_codec: str = "adx", encryptAudio: bool = False) -> None: - """ USM constructor, needs a video to build a USM. """ - if type(video) == str: - videostream = FileIO(video) - self.video_filename = video - else: - videostream = BytesIO(video) - self.video_filename = "temp.ivf" - - header = videostream.read(4) - - if header == USMChunckHeaderType.CRID.value: - raise NotImplementedError("USM editing is not implemented yet.") - # self.load_usm() - # self.ivfObj = False - # self.encryptAudio = encryptAudio - # self.audio_codec = audio_codec.lower() - # self.encrypt = False - # if key: - # self.init_key(key) - # self.encrypt = True - elif header != VideoType.IVF.value: - raise NotImplementedError("Video container must be in IVF format containing VP9 codec.") + def __init__(self, stream: str | bytes): + if type(stream) == str: + self.filename = stream else: - videostream.seek(0) - self.ivfObj = IVF(videostream) - self.video_codec = "vp9" - self.audio_codec = audio_codec.lower() - self.encrypt = False - self.audio = False - self.encryptAudio = encryptAudio - self.key = 0 - if encryptAudio and not key: - raise ValueError("Cannot encrypt Audio without key.") - if key: - self.init_key(key) - self.encrypt = True - if audio: - self.load_audio(audio) - self.audio = True - - def load_audio(self, audio): - self.audio_filenames = [] - if type(audio) == list: - count = 0 - for track in audio: - if type(track) == str: - self.audio_filenames.append(track) - else: - self.audio_filenames.append("{:02d}.sfa".format(count)) - count += 1 + self.tempfile = tempfile.NamedTemporaryFile(delete=False) + self.tempfile.write(stream) + self.tempfile.close() + self.filename = self.tempfile.name + self.info = ffmpeg.probe( + self.filename, show_entries="packet=dts,pts_time,pos,flags,duration_time" + ) + if type(stream) == str: + self.file = open(self.filename, "rb") + self.filesize = os.path.getsize(self.filename) else: - if type(audio) == str: - self.audio_filenames.append(audio) - else: - self.audio_filenames.append("00.sfa") + os.unlink(self.tempfile.name) + self.file = BytesIO(stream) + self.filesize = len(stream) - self.streams = [] - if self.audio_codec == "adx": - if type(audio) == list: - for track in audio: - wav_bytes = open(track, "rb").read() - adxObj = 
ADX.encode(wav_bytes, AdxVersion=4, Encoding=3, force_not_looping=True) - self.streams.append(adxObj) - else: - wav_bytes = open(audio, "rb").read() - adxObj = ADX.encode(wav_bytes, AdxVersion=4, Encoding=3, force_not_looping=True) - self.streams.append(adxObj) - elif self.audio_codec == "hca": - if type(audio) == list: - for track in audio: - hcaObj = HCA(track, key=self.key) - if hcaObj.filetype == "wav": - hcaObj.encode(force_not_looping=True, encrypt=self.encryptAudio, keyless=False) - self.streams.append(hcaObj) - else: - hcaObj = HCA(audio, key=self.key) - if hcaObj.filetype == "wav": - hcaObj.encode(force_not_looping=True, encrypt=self.encryptAudio, keyless=False) - self.streams.append(hcaObj) - else: - raise ValueError("Supported audio codecs in USM are only HCA and ADX.") - - def append_stream(self, audio): - assert type(audio) != list - if self.audio_codec == "adx": - wav_bytes = open(audio, "rb").read() - adxObj = ADX.encode(wav_bytes, AdxVersion=4, Encoding=3, force_not_looping=True) - self.streams.append(adxObj) - elif self.audio_codec == "hca": - hcaObj = HCA(audio, self.key) - if hcaObj.filetype == "wav": - hcaObj.encode(force_not_looping=True, encrypt=self.encryptAudio, keyless=False) - self.streams.append(hcaObj) - else: - raise ValueError("Supported audio codecs in USM are only HCA and ADX.") - - def build(self) -> bytes: - if not self.ivfObj: - raise NotImplementedError("Loaded USM is not supported yet.") # saved with get_usm() - if self.audio: - self.prepare_SFA() - self.prepare_SFV() - # This will be a hit to performance, but I will store the building USM on memory instead of - # flushing it to disk right away, this in case something going wrong. - self.get_data() + @property + def format(self): + return self.info["format"]["format_name"] + + @property + def stream(self) -> dict: + return self.info["streams"][0] + + @property + def codec(self): + return self.stream["codec_name"] - # So, so bad. FIXME - def get_data(self) -> bytes: - ivfinfo = self.ivfObj.info() - self.ivfObj.stream.seek(0) - current_interval = 0 - v_framerate = int((ivfinfo["time_base_denominator"] / ivfinfo["time_base_numerator"]) * 100) - SFV_header = self.ivfObj.stream.read(ivfinfo["HeaderSize"]) + @cached_property + def framerate(self): + """Running framerate (max frame rate)""" + # Lesson learned. Do NOT trust the metadata. + # num, denom = self.stream["r_frame_rate"].split("/") + # return int(int(num) / int(denom)) + return 1 / min((dt for _, _, _, dt in self.frames())) - ######################################### - # SFV chunks generator. 
- ######################################### + @cached_property + def avg_framerate(self): + """Average framerate""" + # avg_frame_rate = self.stream.get("avg_frame_rate", None) + # if avg_frame_rate: + # num, denom = avg_frame_rate.split("/") + # return int(int(num) / int(denom)) + return self.frame_count / sum((dt for _, _, _, dt in self.frames())) + + @property + def packets(self): + return self.info["packets"] + + @property + def width(self): + return self.stream["width"] + + @property + def height(self): + return self.stream["height"] + + @property + def frame_count(self): + return len(self.packets) + + def frames(self): + """frame data, frame dict, is keyframe, duration""" + offsets = [int(packet["pos"]) for packet in self.packets] + [self.filesize] + for i, frame in enumerate(self.packets): + frame_size = offsets[i + 1] - offsets[i] + self.file.seek(offsets[i]) + raw_frame = self.file.read(frame_size) + yield raw_frame, frame, frame["flags"][0] == "K", float(frame["duration_time"]) + + def generate_SFV(self, builder: "USMBuilder"): + v_framerate = int(self.framerate) + current_interval = 0 SFV_list = [] - SFV_chunk = b'' + SFV_chunk = b"" count = 0 self.minchk = 0 self.minbuf = 0 bitrate = 0 - for data in self.ivfObj.get_frames(): + for data, _, is_keyframe, dt in self.frames(): # SFV has priority in chunks, it comes first. - pad_len = data[0] + len(SFV_header) if count == 0 else data[0] - padding = (0x20 - (pad_len % 0x20) if pad_len % 0x20 != 0 else 0) + datalen = len(data) + padlen = 0x20 - (datalen % 0x20) if datalen % 0x20 != 0 else 0 SFV_chunk = USMChunkHeader.pack( - USMChunckHeaderType.SFV.value, - pad_len + 0x18 + padding, - 0, - 0x18, - padding, - 0, - 0, - 0, - 0, - current_interval, - v_framerate, - 0, - 0 - ) - temp = data[3] - if count == 0: - temp = SFV_header + temp - if self.encrypt: - temp = self.VideoMask(temp) - SFV_chunk += temp - SFV_chunk = SFV_chunk.ljust(pad_len + 0x18 + padding + 0x8, b"\x00") + USMChunckHeaderType.SFV.value, + datalen + 0x18 + padlen, + 0, + 0x18, + padlen, + 0, + 0, + 0, + 0, + int(current_interval), + v_framerate, + 0, + 0, + ) + if builder.encrypt: + data = builder.VideoMask(data) + SFV_chunk += data + SFV_chunk = SFV_chunk.ljust(datalen + 0x18 + padlen + 0x8, b"\x00") SFV_list.append(SFV_chunk) count += 1 - current_interval = int(count * self.SFV_interval_for_VP9) - if data[4]: + current_interval += 2997 * dt # 29.97 as base + if is_keyframe: self.minchk += 1 - if self.minbuf < pad_len: - self.minbuf = pad_len - bitrate += (pad_len * 8 * (v_framerate/100)) + if self.minbuf < datalen: + self.minbuf = datalen + bitrate += datalen * 8 * v_framerate else: - self.avbps = int(bitrate/count) + self.avbps = int(bitrate / count) SFV_chunk = USMChunkHeader.pack( - USMChunckHeaderType.SFV.value, + USMChunckHeaderType.SFV.value, 0x38, 0, 0x18, 0, 0, 0, 0, 2, 0, 30, 0, 0 + ) + SFV_chunk += b"#CONTENTS END ===============\x00" + SFV_list.append(SFV_chunk) + return SFV_list + + def save(self, filepath: str): + '''Saves the raw, underlying video stream to a file.''' + tell = self.file.tell() + self.file.seek(0) + shutil.copyfileobj(self.file, open(filepath, 'wb')) + self.file.seek(tell) + +class VP9Codec(FFmpegCodec): + MPEG_CODEC = 9 + MPEG_DCPREC = 0 + VERSION = 16777984 + + def __init__(self, filename: str | bytes): + super().__init__(filename) + assert self.format == "ivf", "must be ivf format." 
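For orientation, a minimal sketch (illustrative, outside this diff) of the ffprobe call that FFmpegCodec builds on; "video.ivf" is a hypothetical input, and the probe arguments mirror the ones used in FFmpegCodec.__init__ above:

import ffmpeg  # ffmpeg-python, as imported at the top of usm.py

info = ffmpeg.probe("video.ivf", show_entries="packet=dts,pts_time,pos,flags,duration_time")
print(info["format"]["format_name"])  # container name, e.g. "ivf"
for packet in info["packets"][:3]:
    # "pos" is the packet's byte offset in the file, a leading "K" in "flags"
    # marks a keyframe, and "duration_time" is the frame duration in seconds;
    # frames(), framerate and generate_SFV are all derived from these fields.
    print(packet["pos"], packet["flags"], packet["duration_time"])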
+class H264Codec(FFmpegCodec):
+    MPEG_CODEC = 5
+    MPEG_DCPREC = 11
+    VERSION = 0
+
+    def __init__(self, filename: str | bytes):
+        super().__init__(filename)
+        assert (
+            self.format == "h264"
+        ), "must be a raw h264 bitstream; transcode with a '.h264' suffix as output"
+class MPEG1Codec(FFmpegCodec):
+    MPEG_CODEC = 1
+    MPEG_DCPREC = 11
+    VERSION = 0
+
+    def __init__(self, stream: str | bytes):
+        super().__init__(stream)
+        assert 'mp4' in self.format, "must be mp4 format."
+
+class HCACodec(HCA):
+    CHUNK_INTERVAL = 64
+    BASE_FRAMERATE = 2997 # dt = CHUNK_INTERVAL / BASE_FRAMERATE
+    AUDIO_CODEC = 4
+    METADATA_COUNT = 1
+
+    filename: str
+
+    chnls: int
+    sampling_rate: int
+    total_samples: int
+    avbps: int
+
+    filesize: int
+
+    def __init__(self, stream: str | bytes, filename: str, quality: CriHcaQuality = None, key=0, subkey=0, **kwargs):
+        self.filename = filename
+        super().__init__(stream, key, subkey)
+        if self.filetype == "wav":
+            # Only forward quality_level when a quality was explicitly requested;
+            # a bare HCACodec(stream, name) would otherwise crash on the missing argument.
+            encode_kwargs = dict(force_not_looping=True, encrypt=key != 0, keyless=False)
+            if quality is not None:
+                encode_kwargs["quality_level"] = quality
+            self.encode(**encode_kwargs)
+        self.hcastream.seek(0, 2)
+        self.filesize = self.hcastream.tell()
+        self.hcastream.seek(0)
+
+        if self.filetype == "wav":
+            self.chnls = self.fmtChannelCount
+            self.sampling_rate = self.fmtSamplingRate
+            self.total_samples = int(self.dataSize // self.fmtSamplingSize)
+        else:
+            self.chnls = self.hca["ChannelCount"]
+            self.sampling_rate = self.hca["SampleRate"]
+            self.total_samples = self.hca["FrameCount"]
+        # I don't know how this is derived so I am putting my best guess here. TODO
+        self.avbps = int(self.filesize / self.chnls)
+
+    def generate_SFA(self, index: int, builder: "USMBuilder"):
+        current_interval = 0
+        padding = (
+            0x20 - (self.hca["HeaderSize"] % 0x20)
+            if self.hca["HeaderSize"] % 0x20 != 0
+            else 0
+        )
+        SFA_chunk = USMChunkHeader.pack(
+            USMChunckHeaderType.SFA.value,
+            self.hca["HeaderSize"] + 0x18 + padding,
+            0,
+            0x18,
+            padding,
+            index,
+            0,
+            0,
+            0,
+            current_interval,
+            self.BASE_FRAMERATE,
+            0,
+            0,
+        )
+        SFA_chunk += self.get_header().ljust(self.hca["HeaderSize"] + padding, b"\x00")
+        res = []
+        res.append(SFA_chunk)
+        for i, frame in enumerate(self.get_frames(), start=1):
+            padding = (
+                0x20 - (self.hca["FrameSize"] % 0x20)
+                if self.hca["FrameSize"] % 0x20 != 0
+                else 0
+            )
+            SFA_chunk = USMChunkHeader.pack(
+                USMChunckHeaderType.SFA.value,
+                self.hca["FrameSize"] + 0x18 + padding,
+                0,
+                0x18,
+                padding,
+                index,
+                0,
+                0,
+                0,
+                current_interval,
+                self.BASE_FRAMERATE,
+                0,
+                0,
+            )
+            SFA_chunk += frame[1].ljust(self.hca["FrameSize"] + padding, b"\x00")
+            current_interval = round(i * self.CHUNK_INTERVAL)
+            res.append(SFA_chunk)
+        else:
+            SFA_chunk = USMChunkHeader.pack(
+                USMChunckHeaderType.SFA.value,
+                0x38,
+                0,
+                0x18,
+                0,
+                index,
+                0,
+                0,
+                2,
+                0,
+                30,
+                0,
+                0,
+            )
+            SFA_chunk += b"#CONTENTS END ===============\x00"
+            res[-1] += SFA_chunk
+
+        return res
+
+    def get_metadata(self):
+        payload = [dict(hca_header=(UTFTypeValues.bytes, self.get_header()))]
+        p = UTFBuilder(payload, table_name="AUDIO_HEADER")
+        p.strings = b"\x00" + p.strings
+        return p.bytes()
+
+    def save(self, filepath: str):
+        """Saves the decoded WAV audio to filepath"""
+        with open(filepath, "wb") as f:
+            f.write(self.decode())
+
+class ADXCodec(ADX):
+    CHUNK_INTERVAL = 99.9
+    BASE_FRAMERATE = 2997
+    # TODO: Move these to an enum
+    AUDIO_CODEC = 2
+    METADATA_COUNT = 0
+
+    filename: str
+    filesize: int
+
+    adx: bytes
+    header: bytes
+    sfaStream: BinaryIO
+
+    AdxDataOffset: int
+    AdxEncoding: int
+    AdxBlocksize: int
+    AdxSampleBitdepth: int
+    AdxChannelCount: int
+    AdxSamplingRate: int
+    AdxSampleCount: int
+    AdxHighpassFrequency: int
+    AdxVersion: int
+    AdxFlags: int
+
+    chnls: int
+    sampling_rate: int
+    total_samples: int
+    avbps: int
+
+    def __init__(self, stream: str | bytes, filename: str, bitdepth: int = 4, **kwargs):
+        if type(stream) == str:
+            self.adx = open(stream, "rb").read()
+        else:
+            self.adx = stream
+        self.filename = filename
+        self.filesize = len(self.adx)
+        magic = self.adx[:4]
+        if magic == b"RIFF":
+            self.adx = self.encode(self.adx, bitdepth, force_not_looping=True)
+        self.sfaStream = BytesIO(self.adx)
+        header = AdxHeaderStruct.unpack(self.sfaStream.read(AdxHeaderStruct.size))
+        FourCC, self.AdxDataOffset, self.AdxEncoding, self.AdxBlocksize, self.AdxSampleBitdepth, self.AdxChannelCount, self.AdxSamplingRate, self.AdxSampleCount, self.AdxHighpassFrequency, self.AdxVersion, self.AdxFlags = header
+        assert FourCC == 0x8000, "Input must be either an ADX or a WAV file"
+        assert self.AdxVersion in {3, 4}, "Unsupported ADX version"
+        if self.AdxVersion == 4:
+            self.sfaStream.seek(4 + 4 * self.AdxChannelCount, 1) # Padding + Hist values, they always seem to be 0.
+        self.sfaStream.seek(0)
+        self.chnls = self.AdxChannelCount
+        self.sampling_rate = self.AdxSamplingRate
+        self.total_samples = self.AdxSampleCount
+        # I am not sure if this only works when there's one audio stream. TODO
+        self.avbps = int(self.filesize * 8 * self.chnls) - self.filesize
+
+    def generate_SFA(self, index: int, builder: "USMBuilder"):
+        current_interval = 0
+        stream_size = len(self.adx) - self.AdxBlocksize
+        chunk_size = int(self.AdxSamplingRate // (self.BASE_FRAMERATE / 100) // 32) * (self.AdxBlocksize * self.AdxChannelCount)
+        self.sfaStream.seek(0)
+        res = []
+        while self.sfaStream.tell() < stream_size:
+            if self.sfaStream.tell() > 0:
+                if self.sfaStream.tell() + chunk_size < stream_size:
+                    datalen = chunk_size
+                else:
+                    datalen = (stream_size - (self.AdxDataOffset + 4) - chunk_size) % chunk_size
+            else:
+                datalen = self.AdxDataOffset + 4
+            padding = (0x20 - (datalen % 0x20) if datalen % 0x20 != 0 else 0)
+            SFA_chunk = USMChunkHeader.pack(
+                USMChunckHeaderType.SFA.value,
+                datalen + 0x18 + padding,
+                0,
+                0x18,
+                padding,
+                index,
+                0,
+                0,
+                0,
+                round(current_interval),
+                self.BASE_FRAMERATE,
+                0,
+                0
+            )
+            chunk_data = self.sfaStream.read(datalen)
+            if builder.encrypt_audio:
+                # Mask the payload itself; appending the unmasked bytes here would silently ship plaintext audio.
+                chunk_data = builder.AudioMask(chunk_data)
+            SFA_chunk += chunk_data.ljust(datalen + padding, b"\x00")
+            current_interval += self.CHUNK_INTERVAL
+            res.append(SFA_chunk)
+        else:
+            SFA_chunk = USMChunkHeader.pack(
+                USMChunckHeaderType.SFA.value,
                 0x38,
                 0,
                 0x18,
                 0,
-                0,
+                index,
                 0,
                 0,
                 2,
@@ -568,350 +540,495 @@ def get_data(self) -> bytes:
                 0,
                 0
             )
-            SFV_chunk += b"#CONTENTS END ===============\x00"
-            SFV_list.append(SFV_chunk)
-            #########################################
-            # SFV chunks generator end.
-            #########################################
-
-            #########################################
-            # SFA chunks generator.
-            #########################################
-            if self.audio:
-                SFA_chunks = [[] for i in range(len(self.streams))]
-                for stream in self.streams:
-                    current_interval = 0
-                    if self.audio_codec == "adx":
-                        stream.sfaStream.seek(0, 2)
-                        stream_size = stream.sfaStream.tell() - (0x12 if stream.filetype == "wav" else stream.Blocksize)
-                        stream.sfaStream.seek(0)
-                        count = 0
-                        while stream.sfaStream.tell() < stream_size:
-                            if stream.sfaStream.tell() == 0:
-                                if stream.filetype == "wav":
-                                    do = 0x120
-                                else:
-                                    do = stream.dataOffset+4
-                            else:
-                                # Compute expensive.
-                                do = (stream_size - (0x120 if stream.filetype == "wav" else stream.dataOffset+4) - self.SFA_chunk_size[self.streams.index(stream)]) % self.SFA_chunk_size[self.streams.index(stream)] if stream.sfaStream.tell() + self.SFA_chunk_size[self.streams.index(stream)] > stream_size else self.SFA_chunk_size[self.streams.index(stream)]
-                            padding = (0x20 - (do % 0x20) if do % 0x20 != 0 else 0)
-                            SFA_chunk = USMChunkHeader.pack(
-                                USMChunckHeaderType.SFA.value,
-                                do + 0x18 + padding,
-                                0,
-                                0x18,
-                                padding,
-                                self.streams.index(stream),
-                                0,
-                                0,
-                                0,
-                                current_interval,
-                                2997,
-                                0,
-                                0
-                            )
-                            temp_stream = stream.sfaStream.read(do)
-                            if self.encryptAudio:
-                                temp_stream = self.AudioMask(temp_stream)
-                            SFA_chunk += temp_stream.ljust(do + padding, b"\x00")
-                            SFA_chunks[self.streams.index(stream)].append(SFA_chunk)
-                            current_interval = int(count * self.base_interval_per_SFA_chunk[self.streams.index(stream)])
-                            count += 1
-                        else:
-                            do = (0x12 if stream.filetype == "wav" else stream.Blocksize)
-                            padding = (0x20 - (do % 0x20) if do % 0x20 != 0 else 0)
-                            SFA_chunk = USMChunkHeader.pack(
-                                USMChunckHeaderType.SFA.value,
-                                do + 0x18 + padding,
-                                0,
-                                0x18,
-                                padding,
-                                self.streams.index(stream),
-                                0,
-                                0,
-                                0,
-                                current_interval,
-                                2997,
-                                0,
-                                0
-                            )
-                            SFA_chunk += stream.sfaStream.read(do).ljust(do + padding, b"\x00")
-                            SFA_chunks[self.streams.index(stream)].append(SFA_chunk)
-                            current_interval = int(count * self.base_interval_per_SFA_chunk[self.streams.index(stream)])
-                            SFA_chunk = USMChunkHeader.pack(
-                                USMChunckHeaderType.SFA.value,
-                                0x38,
-                                0,
-                                0x18,
-                                0,
-                                self.streams.index(stream),
-                                0,
-                                0,
-                                2,
-                                0,
-                                30,
-                                0,
-                                0
-                            )
-                            SFA_chunk += b"#CONTENTS END ===============\x00"
-                            SFA_chunks[self.streams.index(stream)][-1]+=SFA_chunk
+            SFA_chunk += b"#CONTENTS END ===============\x00"
+            res[-1] += SFA_chunk
+        return res
+
+    def get_metadata(self):
+        return None
+
+    def save(self, filepath: str):
+        """Saves the decoded WAV audio to filepath"""
+        with open(filepath, "wb") as f:
+            f.write(self.decode(self.adx))
+
+
+class USM(USMCrypt):
+    """USM class for extracting information and data from a USM file."""
+
+    filename: str
+    decrypt: bool
+    stream: BinaryIO
+    CRIDObj: UTF
+    output: dict[str, bytes]
+    size: int
+    demuxed: bool
+
+    audio_codec: int
+    video_codec: int
+
+    metadata: list
+
+    def __init__(self, filename, key: str | int = None):
+        """Loads a USM file into memory and prepares it for processing.
+
+        Args:
+            filename (str): The path to the USM file.
+            key (str, optional): The decryption key. Either int64 or a hex string. Defaults to None.
+ """ + self.filename = filename + self.decrypt = False + + if key: + self.decrypt = True + self.init_key(key) + self._load_file() + + def _load_file(self): + self.stream = open(self.filename, "rb") + self.stream.seek(0, 2) + self.size = self.stream.tell() + self.stream.seek(0) + header = self.stream.read(4) + if header != USMChunckHeaderType.CRID.value: + raise NotImplementedError(f"Unsupported file type: {header}") + self.stream.seek(0) + self._demux() + + def _demux(self) -> None: + """Gets data from USM chunks and assignes them to output.""" + self.stream.seek(0) + self.metadata = list() + ( + header, + chuncksize, + unk08, + offset, + padding, + chno, + unk0D, + unk0E, + type, + frametime, + framerate, + unk18, + unk1C, + ) = USMChunkHeader.unpack(self.stream.read(USMChunkHeader.size)) + chuncksize -= 0x18 + offset -= 0x18 + self.CRIDObj = UTF(self.stream.read(chuncksize)) + CRID_payload = self.CRIDObj.dictarray + headers = [ + (int.to_bytes(x["stmid"][1], 4, "big")).decode() for x in CRID_payload[1:] + ] + chnos = [x["chno"][1] for x in CRID_payload[1:]] + output = dict() + for i in range(len(headers)): + output[headers[i] + "_" + str(chnos[i])] = bytearray() + while self.stream.tell() < self.size: + header: bytes + ( + header, + chuncksize, + unk08, + offset, + padding, + chno, + unk0D, + unk0E, + type, + frametime, + framerate, + unk18, + unk1C, + ) = USMChunkHeader.unpack(self.stream.read(USMChunkHeader.size)) + chuncksize -= 0x18 + offset -= 0x18 + if header.decode() in headers: + if type == 0: + data = self._reader(chuncksize, offset, padding, header) + output[header.decode() + "_" + str(chno)].extend(data) + elif type == 1 or type == 3: + ChunkObj = UTF(self.stream.read(chuncksize)) + self.metadata.append(ChunkObj) + if type == 1: + if header == USMChunckHeaderType.SFA.value: + codec = ChunkObj.dictarray[0] + self.audio_codec = codec["audio_codec"][1] + # So far, audio_codec of 2, means ADX, while audio_codec 4 means HCA. + if header == USMChunckHeaderType.SFV.value: + self.video_codec = ChunkObj.dictarray[0]['mpeg_codec'][1] else: - stream: HCA - padding = (0x20 - (stream.hca["HeaderSize"] % 0x20) if stream.hca["HeaderSize"] % 0x20 != 0 else 0) - SFA_chunk = USMChunkHeader.pack( - USMChunckHeaderType.SFA.value, - stream.hca["HeaderSize"] + 0x18 + padding, - 0, - 0x18, - padding, - self.streams.index(stream), - 0, - 0, - 0, - current_interval, - 2997, - 0, - 0 - ) - SFA_chunk += stream.get_header().ljust(stream.hca["HeaderSize"]+ padding, b"\x00") - SFA_chunks[self.streams.index(stream)].append(SFA_chunk) - for i in stream.get_frames(): - padding = (0x20 - (stream.hca["FrameSize"] % 0x20) if stream.hca["FrameSize"] % 0x20 != 0 else 0) - SFA_chunk = USMChunkHeader.pack( - USMChunckHeaderType.SFA.value, - stream.hca["FrameSize"] + 0x18 + padding, - 0, - 0x18, - padding, - self.streams.index(stream), - 0, - 0, - 0, - current_interval, - 2997, - 0, - 0 - ) - SFA_chunk += i[1].ljust(stream.hca["FrameSize"] + padding , b"\x00") - current_interval += self.base_interval_per_SFA_chunk[self.streams.index(stream)] - SFA_chunks[self.streams.index(stream)].append(SFA_chunk) + self.stream.seek(chuncksize, 1) + else: + # It is likely impossible for the code to reach here, since the code right now is suitable + # for any chunk type specified in the CRID header. + # But just incase somehow there's an extra chunk, this code might handle it. 
+                if header in [chunk.value for chunk in USMChunckHeaderType]:
+                    if type == 0:
+                        output[header.decode() + "_0"] = bytearray()
+                        data = self._reader(chuncksize, offset, padding, header)
+                        output[header.decode() + "_0"].extend(
+                            data
+                        )  # No channel number info, code here assumes it's a one channel data type.
+                    elif type == 1 or type == 3:
+                        ChunkObj = UTF(self.stream.read(chuncksize))
+                        self.metadata.append(ChunkObj)
+                        if type == 1 and header == USMChunckHeaderType.SFA.value:
+                            codec = ChunkObj.dictarray[0]
+                            self.audio_codec = codec["audio_codec"][1]
                 else:
-                    self.stream.seek(chuncksize, 1)
+                    self.stream.seek(chuncksize, 1)
+            else:
+                raise NotImplementedError(f"Unsupported chunk type: {header}")
+        self.output = output
+        self.demuxed = True
+
+    def _reader(self, chuncksize, offset, padding, header) -> bytearray:
+        """Chunks reader function, reads all data in a chunk and returns a bytearray."""
+        data = bytearray(self.stream.read(chuncksize)[offset:])
+        if (
+            header == USMChunckHeaderType.SFV.value
+            or header == USMChunckHeaderType.ALP.value
+        ):
+            data = self.VideoMask(data) if self.decrypt else data
+        elif header == USMChunckHeaderType.SFA.value:
+            data = self.AudioMask(data) if (self.audio_codec == 2 and self.decrypt) else data
+        if padding:
+            data = data[:-padding]
+        return data
+
+    @property
+    def streams(self):
+        """Yields (stmid (@SFV/@SFA), filename, raw stream data) per stream."""
+        for stream in self.CRIDObj.dictarray[1:]:
+            filename, stmid, chno = stream["filename"][1], stream["stmid"][1], stream["chno"][1]
+            stmid = int.to_bytes(stmid, 4, 'big', signed=False)
+            yield stmid, str(filename), self.output.get(f'{stmid.decode()}_{chno}', None)
+
+    def get_video(self):
+        """Create a video codec from the available streams.
+
+        NOTE: A temporary file may be created with this process to determine the stream information."""
+        stype, sfname, sraw = next(filter(lambda x: x[0] == USMChunckHeaderType.SFV.value, self.streams), (None, None, None))
+        stream = None
+        match self.video_codec:
+            case MPEG1Codec.MPEG_CODEC:
+                stream = MPEG1Codec(sraw)
+            case H264Codec.MPEG_CODEC:
+                stream = H264Codec(sraw)
+            case VP9Codec.MPEG_CODEC:
+                stream = VP9Codec(sraw)
+            case _:
+                raise NotImplementedError(f"Unsupported video codec: {self.video_codec}")
+        stream.filename = sfname
+        return stream
+
+    def get_audios(self) -> List[HCACodec | ADXCodec]:
+        """Create a list of audio codecs from the available streams."""
+        match self.audio_codec:
+            case ADXCodec.AUDIO_CODEC:
+                return [ADXCodec(s[2], s[1]) for s in self.streams if s[0] == USMChunckHeaderType.SFA.value]
+            case HCACodec.AUDIO_CODEC:
+                return [HCACodec(s[2], s[1]) for s in self.streams if s[0] == USMChunckHeaderType.SFA.value]  # HCAs are never encrypted in USM
+            case _:
+                return []
+
+class USMBuilder(USMCrypt):
+    """USM class for building USM files."""
+    video_stream: VP9Codec | H264Codec | MPEG1Codec
-    # TODO Add support for Subtitle information.
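As a quick illustration (hypothetical file names, not part of the patch), extraction with the USM class above chains these pieces together:

usm = USM("movie.usm", key="0123456789ABCDEF")  # key may be omitted for unencrypted files
video = usm.get_video()                          # VP9Codec, H264Codec or MPEG1Codec
video.save("movie.video")                        # raw bitstream exactly as carried by the @SFV chunks
for i, track in enumerate(usm.get_audios()):     # HCACodec or ADXCodec wrappers
    track.save(f"movie_{i:02d}.wav")             # both wrappers write decoded WAV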
-    def build_usm(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = None):
-        header = self.build_header(SFV_list, SFA_chunks, SBT_chunks)
-        len_sfv = len(SFV_list)
-        if self.audio:
-            len_sfa = [len(x) for x in SFA_chunks]
+    enable_audio: bool
+    audio_streams: List[HCACodec | ADXCodec]
+
+    key: int
+    encrypt: bool
+    encrypt_audio: bool
+
+    audio_codec: int
+    # !!: TODO Quality settings
+    def __init__(
+        self,
+        video: str,
+        audio: List[str] | str = None,
+        key = None,
+        audio_codec=HCACodec.AUDIO_CODEC,
+        encrypt_audio: bool = False,
+    ) -> None:
+        """Initialize the USMBuilder from set source files.
+
+        Args:
+            video (str): The path to the video file. The video source format will be used to map accordingly to the ones Sofdec use.
+                - MPEG1 (with MP4 container): MPEG1 Codec (Sofdec Prime)
+                - H264 (with H264 container): H264 Codec
+                - VP9 (with IVF container): VP9 Codec
+            audio (List[str] | str, optional): The path(s) to the audio file(s). Defaults to None.
+            key (str | int, optional): The encryption key. Either int64 or a hex string. Defaults to None.
+            audio_codec (int, optional): The audio codec to use. Defaults to HCACodec.AUDIO_CODEC.
+            encrypt_audio (bool, optional): Whether to encrypt the audio. Defaults to False.
+        """
+        self.audio_codec = audio_codec
+        self.encrypt = False
+        self.enable_audio = False
+        self.encrypt_audio = encrypt_audio
+        self.key = 0
+        if encrypt_audio and not key:
+            raise ValueError("Cannot encrypt Audio without key.")
+        if key:
+            self.init_key(key)
+            self.encrypt = True
+        self.load_video(video)
+        self.audio_streams = []
+        if audio:
+            self.load_audio(audio)
+            self.enable_audio = True
+
+    def load_video(self, video):
+        temp_stream = FFmpegCodec(video)
+        self.video_stream = None
+        match temp_stream.stream["codec_name"]:
+            case "h264":
+                self.video_stream = H264Codec(video)
+            case "vp9":
+                self.video_stream = VP9Codec(video)
+            case "mpeg1video":
+                self.video_stream = MPEG1Codec(video)
+        assert self.video_stream, (
+            "failed to match a suitable video codec. Codec=%s"
+            % temp_stream.stream["codec_name"]
+        )
+
+    def load_audio(self, audio):
+        self.audio_filenames = []
+        if type(audio) == list:
+            count = 0
+            for track in audio:
+                if type(track) == str:
+                    self.audio_filenames.append(os.path.basename(track))
+                else:
+                    self.audio_filenames.append("{:02d}.sfa".format(count))
+                count += 1
+        else:
+            if type(audio) == str:
+                self.audio_filenames.append(os.path.basename(audio))
+            else:
+                self.audio_filenames.append("00.sfa")
+
+        self.audio_streams = []
+        codec = None
+        match self.audio_codec:
+            case HCACodec.AUDIO_CODEC:
+                codec = HCACodec
+            case ADXCodec.AUDIO_CODEC:
+                codec = ADXCodec
+        assert codec, (
+            "failed to match a suitable audio codec for option: %s" % self.audio_codec
+        )
+        if type(audio) == list:
+            for i, track in enumerate(audio):
+                # Reuse the names computed above; recomputing them here reread a stale `count` for in-memory tracks.
+                hcaObj = codec(track, self.audio_filenames[i], key=self.key)
+                self.audio_streams.append(hcaObj)
        else:
+            hcaObj = codec(audio, self.audio_filenames[0], key=self.key)
+            self.audio_streams.append(hcaObj)
-            # SFV gets the order priority if the interval is matching that of SFA
-            # furthermore, SFA chunks keep going until the next SFV interval is reached.
-            #
-            current_interval = 0
-            target_interval = 0
-            sfa_count = 0
-            for i in range(max_len):
-                if i < len_sfv:
-                    header += SFV_list[i]
-                    target_interval += self.SFV_interval_for_VP9
-
-                if self.audio:
-                    while current_interval < target_interval:
-                        idx = 0
-                        for stream in SFA_chunks:
-                            if current_interval > target_interval:
-                                # This would not just break the loop, this would break everything.
-                                # Will not happen in typical cases. But if a video had a really weird framerate, this might skew it.
-                                current_interval += self.base_interval_per_SFA_chunk[0] # Not safe. FIXME
-                                break
-                            if sfa_count == 0:
-                                header += stream[sfa_count]
-                            if sfa_count < len_sfa[idx]-1:
-                                header += stream[sfa_count+1]
-                            idx += 1
-                        else:
-                            current_interval += self.base_interval_per_SFA_chunk[0]
-                            # This is wrong actually, I made the base interval a list in case the intervals are different
-                            # But it seems they are the same no matter what, however I will leave it as this just in case.
-                            sfa_count += 1
+
+    def build(self) -> bytes:
+        SFV_list = self.video_stream.generate_SFV(self)
+        if self.enable_audio:
+            SFA_chunks = [s.generate_SFA(i, self) for i, s in enumerate(self.audio_streams)]
+        else:
+            SFA_chunks = []
+        SBT_chunks = []  # TODO: Subtitles
+        header = self._build_header(SFV_list, SFA_chunks, SBT_chunks)
+        chunks = list(itertools.chain(SFV_list, *SFA_chunks))
+
+        def chunk_key_sort(chunk):
+            (
+                header,
+                chuncksize,
+                unk08,
+                offset,
+                padding,
+                chno,
+                unk0D,
+                unk0E,
+                type,
+                frametime,
+                framerate,
+                unk18,
+                unk1C,
+            ) = USMChunkHeader.unpack(chunk[: USMChunkHeader.size])
+            # header is the raw magic bytes, so compare against the enum's value.
+            prio = 0 if header == USMChunckHeaderType.SFV.value else 1
+            # all stream chunks before section_end chunks, then sort by frametime, with SFV chunks before SFA chunks
+            return (type, frametime, prio)
+
+        chunks.sort(key=chunk_key_sort)
        self.usm = header
-
-    def build_header(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = None) -> bytes:
+        chunks = b''.join(chunks)
+        self.usm += chunks
+        return self.usm
+    def _build_header(
+        self, SFV_list: list, SFA_chunks: list, SBT_chunks: list  # TODO: Not used
+    ) -> bytes:
+        # Main USM file
        CRIUSF_DIR_STREAM = [
            dict(
-                avbps = (UTFTypeValues.uint, -1), # Will be updated later.
-                chno = (UTFTypeValues.ushort, 0xFFFF),
-                datasize = (UTFTypeValues.uint, 0),
-                filename = (UTFTypeValues.string, self.video_filename.rsplit(".", 1)[0]+".usm"),
-                filesize = (UTFTypeValues.uint, -1), # Will be updated later.
-                fmtver = (UTFTypeValues.uint, 16777984),
-                minbuf = (UTFTypeValues.uint, -1), # Will be updated later.
-                minchk = (UTFTypeValues.ushort, 1),
-                stmid = (UTFTypeValues.uint, 0)
+                fmtver=(UTFTypeValues.uint, self.video_stream.VERSION),
+                filename=(
+                    UTFTypeValues.string,
+                    os.path.splitext(os.path.basename(self.video_stream.filename))[0]
+                    + ".usm",
+                ),
+                filesize=(UTFTypeValues.uint, -1),  # Will be updated later.
+                datasize=(UTFTypeValues.uint, 0),
+                stmid=(UTFTypeValues.uint, 0),
+                chno=(UTFTypeValues.ushort, 0xFFFF),
+                minchk=(UTFTypeValues.ushort, 1),
+                minbuf=(UTFTypeValues.uint, -1),  # Will be updated later.
+                avbps=(UTFTypeValues.uint, -1),  # Will be updated later.
) ] - total_avbps = self.avbps - minbuf = 4 + self.minbuf + total_avbps = self.video_stream.avbps + minbuf = 4 + self.video_stream.minbuf - self.ivfObj.stream.seek(0, 2) - v_filesize = self.ivfObj.stream.tell() - self.ivfObj.stream.seek(0) + v_filesize = self.video_stream.filesize video_dict = dict( - avbps = (UTFTypeValues.uint, self.avbps), - chno = (UTFTypeValues.ushort, 0), - datasize = (UTFTypeValues.uint, 0), - filename = (UTFTypeValues.string, self.video_filename), - filesize = (UTFTypeValues.uint, v_filesize), - fmtver = (UTFTypeValues.uint, 16777984), - minbuf = (UTFTypeValues.uint, self.minbuf), - minchk = (UTFTypeValues.ushort, self.minchk), - stmid = (UTFTypeValues.uint, int.from_bytes(USMChunckHeaderType.SFV.value, "big")) + fmtver=(UTFTypeValues.uint, self.video_stream.VERSION), + filename=( + UTFTypeValues.string, + os.path.basename(self.video_stream.filename), + ), + filesize=(UTFTypeValues.uint, v_filesize), + datasize=(UTFTypeValues.uint, 0), + stmid=( + UTFTypeValues.uint, + int.from_bytes(USMChunckHeaderType.SFV.value, "big"), + ), + chno=(UTFTypeValues.ushort, 0), + minchk=(UTFTypeValues.ushort, self.video_stream.minchk), + minbuf=(UTFTypeValues.uint, self.video_stream.minbuf), + avbps=(UTFTypeValues.uint, self.video_stream.avbps), ) CRIUSF_DIR_STREAM.append(video_dict) - if self.audio: + if self.enable_audio: chno = 0 - for stream in self.streams: - sz = 0 - if self.audio_codec == "adx": - stream: ADX - stream.sfaStream.seek(0, 2) - sz = stream.sfaStream.tell() - stream.sfaStream.seek(0) - if stream.filetype == "wav": - chnls = stream.fmtChannelCount - else: - chnls = stream.channelCount - # I am not sure if this only works when there's one audio stream. TODO - avbps = (sz * 8 * chnls) - sz - else: - stream: HCA - stream.hcastream.seek(0, 2) - sz = stream.hcastream.tell() - stream.hcastream.seek(0) - if stream.filetype == "wav": - chnls = stream.fmtChannelCount - else: - chnls = stream.hca['ChannelCount'] - # I don't know how this is derived so I am putting my best guess here. TODO - avbps = int(sz / chnls) + for stream in self.audio_streams: + avbps = stream.avbps total_avbps += avbps minbuf += 27860 audio_dict = dict( - avbps = (UTFTypeValues.uint, avbps), - chno = (UTFTypeValues.ushort, chno), - datasize = (UTFTypeValues.uint, 0), - filename = (UTFTypeValues.string, self.audio_filenames[chno]), - filesize = (UTFTypeValues.uint, sz), - fmtver = (UTFTypeValues.uint, 16777984), - minbuf = (UTFTypeValues.uint, 27860), # minbuf is fixed at that for audio. - minchk = (UTFTypeValues.ushort, 1), - stmid = (UTFTypeValues.uint, int.from_bytes(USMChunckHeaderType.SFA.value, "big")) + fmtver=(UTFTypeValues.uint, 0), + filename=(UTFTypeValues.string, self.audio_filenames[chno]), + filesize=(UTFTypeValues.uint, stream.filesize), + datasize=(UTFTypeValues.uint, 0), + stmid=( + UTFTypeValues.uint, + int.from_bytes(USMChunckHeaderType.SFA.value, "big"), + ), + chno=(UTFTypeValues.ushort, chno), + minchk=(UTFTypeValues.ushort, 1), + minbuf=( + UTFTypeValues.uint, + 27860, + ), # minbuf is fixed at that for audio. + avbps=(UTFTypeValues.uint, avbps), ) CRIUSF_DIR_STREAM.append(audio_dict) chno += 1 - CRIUSF_DIR_STREAM[0]["avbps"] = (UTFTypeValues.uint, total_avbps) - CRIUSF_DIR_STREAM[0]["minbuf"] = (UTFTypeValues.uint, minbuf) # Wrong. TODO Despite being fixed per SFA stream, seems to change internally before summation. 
- - v_framrate = int(round(self.ivfObj.ivf['time_base_denominator'] / self.ivfObj.ivf['time_base_numerator'], 3) * 1000) - VIDEO_HDRINFO = [ - { - 'alpha_type': (UTFTypeValues.uint, 0), - 'color_space': (UTFTypeValues.uint, 0), - 'disp_height': (UTFTypeValues.uint, self.ivfObj.ivf["Height"]), - 'disp_width': (UTFTypeValues.uint, self.ivfObj.ivf["Width"]), - 'framerate_d': (UTFTypeValues.uint, 1000), - 'framerate_n': (UTFTypeValues.uint, v_framrate), - 'height': (UTFTypeValues.uint, self.ivfObj.ivf["Height"]), - 'ixsize': (UTFTypeValues.uint, self.minbuf), - 'mat_height': (UTFTypeValues.uint, self.ivfObj.ivf["Height"]), - 'mat_width': (UTFTypeValues.uint, self.ivfObj.ivf["Width"]), - 'max_picture_size': (UTFTypeValues.uint, 0), - 'metadata_count': (UTFTypeValues.uint, 1), # Could be 0 and ignore metadata? - 'metadata_size': (UTFTypeValues.uint, 224), # Not the actual value, I am just putting default value for one seek info. - 'mpeg_codec': (UTFTypeValues.uchar, 9), - 'mpeg_dcprec': (UTFTypeValues.uchar, 0), - 'picture_type': (UTFTypeValues.uint, 0), - 'pre_padding': (UTFTypeValues.uint, 0), - 'scrn_width': (UTFTypeValues.uint, 0), - 'total_frames': (UTFTypeValues.uint, self.ivfObj.ivf["FrameCount"]), - 'width': (UTFTypeValues.uint, self.ivfObj.ivf["Width"]) - } - ] - v = UTFBuilder(VIDEO_HDRINFO, table_name="VIDEO_HDRINFO") - v.strings = b"\x00" + v.strings - VIDEO_HDRINFO = v.parse() - padding = (0x20 - (len(VIDEO_HDRINFO) % 0x20) if (len(VIDEO_HDRINFO) % 0x20) != 0 else 0) - chk = USMChunkHeader.pack( - USMChunckHeaderType.SFV.value, - len(VIDEO_HDRINFO) + 0x18 + padding, - 0, - 0x18, - padding, - 0, - 0, - 0, - 1, - 0, - 30, - 0, - 0 - ) - chk += VIDEO_HDRINFO.ljust(len(VIDEO_HDRINFO) + padding, b"\x00") - VIDEO_HDRINFO = chk + CRIUSF_DIR_STREAM[0]["avbps"] = (UTFTypeValues.uint, total_avbps) + CRIUSF_DIR_STREAM[0]["minbuf"] = ( + UTFTypeValues.uint, + minbuf, + ) # Wrong. TODO Despite being fixed per SFA stream, seems to change internally before summation. + + def gen_video_hdr_info(metadata_size: int): + hdr = [ + { + "width": (UTFTypeValues.uint, self.video_stream.width), + "height": (UTFTypeValues.uint, self.video_stream.height), + "mat_width": (UTFTypeValues.uint, self.video_stream.width), + "mat_height": (UTFTypeValues.uint, self.video_stream.height), + "disp_width": (UTFTypeValues.uint, self.video_stream.width), + "disp_height": (UTFTypeValues.uint, self.video_stream.height), + "scrn_width": (UTFTypeValues.uint, 0), + "mpeg_dcprec": (UTFTypeValues.uchar, self.video_stream.MPEG_DCPREC), + "mpeg_codec": (UTFTypeValues.uchar, self.video_stream.MPEG_CODEC), + "alpha_type": (UTFTypeValues.uint, 0), + "total_frames": (UTFTypeValues.uint, self.video_stream.frame_count), + "framerate_n": ( + UTFTypeValues.uint, + int(self.video_stream.framerate * 1000), + ), + "framerate_d": (UTFTypeValues.uint, 1000), # Denominator + "metadata_count": ( + UTFTypeValues.uint, + 1, + ), # Could be 0 and ignore metadata? 
+ "metadata_size": ( + UTFTypeValues.uint, + metadata_size, + ), + "ixsize": (UTFTypeValues.uint, self.video_stream.minbuf), + "pre_padding": (UTFTypeValues.uint, 0), + "max_picture_size": (UTFTypeValues.uint, 0), + "color_space": (UTFTypeValues.uint, 0), + "picture_type": (UTFTypeValues.uint, 0), + } + ] + v = UTFBuilder(hdr, table_name="VIDEO_HDRINFO") + v.strings = b"\x00" + v.strings + hdr = v.bytes() + padding = 0x20 - (len(hdr) % 0x20) if (len(hdr) % 0x20) != 0 else 0 + chk = USMChunkHeader.pack( + USMChunckHeaderType.SFV.value, + len(hdr) + 0x18 + padding, + 0, + 0x18, + padding, + 0, + 0, + 0, + 1, + 0, + 30, + 0, + 0, + ) + chk += hdr.ljust(len(hdr) + padding, b"\x00") + return chk audio_metadata = [] - if self.audio: - if self.audio_codec == "hca": - chno = 0 - for stream in self.streams: - payload = [ - dict( - hca_header = (UTFTypeValues.bytes, stream.get_header()) - ) - ] - p = UTFBuilder(payload, table_name="AUDIO_HEADER") - p.strings = b"\x00" + p.strings - metadata = p.parse() - padding = 0x20 - (len(metadata) % 0x20) if len(metadata) % 0x20 != 0 else 0 + audio_headers = [] + if self.enable_audio: + chno = 0 + for stream in self.audio_streams: + metadata = stream.get_metadata() + if not metadata: + break + else: + padding = ( + 0x20 - (len(metadata) % 0x20) + if len(metadata) % 0x20 != 0 + else 0 + ) chk = USMChunkHeader.pack( USMChunckHeaderType.SFA.value, len(metadata) + 0x18 + padding, - 0, + 0, 0x18, padding, chno, @@ -921,47 +1038,32 @@ def build_header(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = No 0, 30, 0, - 0 + 0, ) - chk += metadata - chk.ljust(len(metadata) + padding, b"\x00") + chk += metadata.ljust(len(metadata) + padding, b"\x00") audio_metadata.append(chk) - chno += 1 - + chno += 1 - audio_headers = [] chno = 0 - for stream in self.streams: - if self.audio_codec == "adx": - if stream.filetype == "wav": - chnls = stream.fmtChannelCount - sampling_rate = stream.fmtSamplingRate - total_samples = int(stream.dataSize // stream.fmtSamplingSize) - else: - chnls = stream.channelCount - sampling_rate = stream.SamplingRate - total_samples = stream.SampleCount - else: - chnls = stream.hca["ChannelCount"] - sampling_rate = stream.hca["SampleRate"] - total_samples = stream.hca['FrameCount'] + for stream in self.audio_streams: AUDIO_HDRINFO = [ { - "audio_codec": (UTFTypeValues.uchar, (2 if self.audio_codec == "adx" else 4)), + "audio_codec": (UTFTypeValues.uchar, stream.AUDIO_CODEC), + "sampling_rate": (UTFTypeValues.uint, stream.sampling_rate), + "total_samples": (UTFTypeValues.uint, stream.total_samples), + "num_channels": (UTFTypeValues.uchar, stream.chnls), + "metadata_count": (UTFTypeValues.uint, stream.METADATA_COUNT), + "metadat_size": (UTFTypeValues.uint, len(audio_metadata[chno]) if audio_metadata else 0), "ixsize": (UTFTypeValues.uint, 27860), - "metadata_count": (UTFTypeValues.uint, (0 if self.audio_codec == "adx" else 1)), - "metadat_size": (UTFTypeValues.uint, (0 if self.audio_codec == "adx" else len(audio_metadata[chno]))), - "num_channels": (UTFTypeValues.uchar, chnls), - "sampling_rate": (UTFTypeValues.uint, sampling_rate), - "total_samples": (UTFTypeValues.uint, total_samples) + "ambisonics": (UTFTypeValues.uint, 0) } - ] - if self.audio_codec == "hca": - AUDIO_HDRINFO[0].update({"ambisonics": (UTFTypeValues.uint, 0)}) + ] p = UTFBuilder(AUDIO_HDRINFO, table_name="AUDIO_HDRINFO") p.strings = b"\x00" + p.strings - header = p.parse() - padding = (0x20 - (len(header) % 0x20) if (len(header) % 0x20) != 0 else 0) + header = p.bytes() + padding 
= ( + 0x20 - (len(header) % 0x20) if (len(header) % 0x20) != 0 else 0 + ) chk = USMChunkHeader.pack( USMChunckHeaderType.SFA.value, len(header) + 0x18 + padding, @@ -975,34 +1077,78 @@ def build_header(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = No 0, 30, 0, - 0 + 0, ) chk += header.ljust(len(header) + padding, b"\x00") audio_headers.append(chk) chno += 1 - first_chk_ofs = 0x800 + len(VIDEO_HDRINFO) + 0x20 + 0x40 * len(self.streams) + 192 + (0 if not self.audio else sum([len(x) + 0x40 for x in audio_headers]) + (sum([len(x) + 0x40 for x in audio_metadata]) if self.audio_codec == "hca" else 0)) - VIDEO_SEEKINFO = [ - { - 'num_skip': (UTFTypeValues.short, 0), - 'ofs_byte': (UTFTypeValues.ullong, first_chk_ofs), - 'ofs_frmid': (UTFTypeValues.int, 0), - 'resv': (UTFTypeValues.short, 0) - } + keyframes = [ + (data["pos"], i) + for i, (frame, data, is_keyframe, duration) in enumerate(self.video_stream.frames()) + if is_keyframe ] + def comp_seek_info(first_chk_ofs): + seek = [ + { + "ofs_byte": (UTFTypeValues.ullong, first_chk_ofs + int(pos)), + "ofs_frmid": (UTFTypeValues.int, i), + "num_skip": (UTFTypeValues.short, 0), + "resv": (UTFTypeValues.short, 0), + } + for pos, i in keyframes + ] + seek = UTFBuilder(seek, table_name="VIDEO_SEEKINFO") + seek.strings = b"\x00" + seek.strings + seek = seek.bytes() + padding = 0x20 - len(seek) % 0x20 if len(seek) % 0x20 != 0 else 0 + seekinf = USMChunkHeader.pack( + USMChunckHeaderType.SFV.value, + len(seek) + 0x18 + padding, + 0, + 0x18, + padding, + 0, + 0, + 0, + 3, + 0, + 30, + 0, + 0, + ) + seekinf += seek.ljust(len(seek) + padding, b"\x00") + return seekinf + + len_seek = len(comp_seek_info(0)) + len_audio_headers = sum([len(x) + 0x40 for x in audio_headers]) + len_audio_metadata = sum([len(x) + 0x40 for x in audio_metadata]) + first_chk_ofs = ( + 0x800 # CRID + + 512 # VIDEO_HDRINFO + + len_seek + + 128 # SFV_END * 2 + + len_audio_headers + + len_audio_metadata + ) + VIDEO_SEEKINFO = comp_seek_info(first_chk_ofs) + VIDEO_HDRINFO = gen_video_hdr_info(len(VIDEO_SEEKINFO)) + total_len = sum([len(x) for x in SFV_list]) + first_chk_ofs - if self.audio: + if self.enable_audio: sum_len = 0 for stream in SFA_chunks: for x in stream: sum_len += len(x) total_len += sum_len - + CRIUSF_DIR_STREAM[0]["filesize"] = (UTFTypeValues.uint, total_len) - CRIUSF_DIR_STREAM = UTFBuilder(CRIUSF_DIR_STREAM, table_name="CRIUSF_DIR_STREAM") + CRIUSF_DIR_STREAM = UTFBuilder( + CRIUSF_DIR_STREAM, table_name="CRIUSF_DIR_STREAM" + ) CRIUSF_DIR_STREAM.strings = b"\x00" + CRIUSF_DIR_STREAM.strings - CRIUSF_DIR_STREAM = CRIUSF_DIR_STREAM.parse() + CRIUSF_DIR_STREAM = CRIUSF_DIR_STREAM.bytes() ############################################## # Parsing everything. 
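The padding arithmetic above repeats for every header and stream chunk the builder emits: payloads are zero-padded so each chunk stays 0x20-aligned, with 0x18 as the fixed distance from the size field to the payload. A small illustrative helper (not part of the patch; the name and defaults are ad hoc) makes the pattern explicit:

def pack_usm_chunk(magic: bytes, payload: bytes, chno: int = 0, chunk_type: int = 0,
                   frametime: int = 0, framerate: int = 30) -> bytes:
    # Equivalent to the inline `0x20 - (len(x) % 0x20) if len(x) % 0x20 != 0 else 0`
    padding = (0x20 - len(payload) % 0x20) % 0x20
    chunk = USMChunkHeader.pack(
        magic,                          # e.g. USMChunckHeaderType.SFV.value
        len(payload) + 0x18 + padding,  # size of everything after the first 8 bytes
        0, 0x18, padding, chno, 0, 0,
        chunk_type,                     # 0 = stream data, 1 = header, 2 = section end, 3 = seek info
        frametime, framerate, 0, 0,
    )
    return chunk + payload.ljust(len(payload) + padding, b"\x00")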
@@ -1023,35 +1169,15 @@ def build_header(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = No 0, 30, 0, - 0 + 0, ) - CRID += CRIUSF_DIR_STREAM.ljust(0x800-0x20, b"\x00") + CRID += CRIUSF_DIR_STREAM.ljust(0x800 - 0x20, b"\x00") header += CRID # Header chunks header += VIDEO_HDRINFO - if self.audio: - SFA_END = [] - count = 0 - for chunk in audio_headers: - header += chunk - SFA_chk_END = USMChunkHeader.pack( - USMChunckHeaderType.SFA.value, - 0x38, - 0, - 0x18, - 0x0, - count, - 0x0, - 0x0, - 2, - 0, - 30, - 0, - 0 - ) - SFA_END.append(SFA_chk_END + b"#HEADER END ===============\x00") - count += 1 + if self.enable_audio: + header += b''.join(audio_headers) SFV_END = USMChunkHeader.pack( USMChunckHeaderType.SFV.value, 0x38, @@ -1065,59 +1191,35 @@ def build_header(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = No 0, 30, 0, - 0 + 0, ) SFV_END += b"#HEADER END ===============\x00" - header += SFV_END - if self.audio: - for chk in SFA_END: - header += chk - - VIDEO_SEEKINFO = UTFBuilder(VIDEO_SEEKINFO, table_name="VIDEO_SEEKINFO") - VIDEO_SEEKINFO.strings = b"\x00" + VIDEO_SEEKINFO.strings - VIDEO_SEEKINFO = VIDEO_SEEKINFO.parse() - padding = 0x20 - len(VIDEO_SEEKINFO) % 0x20 if len(VIDEO_SEEKINFO) % 0x20 != 0 else 0 - seekinf = USMChunkHeader.pack( - USMChunckHeaderType.SFV.value, - len(VIDEO_SEEKINFO) + 0x18 + padding, - 0, - 0x18, - padding, - 0, - 0, - 0, - 3, - 0, - 30, - 0, - 0 - ) - seekinf += VIDEO_SEEKINFO.ljust(len(VIDEO_SEEKINFO) + padding, b"\x00") - header += seekinf - if self.audio and self.audio_codec == "hca": - count = 0 - metadata_end = [] - for metadata in audio_metadata: - header += metadata - SFA_chk_END = USMChunkHeader.pack( + SFA_chk_END = b'' # Maybe reused + if self.enable_audio: + SFA_chk_END = b''.join([ + USMChunkHeader.pack( USMChunckHeaderType.SFA.value, 0x38, 0, 0x18, 0x0, - count, + i, 0x0, 0x0, 2, 0, 30, 0, - 0 - ) - metadata_end.append(SFA_chk_END + b"#METADATA END ===============\x00") - count += 1 + 0, + ) + b"#HEADER END ===============\x00" for i in range(len(audio_headers)) + ]) + header += SFA_chk_END # Ends audio_headers + header += VIDEO_SEEKINFO + + if self.enable_audio: + header += b''.join(audio_metadata) SFV_END = USMChunkHeader.pack( USMChunckHeaderType.SFV.value, 0x38, @@ -1131,172 +1233,29 @@ def build_header(self, SFV_list: list, SFA_chunks: list = False, SBT_chunks = No 0, 30, 0, - 0 + 0, ) SFV_END += b"#METADATA END ===============\x00" header += SFV_END - if self.audio and self.audio_codec == "hca": - for chk in metadata_end: - header += chk - - return header - - def prepare_SFV(self): - if self.video_codec == "vp9": - ivfinfo = self.ivfObj.info() - v_framerate = round(ivfinfo["time_base_denominator"] / ivfinfo["time_base_numerator"], 2) - framerate = 2997 - self.SFV_interval_for_VP9 = round(framerate / v_framerate, 1) # Not the actual interval for the VP9 codec, but USM calculate this way. - - def prepare_SFA(self): - """ Generates info needed per SFA stream. """ - self.SFA_chunk_size = [] - self.base_interval_per_SFA_chunk = [] - framerate = 29.97 - # I just noticed that audio framerate is not based off the video input, in fact, it seems to be locked at 29.97 - if self.audio_codec == "adx": - # The Audio chunks must be equal in size? or could it vary per audio stream? - # This is weird since chunksize is dictated by the sampling rate and channel count. 
(blocksize as well) - for adx in self.streams: - adx: ADX - if adx.filetype == "wav": - self.SFA_chunk_size.append(int(adx.fmtSamplingRate // framerate // 32) * (18 * adx.fmtChannelCount)) # 18 is standard ADX blocksize. - else: - self.SFA_chunk_size.append(int(adx.SamplingRate // framerate // 32) * (adx.Blocksize * adx.channelCount)) - self.base_interval_per_SFA_chunk.append(99.9 if self.video_codec == "vp9" else 100) # For VP9, this is the only repeating pattern I found. Anything else has another interval. - else: - # HCA chunks here, which are harder a bit, since the interval per chunks is rather derived - # from the resulting framesize of the HCA as well sample rate. - # However, SFA chunk size is at least given. - for hca in self.streams: - hca: HCA - hca.Pyparse_header() - framesize = hca.hca["FrameSize"] - self.SFA_chunk_size.append(framesize) - self.base_interval_per_SFA_chunk.append(64) # I am not sure about this. - - def init_key(self, key: str): - # Copied from USM class, it's hard to combine them at this point with how the USM class is created for extraction. - if type(key) == str: - if len(key) < 16: - key = key.rjust(16, "0") - self.key == int(key, 16) - key1 = bytes.fromhex(key[8:]) - key2 = bytes.fromhex(key[:8]) - else: - raise ValueError("Inavild input key.") - elif type(key) == int: - self.key = key - key1 = int.to_bytes(key & 0xFFFFFFFF, 4, "big") - key2 = int.to_bytes(key >> 32, 4, "big") - else: - raise ValueError("Invalid key format, must be either a string or an integer.") - t = bytearray(0x20) - t[0x00:0x09] = [ - key1[3], - key1[2], - key1[1], - (key1[0] - 0x34) % 0x100, - (key2[3] + 0xF9) % 0x100, - (key2[2] ^ 0x13) % 0x100, - (key2[1] + 0x61) % 0x100, - (key1[3] ^ 0xFF) % 0x100, - (key1[1] + key1[2]) % 0x100, - ] - t[0x09:0x0C] = [ - (t[0x01] - t[0x07]) % 0x100, - (t[0x02] ^ 0xFF) % 0x100, - (t[0x01] ^ 0xFF) % 0x100, - ] - t[0x0C:0x0E] = [ - (t[0x0B] + t[0x09]) % 0x100, - (t[0x08] - t[0x03]) % 0x100, - ] - t[0x0E:0x10] = [ - (t[0x0D] ^ 0xFF) % 0x100, - (t[0x0A] - t[0x0B]) % 0x100, - ] - t[0x10] = ((t[0x08] - t[0x0F]) % 0x100) - t[0x11:0x17] = [ - (t[0x10] ^ t[0x07]) % 0x100, - (t[0x0F] ^ 0xFF) % 0x100, - (t[0x03] ^ 0x10) % 0x100, - (t[0x04] - 0x32) % 0x100, - (t[0x05] + 0xED) % 0x100, - (t[0x06] ^ 0xF3) % 0x100, - ] - t[0x17:0x1A] = [ - (t[0x13] - t[0x0F]) % 0x100, - (t[0x15] + t[0x07]) % 0x100, - (0x21 - t[0x13]) % 0x100, - ] - t[0x1A:0x1C] = [ - (t[0x14] ^ t[0x17]) % 0x100, - (t[0x16] + t[0x16]) % 0x100, - ] - t[0x1C:0x1F] = [ - (t[0x17] + 0x44) % 0x100, - (t[0x03] + t[0x04]) % 0x100, - (t[0x05] - t[0x16]) % 0x100, - ] - t[0x1F] = (t[0x1D] ^ t[0x13]) % 0x100 - t2=[b'U', b'R', b'U', b'C'] - self.videomask1 = t - self.videomask2 = bytearray(map(lambda x: x ^ 0xFF, t)) - self.audiomask = bytearray(0x20) - for x in range(0x20): - if (x&1) == 1: - self.audiomask[x] = ord(t2[(x>>1)&3]) - else: - self.audiomask[x] = self.videomask2[x] - - # Decrypt SFV chunks or ALP chunks, should only be used if the video data is encrypted. 
- def VideoMask(self, memObj: bytes) -> bytes: - head = memObj[:0x40] - memObj = bytearray(memObj[0x40:]) - size = len(memObj) - # memObj len is a cached property, very fast to lookup - if size <= 0x200: - return (head + memObj) - data_view = memoryview(memObj) - - - # mask 1 - mask = bytearray(self.videomask1) - mask_view = memoryview(mask) - mask_index = 0 - for i in range(0x100): - mask_view[mask_index] ^= data_view[i + 0x100] - data_view[i] ^= mask_view[mask_index] - mask_index = (mask_index + 1) % 32 - - # mask 2 - mask = bytearray(self.videomask2) - mask_view = memoryview(mask) - vmask = self.videomask2 - vmask_view = memoryview(vmask) - - mask_index = 0 - - for i in range(0x100, size): - temp = data_view[i] - data_view[i] ^= mask_view[mask_index] - mask_view[mask_index] = temp ^ vmask_view[mask_index] - mask_index = (mask_index + 1) % 32 - - return bytes(head + memObj) + if audio_metadata: + SFA_chk_END = b''.join([ + USMChunkHeader.pack( + USMChunckHeaderType.SFA.value, + 0x38, + 0, + 0x18, + 0x0, + i, + 0x0, + 0x0, + 2, + 0, + 30, + 0, + 0, + ) + b"#METADATA END ===============\x00" for i in range(len(audio_headers)) + ]) + header += SFA_chk_END # Ends audio_headers - def AudioMask(self, memObj: bytes) -> bytes: - head = memObj[:0x140] - memObj = bytearray(memObj[0x140:]) - size = len(memObj) - data_view = memoryview(memObj) - mask = bytearray(self.audiomask) - mask_view = memoryview(mask) - for i in range(size): - data_view[i] ^= mask_view[i%32] - return bytes(head + memObj) - - def get_usm(self) -> bytes: - return self.usm \ No newline at end of file + return header diff --git a/PyCriCodecs/utf.py b/PyCriCodecs/utf.py index d1662d1..dc2ea45 100644 --- a/PyCriCodecs/utf.py +++ b/PyCriCodecs/utf.py @@ -1,84 +1,136 @@ -from typing import BinaryIO +from typing import BinaryIO, TypeVar, Type, List + +T = TypeVar("T") from io import BytesIO, FileIO from struct import unpack, calcsize, pack from .chunk import * -# FIXME Really awful. Although works. class UTF: - """ Use this class to return a dict containing all @UTF chunk information. """ - __slots__ = ["magic", "table_size", "rows_offset", "string_offset", "data_offset", - "table_name", "num_columns", "row_length", "num_rows", "stream", "table", - "__payload", "encoding"] + + _dictarray: list + magic: bytes table_size: int rows_offset: int string_offset: int data_offset: int - table_name: int num_columns: int row_length: int num_rows: int stream: BinaryIO - table: dict - __payload: list - encoding: str - def __init__(self, stream): + recursive: bool + encoding : str = 'utf-8' + + def __init__(self, stream, recursive=False): + """Unpacks UTF table binary payload + + Args: + stream (Union[str, bytes]): The input stream or file path to read the UTF table from. + recursive (bool): Whether to recursively unpack nested UTF tables. 
+ """ if type(stream) == str: self.stream = FileIO(stream) else: self.stream = BytesIO(stream) - self.magic, self.table_size, self.rows_offset, self.string_offset, self.data_offset, self.table_name, self.num_columns, self.row_length, self.num_rows = UTFChunkHeader.unpack( - self.stream.read(UTFChunkHeader.size) - ) + ( + self.magic, + self.table_size, + self.rows_offset, + self.string_offset, + self.data_offset, + self.table_name, + self.num_columns, + self.row_length, + self.num_rows, + ) = UTFChunkHeader.unpack(self.stream.read(UTFChunkHeader.size)) if self.magic == UTFType.UTF.value: - self.table = self.read_rows_and_columns() + self._read_rows_and_columns() elif self.magic == UTFType.EUTF.value: self.stream.seek(0) data = memoryview(bytearray(self.stream.read())) - m = 0x655f + m = 0x655F t = 0x4115 for i in range(len(data)): - data[i] ^= (0xFF & m) + data[i] ^= 0xFF & m m = (m * t) & 0xFFFFFFFF self.stream = BytesIO(bytearray(data)) - self.magic, self.table_size, self.rows_offset, self.string_offset, self.data_offset, self.table_name, self.num_columns, self.row_length, self.num_rows = UTFChunkHeader.unpack( - self.stream.read(UTFChunkHeader.size) - ) + ( + self.magic, + self.table_size, + self.rows_offset, + self.string_offset, + self.data_offset, + self.table_name, + self.num_columns, + self.row_length, + self.num_rows, + ) = UTFChunkHeader.unpack(self.stream.read(UTFChunkHeader.size)) if self.magic != UTFType.UTF.value: raise Exception("Decryption error.") - self.table = self.read_rows_and_columns() + self._read_rows_and_columns() else: raise ValueError("UTF chunk is not present.") - - def read_rows_and_columns(self) -> dict: - stream = self.stream.read(self.data_offset-0x18) + self.recursive = recursive + if recursive: + def dfs(payload: list[dict]) -> None: + for col in range(len(payload)): + for k, v in payload[col].items(): + typeof, value = v + if typeof == UTFTypeValues.bytes: + # XXX: Recursive UTF tables doesn't seem to get encrypted (e.g. 
CPK, ACB) + # We can pass addition reconstruction flags alongside table names later on, but this is good enough for now + if value.startswith(UTFType.UTF.value) or value.startswith( + UTFType.EUTF.value + ): + table = UTF(value, recursive=False) + payload[col][k] = (table.table_name, table.dictarray) + dfs(table.dictarray) + + dfs(self.dictarray) + + def _read_rows_and_columns(self): + stream = self.stream.read(self.data_offset - 0x18) stream = BytesIO(stream) types = [[], [], [], []] target_data = [] target_constant = [] target_tuple = [] + s_offsets = [] for i in range(self.num_columns): flag = stream.read(1)[0] stflag = flag >> 4 typeflag = flag & 0xF if stflag == 0x1: - target_constant.append(int.from_bytes(stream.read(4), "big")) - types[2].append((">"+self.stringtypes(typeflag), typeflag)) + offset = int.from_bytes(stream.read(4), "big") + s_offsets.append(offset) + target_constant.append(offset) + types[2].append((">" + self._stringtypes(typeflag), typeflag)) elif stflag == 0x3: - target_tuple.append((int.from_bytes(stream.read(4), "big"), unpack(">"+self.stringtypes(typeflag), stream.read(calcsize(self.stringtypes(typeflag)))))) - types[1].append((">"+self.stringtypes(typeflag), typeflag)) + offset = int.from_bytes(stream.read(4), "big") + s_offsets.append(offset) + target_tuple.append( + ( + offset, + unpack( + ">" + self._stringtypes(typeflag), + stream.read(calcsize(self._stringtypes(typeflag))), + ), + ) + ) + types[1].append((">" + self._stringtypes(typeflag), typeflag)) elif stflag == 0x5: - target_data.append(int.from_bytes(stream.read(4), "big")) - types[0].append((">"+self.stringtypes(typeflag), typeflag)) - elif stflag == 0x7: # Exists in old CPK's. + offset = int.from_bytes(stream.read(4), "big") + s_offsets.append(offset) + target_data.append(offset) + types[0].append((">" + self._stringtypes(typeflag), typeflag)) + elif stflag == 0x7: # Exists in old CPK's. # target_tuple.append((int.from_bytes(stream.read(4), "big"), int.from_bytes(stream.read(calcsize(self.stringtypes(typeflag))), "big"))) # types[3].append((">"+self.stringtypes(typeflag), typeflag)) raise NotImplementedError("Unsupported 0x70 storage flag.") else: raise Exception("Unknown storage flag.") - - rows = [] - table = dict() + + rows = [] for j in range(self.num_rows): for i in types[0]: rows.append(unpack(i[0], stream.read(calcsize(i[0])))) @@ -86,85 +138,129 @@ def read_rows_and_columns(self) -> dict: for i in range(4): for j in range(len(types[i])): types[i][j] = (types[i][j][0][1:], types[i][j][1]) - strings = (stream.read()).split(b'\x00') + strings = (stream.read()).split(b"\x00") strings_copy = strings[:] - self.__payload = [] - self.encoding = 'utf-8' + self._dictarray = [] + self.encoding = "utf-8" for i in range(len(strings)): - try: - strings_copy[i] = strings[i].decode("utf-8") - except: - for x in ["shift-jis", "utf-16"]: - try: - strings_copy[i] = strings[i].decode(x) - self.encoding = x - # This looks sketchy, but it will always work since @UTF only supports these 3 encodings. - break - except: - continue - else: - # Probably useless. - raise UnicodeDecodeError(f"String of unknown encoding: {strings[i]}") + try: + strings_copy[i] = strings[i].decode("utf-8") + except: + for x in ["shift-jis", "utf-16"]: + try: + strings_copy[i] = strings[i].decode(x) + self.encoding = x + # This looks sketchy, but it will always work since @UTF only supports these 3 encodings. + break + except: + continue + else: + # Probably useless. 
+ raise UnicodeDecodeError( + f"String of unknown encoding: {strings[i]}" + ) t_t_dict = dict() - self.table_name = strings_copy[self.finder(self.table_name, strings)] + self.table_name = strings_copy[self._finder(self.table_name, strings)] UTFTypeValuesList = list(UTFTypeValues) + s_orders = [strings_copy[self._finder(i, strings)] for i in s_offsets] + + def ensure_order(d: dict) -> dict: + return {k: d[k] for k in s_orders if k in d} + for i in range(len(target_constant)): if types[2][i][1] not in [0xA, 0xB]: - val = self.finder(target_constant[i], strings) - table.setdefault(strings_copy[val], []).append(0) - t_t_dict.update({strings_copy[val]: (UTFTypeValuesList[types[2][i][1]], None)}) + val = self._finder(target_constant[i], strings) + t_t_dict.update( + {strings_copy[val]: (UTFTypeValuesList[types[2][i][1]], None)} + ) elif types[2][i][1] == 0xA: - val = self.finder(target_constant[i], strings) - table.setdefault(strings_copy[val], []).append("") + val = self._finder(target_constant[i], strings) t_t_dict.update({strings_copy[val]: (UTFTypeValues.string, "")}) else: # Most likely useless, since the code doesn seem to reach here. - val = self.finder(target_constant[i], strings) - table.setdefault(strings_copy[val], []).append(b'') - t_t_dict.update({strings_copy[val]: (UTFTypeValues.bytes, b'')}) + val = self._finder(target_constant[i], strings) + t_t_dict.update({strings_copy[val]: (UTFTypeValues.bytes, b"")}) for i in range(len(target_tuple)): - if types[1][i%(len(types[1]))][1] not in [0xA, 0xB]: - table.setdefault(strings_copy[self.finder(target_tuple[i][0], strings)], []).append(target_tuple[i][1]) - t_t_dict.update({strings_copy[self.finder(target_tuple[i][0], strings)]: (UTFTypeValuesList[types[1][i%len(types[1])][1]], target_tuple[i][1][0])}) - elif types[1][i%(len(types[1]))][1] == 0xA: - table.setdefault(strings_copy[self.finder(target_tuple[i][0], strings)], []).append(strings_copy[self.finder(target_tuple[i][1][0], strings)]) - t_t_dict.update({strings_copy[self.finder(target_tuple[i][0], strings)]: (UTFTypeValues.string, strings_copy[self.finder(target_tuple[i][1][0], strings)])}) + if types[1][i % (len(types[1]))][1] not in [0xA, 0xB]: + t_t_dict.update( + { + strings_copy[self._finder(target_tuple[i][0], strings)]: ( + UTFTypeValuesList[types[1][i % len(types[1])][1]], + target_tuple[i][1][0], + ) + } + ) + elif types[1][i % (len(types[1]))][1] == 0xA: + t_t_dict.update( + { + strings_copy[self._finder(target_tuple[i][0], strings)]: ( + UTFTypeValues.string, + strings_copy[self._finder(target_tuple[i][1][0], strings)], + ) + } + ) else: - self.stream.seek(self.data_offset+target_tuple[i][1][0]+0x8, 0) + self.stream.seek(self.data_offset + target_tuple[i][1][0] + 0x8, 0) bin_val = self.stream.read((target_tuple[i][1][1])) - table.setdefault(strings_copy[self.finder(target_tuple[i][0], strings)], []).append(bin_val) - t_t_dict.update({strings_copy[self.finder(target_tuple[i][0], strings)]: (UTFTypeValues.bytes, bin_val)}) + t_t_dict.update( + { + strings_copy[self._finder(target_tuple[i][0], strings)]: ( + UTFTypeValues.bytes, + bin_val, + ) + } + ) temp_dict = dict() if len(rows) == 0: - self.__payload.append(t_t_dict) + self._dictarray.append(ensure_order(t_t_dict)) for i in range(len(rows)): - if types[0][i%(len(types[0]))][1] not in [0xA, 0xB]: - table.setdefault(strings_copy[self.finder(target_data[i%(len(target_data))], strings)], []).append(rows[i][0]) - temp_dict.update({strings_copy[self.finder(target_data[i%(len(target_data))], strings)]: 
(UTFTypeValuesList[types[0][i%(len(types[0]))][1]], rows[i][0])}) - elif types[0][i%(len(types[0]))][1] == 0xA: - table.setdefault(strings_copy[self.finder(target_data[i%(len(target_data))], strings)], []).append(strings_copy[self.finder(rows[i][0], strings)]) - temp_dict.update({strings_copy[self.finder(target_data[i%(len(target_data))], strings)]: (UTFTypeValues.string, strings_copy[self.finder(rows[i][0], strings)])}) + if types[0][i % (len(types[0]))][1] not in [0xA, 0xB]: + temp_dict.update( + { + strings_copy[ + self._finder(target_data[i % (len(target_data))], strings) + ]: ( + UTFTypeValuesList[types[0][i % (len(types[0]))][1]], + rows[i][0], + ) + } + ) + elif types[0][i % (len(types[0]))][1] == 0xA: + temp_dict.update( + { + strings_copy[ + self._finder(target_data[i % (len(target_data))], strings) + ]: ( + UTFTypeValues.string, + strings_copy[self._finder(rows[i][0], strings)], + ) + } + ) else: - self.stream.seek(self.data_offset+rows[i][0]+0x8, 0) + self.stream.seek(self.data_offset + rows[i][0] + 0x8, 0) bin_val = self.stream.read((rows[i][1])) - table.setdefault(strings_copy[self.finder(target_data[i%(len(target_data))], strings)], []).append(bin_val) - temp_dict.update({strings_copy[self.finder(target_data[i%(len(target_data))], strings)]: (UTFTypeValues.bytes, bin_val)}) - if not (i+1)%(len(types[0])): + temp_dict.update( + { + strings_copy[ + self._finder(target_data[i % (len(target_data))], strings) + ]: (UTFTypeValues.bytes, bin_val) + } + ) + if not (i + 1) % (len(types[0])): temp_dict.update(t_t_dict) - self.__payload.append(temp_dict) + self._dictarray.append(ensure_order(temp_dict)) temp_dict = dict() - return table - - def stringtypes(self, type: int) -> str: + + def _stringtypes(self, type: int) -> str: types = "BbHhIiQqfdI" if type != 0xB: return types[type] elif type == 0xB: - return("II") + return "II" else: raise Exception("Unkown data type.") - def finder(self, pointer, strings) -> int: + def _finder(self, pointer, strings) -> int: sum = 0 for i in range(len(strings)): if sum < pointer: @@ -173,25 +269,26 @@ def finder(self, pointer, strings) -> int: return i else: raise Exception("Failed string lookup.") - - def get_payload(self) -> list: - """ Returns list of dictionaries used in the UTF. """ - # I am a noob, but I want to standardize the table output to Donmai WannaCri's payload type. - # Since my table parser has a different approach (an awful one at that), - # (And it's integrated with the other tools in this lib specifically), - # So I can't change it. However this function will return a payload list of Donmai WannaCri's type. - # And this format can be used to build custom @UTF tables in this lib as well. - # As for key strings, according to Donmai, they are always in ASCII encoding - # despite, what seems to me, nothing stopping it for being any of the other 3 encodings, - # since the header allows it. - return self.__payload - -# Revised it a bit. + + @property + def table(self) -> dict: + """Returns a dictionary representation of the UTF table. + + Effectively, this retrieves a transposed version of the dictarray. Whilst discarding + type info. + + This is mostly here for cpk.py compatibility. + """ + keys = self._dictarray[0].keys() + return {key: [d[key][1] for d in self._dictarray] for key in keys} + + @property + def dictarray(self) -> list[dict]: + """Returns a list representation of the UTF table. """ + return self._dictarray + class UTFBuilder: - """ Use this class to build custom UTF tables. 
""" - __slots__ = ["encoding", "dictarray", "keyset", "encrypt", "encoding", "strings", - "table_name", "binary", "table", "rows_data", "stflag", "column_data", - "data_offset"] + encoding: str dictarray: list strings: bytes @@ -202,96 +299,191 @@ class UTFBuilder: rows_data: bytearray column_data: bytearray data_offset: int - - def __init__(self, dictarray: list, encrypt: bool = False, encoding: str = "utf-8", table_name: str = "PyCriCodecs_table") -> None: + + def __init__( + self, + dictarray: list[dict], + encrypt: bool = False, + encoding: str = "utf-8", + table_name: str = "PyCriCodecs_table", + ignore_recursion: bool = False, + ) -> None: + """Packs UTF payload back into their binary form + + Args: + dictarray: A list of dictionaries representing the UTF table. + encrypt: Whether to encrypt the table (default: False). + encoding: The character encoding to use (default: "utf-8"). + table_name: The name of the table (default: "PyCriCodecs_table"). + ignore_recursion: Whether to ignore recursion when packing (default: False). + """ + assert type(dictarray) == list, "dictarray must be a list of dictionaries (see UTF.dictarray)." + + # Preprocess for nested dictarray types + def dfs(payload: list[dict], name: str) -> None: + for dict in range(len(payload)): + for k, v in payload[dict].items(): + typeof_or_name, value = v + if type(value) == list: + assert type(typeof_or_name) == str, "bogus payload data" + payload[dict][k] = ( + UTFTypeValues.bytes, + dfs(value, typeof_or_name), + ) + # ? Could subtables be encrypted at all? + return UTFBuilder( + payload, encoding=encoding, table_name=name, ignore_recursion=True + ).bytes() + + if not ignore_recursion: + dfs(dictarray, table_name) l = set([len(x) for x in dictarray]) if len(l) != 1: raise ValueError("All dictionaries must be equal in length.") matches = [(k, v[0]) for k, v in dictarray[0].items()] for i in range(1, len(dictarray)): if matches != [(k, v[0]) for k, v in dictarray[i].items()]: - raise ValueError("Keys and/or value types are not matching across dictionaries.") + raise ValueError( + "Keys and/or value types are not matching across dictionaries." + ) self.dictarray = dictarray self.encrypt = encrypt self.encoding = encoding self.table_name = table_name - self.binary = b'' - self.get_strings() - - def parse(self) -> bytearray: - """ Returns a @UTF bytearray Table from the provided payload dict. """ - self.get_stflag() - self.column_data = self.write_columns() - self.rows_data = self.write_rows() - header_data = self.write_header() - dataarray = header_data + self.column_data + self.rows_data + self.strings + self.binary - if len(dataarray) % 8 != 0: - dataarray = dataarray[:8] + dataarray[8:].ljust(self.data_offset, b'\x00') # Padding. 
-        if self.encrypt:
-            dataarray = memoryview(dataarray)
-            m = 0x655f
-            t = 0x4115
-            for i in range(len(dataarray)):
-                dataarray[i] ^= (0xFF & m)
-                m = (m * t) & 0xFFFFFFFF
-            dataarray = bytearray(dataarray)
-        return dataarray
-
-    def write_header(self) -> bytearray:
-        self.data_offset = len(self.column_data) + len(self.rows_data) + len(self.strings) + len(self.binary) + 0x18
+        self.binary = b""
+        self._get_strings()
+
+    def _write_header(self) -> bytearray:
+        self.data_offset = (
+            len(self.column_data)
+            + len(self.rows_data)
+            + len(self.strings)
+            + len(self.binary)
+            + 0x18
+        )
         datalen = self.data_offset
         if self.data_offset % 8 != 0:
             self.data_offset = self.data_offset + (8 - self.data_offset % 8)
         if len(self.binary) == 0:
             binary_offset = self.data_offset
         else:
-            binary_offset = datalen-len(self.binary)
+            binary_offset = datalen - len(self.binary)
         header = UTFChunkHeader.pack(
-            b'@UTF', # @UTF
-            self.data_offset, # Chunk size.
-            len(self.column_data)+0x18, # Rows offset.
-            datalen-len(self.strings)-len(self.binary), # String offset.
-            binary_offset, # Binary data offset.
-            0 if self.strings.startswith(bytes(self.table_name, self.encoding)) else self.strings.index(b"\x00" + bytes(self.table_name, self.encoding) + b"\x00") + 1, # Table name pointer.
-            len(self.stflag), # Num columns.
-            sum([calcsize(self.stringtypes(x[1])) for x in self.stflag if x[0] == 0x50]), # Num rows.
-            len(self.dictarray) # Rows length.
-        )
+            b"@UTF",  # @UTF
+            self.data_offset,  # Chunk size.
+            len(self.column_data) + 0x18,  # Rows offset.
+            datalen - len(self.strings) - len(self.binary),  # String offset.
+            binary_offset,  # Binary data offset.
+            (
+                0
+                if self.strings.startswith(bytes(self.table_name, self.encoding))
+                else self.strings.index(
+                    b"\x00" + bytes(self.table_name, self.encoding) + b"\x00"
+                )
+                + 1
+            ),  # Table name pointer.
+            len(self.stflag),  # Num columns.
+            sum(
+                [calcsize(self._stringtypes(x[1])) for x in self.stflag if x[0] == 0x50]
+            ),  # Row length (bytes per row).
+            len(self.dictarray),  # Num rows.
+ ) return bytearray(header) - - def write_rows(self) -> bytearray: + + def _write_rows(self) -> bytearray: rows = bytearray() for dict in self.dictarray: for data in self.stflag: if data[0] == 0x50: if data[1] not in [0xA, 0xB]: - rows += pack(">"+self.stringtypes(data[1]), dict[data[2]][1]) + rows += pack(">" + self._stringtypes(data[1]), dict[data[2]][1]) elif data[1] == 0xA: if bytes(dict[data[2]][1], self.encoding) == b"": - idx = self.strings.index(b'\x00\x00') + 1 - rows += pack(">"+self.stringtypes(data[1]), idx) + idx = self.strings.index(b"\x00\x00") + 1 + rows += pack(">" + self._stringtypes(data[1]), idx) else: - rows += pack(">"+self.stringtypes(data[1]), self.strings.index(b"\x00" + bytes(dict[data[2]][1], self.encoding) + b"\x00") + 1) + rows += pack( + ">" + self._stringtypes(data[1]), + self.strings.index( + b"\x00" + + bytes(dict[data[2]][1], self.encoding) + + b"\x00" + ) + + 1, + ) else: - rows += pack(">"+self.stringtypes(data[1]), self.binary.index(dict[data[2]][1]), len(dict[data[2]][1])) + rows += pack( + ">" + self._stringtypes(data[1]), + self.binary.index(dict[data[2]][1]), + len(dict[data[2]][1]), + ) return rows - def write_columns(self) -> bytearray: + def _write_columns(self) -> bytearray: columns = bytearray() for data in self.stflag: columns += int.to_bytes(data[0] | data[1], 1, "big") if data[0] in [0x10, 0x50]: - columns += int.to_bytes(self.strings.index(b"\x00" + bytes(data[2], self.encoding) + b"\x00") + 1, 4, "big") + columns += int.to_bytes( + self.strings.index( + b"\x00" + bytes(data[2], self.encoding) + b"\x00" + ) + + 1, + 4, + "big", + ) else: if data[1] not in [0xA, 0xB]: - columns += int.to_bytes(self.strings.index(b"\x00" + bytes(data[2], self.encoding) + b"\x00") + 1, 4, "big")+int.to_bytes(data[3], calcsize(self.stringtypes(data[1])), "big") + columns += int.to_bytes( + self.strings.index( + b"\x00" + bytes(data[2], self.encoding) + b"\x00" + ) + + 1, + 4, + "big", + ) + int.to_bytes( + data[3], calcsize(self._stringtypes(data[1])), "big" + ) elif data[1] == 0xA: - columns += int.to_bytes(self.strings.index(b"\x00" + bytes(data[2], self.encoding) + b"\x00") + 1, 4, "big") + (b"\x00\x00\x00\x00" if self.strings.startswith(bytes(data[3], self.encoding) + b"\x00") else (int.to_bytes(self.strings.index(b"\x00" + bytes(data[3], self.encoding) + b"\x00") + 1, 4, "big"))) + columns += int.to_bytes( + self.strings.index( + b"\x00" + bytes(data[2], self.encoding) + b"\x00" + ) + + 1, + 4, + "big", + ) + ( + b"\x00\x00\x00\x00" + if self.strings.startswith( + bytes(data[3], self.encoding) + b"\x00" + ) + else ( + int.to_bytes( + self.strings.index( + b"\x00" + bytes(data[3], self.encoding) + b"\x00" + ) + + 1, + 4, + "big", + ) + ) + ) else: - columns += int.to_bytes(self.strings.index(b"\x00" + bytes(data[2], self.encoding) + b"\x00") + 1, 4, "big")+int.to_bytes(self.binary.index(data[3]), 4, "big")+int.to_bytes(len(data[3]), 4, "big") + columns += ( + int.to_bytes( + self.strings.index( + b"\x00" + bytes(data[2], self.encoding) + b"\x00" + ) + + 1, + 4, + "big", + ) + + int.to_bytes(self.binary.index(data[3]), 4, "big") + + int.to_bytes(len(data[3]), 4, "big") + ) return columns - def get_stflag(self): + def _get_stflag(self): to_match = [(x, y) for x, y in self.dictarray[0].items()] UTFTypeValuesList = list(UTFTypeValues) self.stflag = [] @@ -299,24 +491,39 @@ def get_stflag(self): if len(self.dictarray) != 1: for dict in self.dictarray: if dict[val[0]][1] != val[1][1]: - self.stflag.append((0x50, UTFTypeValuesList.index(val[1][0]), val[0])) + 
self.stflag.append( + (0x50, UTFTypeValuesList.index(val[1][0]), val[0]) + ) break else: if val[1][1] == None: - self.stflag.append((0x10, UTFTypeValuesList.index(val[1][0]), val[0])) + self.stflag.append( + (0x10, UTFTypeValuesList.index(val[1][0]), val[0]) + ) else: - self.stflag.append((0x30, UTFTypeValuesList.index(val[1][0]), val[0], val[1][1])) + self.stflag.append( + ( + 0x30, + UTFTypeValuesList.index(val[1][0]), + val[0], + val[1][1], + ) + ) else: # It seems that when there is only one dictionary, there will be no element of type 0x30 flag # Otherwise all of them would be either 0x30 or 0x10 flags with no length to the rows. if val[1][1] == None or val[1][1] == "": - self.stflag.append((0x10, UTFTypeValuesList.index(val[1][0]), val[0])) + self.stflag.append( + (0x10, UTFTypeValuesList.index(val[1][0]), val[0]) + ) else: - self.stflag.append((0x50, UTFTypeValuesList.index(val[1][0]), val[0])) + self.stflag.append( + (0x50, UTFTypeValuesList.index(val[1][0]), val[0]) + ) - def get_strings(self): + def _get_strings(self): strings = [] - binary = b'' + binary = b"" for dict in self.dictarray: for key, value in dict.items(): @@ -326,7 +533,9 @@ def get_strings(self): for key, value in dict.items(): if type(value[1]) == str and value[1] not in strings: strings.append(value[1]) - if (type(value[1]) == bytearray or type(value[1]) == bytes) and value[1] not in binary: + if (type(value[1]) == bytearray or type(value[1]) == bytes) and value[ + 1 + ] not in binary: binary += value[1] self.binary = binary @@ -338,18 +547,158 @@ def get_strings(self): for i in range(len(strings)): val = strings[i].encode(self.encoding) - if b'\x00' in val: - raise ValueError(f"Encoding of {self.encoding} for '{strings[i]}' results in string with a null byte.") + if b"\x00" in val: + raise ValueError( + f"Encoding of {self.encoding} for '{strings[i]}' results in string with a null byte." + ) else: strings[i] = val - - self.strings = b'\x00'.join(strings) + b"\x00" - - def stringtypes(self, type: int) -> str: + + self.strings = b"\x00".join(strings) + b"\x00" + + def _stringtypes(self, type: int) -> str: types = "BbHhIiQqfdI" if type != 0xB: return types[type] elif type == 0xB: - return("II") + return "II" else: - raise Exception("Unkown data type.") \ No newline at end of file + raise Exception("Unkown data type.") + + def bytes(self) -> bytearray: + """Returns a @UTF bytearray Table from the provided payload dict.""" + self._get_stflag() + self.column_data = self._write_columns() + self.rows_data = self._write_rows() + header_data = self._write_header() + dataarray = ( + header_data + self.column_data + self.rows_data + self.strings + self.binary + ) + if len(dataarray) % 8 != 0: + dataarray = dataarray[:8] + dataarray[8:].ljust( + self.data_offset, b"\x00" + ) # Padding. + if self.encrypt: + dataarray = memoryview(dataarray) + m = 0x655F + t = 0x4115 + for i in range(len(dataarray)): + dataarray[i] ^= 0xFF & m + m = (m * t) & 0xFFFFFFFF + dataarray = bytearray(dataarray) + return dataarray + +class UTFViewer: + _payload: dict + + def __init__(self, payload): + """Construct a non-owning read-write, deletable view of a UTF table dictarray. + Nested classes are supported. + Sorting (using .sort()) is done in-place and affects the original payload. 
+
+        Example:
+        ```python
+        class CueNameTable(UTFViewer):
+            CueName : str
+            CueIndex : int
+        class ACBTable(UTFViewer):
+            CueNameTable : List[CueNameTable]
+            Awb : AWB
+        src = ACB(ACB_sample)
+        view = ACBTable(src.payload)
+        # Referencing items through Python attribute access is allowed
+        name = view.CueNameTable
+        # Lists can be indexed
+        name_str = name[0].CueName
+        # Deleting items from lists is also allowed
+        view.CueNameTable.pop(1)
+        view.CueTable.pop(1)
+        # The changes will be reflected in the original UTF payload
+        ```
+
+        See __new__ for the actual constructor.
+        """
+        assert isinstance(payload, dict), "Payload must be a dictionary."
+        super().__setattr__("_payload", payload)
+
+    def __getattr__(self, item):
+        annotations = super().__getattribute__("__annotations__")
+        # Nested definitions
+        if item in annotations:
+            sub = annotations[item]
+            reduced = getattr(sub, "__args__", [None])[0]
+            reduced = reduced or sub
+            if issubclass(reduced, UTFViewer):
+                typeof_or_name, value = self._payload[item]
+                assert (
+                    type(typeof_or_name) == str and type(value) == list
+                ), "payload is not expanded. parse with UTF(..., recursive=True)"
+                return self._view_as(value, reduced)
+        payload = super().__getattribute__("_payload")
+        if item not in payload:
+            return super().__getattribute__(item)
+        _, value = payload[item]
+        return value
+
+    def __setattr__(self, item, value):
+        payload = super().__getattribute__("_payload")
+        if item not in payload:
+            raise AttributeError(f"{item} not in payload")
+        typeof, _ = payload[item]
+        payload[item] = (typeof, value)
+
+    def __dir__(self):
+        annotations = super().__getattribute__("__annotations__")
+        return list(annotations.keys()) + list(super().__dir__())
+
+    @staticmethod
+    def _view_as(payload: dict, clazz: Type[T]) -> T:
+        if not issubclass(clazz, UTFViewer):
+            raise TypeError("class must be a subclass of UTFViewer")
+        return clazz(payload)
+
+    class ListView(list):
+        _payload : List[dict]
+        def __init__(self, payload: list[T]):
+            self._payload = payload
+            super().__init__([UTFViewer(item) for item in payload])
+
+        def pop(self, index = -1):
+            self._payload.pop(index)
+            return super().pop(index)
+
+        def append(self, o : "UTFViewer"):
+            if len(self):
+                assert type(self[0]) == type(o), "all items in the list must be of the same type."
+            self._payload.append(o._payload)
+            return super().append(o)
+
+        def extend(self, iterable):
+            for item in iterable:
+                self.append(item)
+
+        def insert(self, index, o : "UTFViewer"):
+            if len(self):
+                assert type(self[0]) == type(o), "all items in the list must be of the same type."
+            self._payload.insert(index, o._payload)
+            return super().insert(index, o)
+
+        def clear(self):
+            self._payload.clear()
+            return super().clear()
+
+        def count(self, value):
+            raise NotImplementedError("count is not supported on views")
+
+        def remove(self, value):
+            raise NotImplementedError("remove is not supported on views. use pop(index).")
+
+        def sort(self, key: callable):
+            p = sorted([(self[i], i) for i in range(len(self))], key=lambda x: key(x[0]))
+            self._payload[:] = [self._payload[i] for x, i in p]
+            self[:] = [x for x, i in p]
+
+    def __new__(cls: Type[T], payload: list | dict, **args) -> T | List[T]:
+        if isinstance(payload, list):
+            return UTFViewer.ListView(payload)
+        return super().__new__(cls)
diff --git a/README.md b/README.md
index f736d43..d8e6f5b 100644
--- a/README.md
+++ b/README.md
@@ -1,144 +1,54 @@
 # PyCriCodecs
-Python frontend with a C++ backend for managing Criware formats.
-Although for some tasks, python is used purely. - -## Supporting -I am running this on Python 3.11, although other earlier versions might work - - -So far this lib supports in terms of: - -Extracting: -- ACB/AWB (Incorrect filenames in extraction.) -- USM (Any USM there is) -- CPK (Most CPK's) - -Decoding: -- ADX (All versions) -- HCA (All versions) - -Building: -- CPK (All CPK modes) -- AWB (Anything) -- USM (VP9 support only, ADX or HCA Audio support, multiple audio streams support as well, VP9/ADX only tested and for sure working!) - -Encoding: -- HCA (HCA Version 2.0) -- ADX (All versions, any bitdepth, any blocksize, any HighPass Frequence, All encoding versions) - -## Installation and Usage -To install run -``` -python setup.py install -``` -or alternatively -``` -pip install . -``` - -Note: all libs (except ADX) here are standardized to take either a filename/path or bytes/bytearray, so you can swap both. - -Also, for audio related codecs, the looping input and output is defined in the metadata, the WAV file will not loop, but it will have a "smpl" chunk in the header, same if you want to encode a looping HCA or an ADX, the WAV must have a smpl chunk. - -Otherwise it will loop normally. - -### Usage: - -##### For ADX decoding and encoding: -```python -from PyCriCodecs import * -# Decoding: -adx_data = open("path_to_adx_file.adx", "rb").read() -wavfilebytes = ADX.decode(adx_data) # Decode will return bytes object containing decoded ADX data as a wav file. - -# Encoding: -wav_data = open("path_to_wav_file.wav", "rb").read() -adxbytes = ADX.encode(wav_data) # Returns an ADX file as bytes, check the wiki for more options. -``` -##### For HCA decoding and encoding: -```python -from PyCriCodecs import * -hcaObj = HCA("filename.hca", key=0xCF222F1FE0748978) # You can change the key, or remove it if the HCA is not encrypted. Key can be a hex string. -wavfile = hcaObj.decode() # Gets you the wav file after decoding. - -wavObj = HCA("filename.wav") -hcabytes = wavObj.encode(encrypt=True) # and you will get an HCA file. -# You can provide a key from when initializing, otherwise it will default to the default key, you can also encrypt keyless with keyless=true. -# You can also force disable looping on HCA output by force_not_looping = True. - -wavObj.encrypt() -# or -hcaObj.decrypt() -# Any works, given it can be decrypted or encrypted as an HCA. Would do it. You can also pass a key to ".encrypt()", ".decrypt()" uses the init key. -``` -##### For CPK extraction and building: -```python -from PyCriCodecs import * -# Extraction: -CpkObj = CPK("filename.cpk") -CpkObj.extract() # Will extract files to a dir names "filename" -CpkObj.extract_file() # Extract a file from a given filename (or an ID for CPKMode 0) - -# Building: -CPKBuilder("dirname", "outfile.cpk", CpkMode=1) # CpkMode is important sometimes, get your target mode by extracting a sample table. -# Given a directory, it will take that directory as root, and builds a CPK for the directories and files inside. -# Output would be a cpk file as specified. -``` -##### For USM extraction and Building: --Note that USM building might be a little bit unstable due to bad code, feel free to open any issues if something did went wrong. -```python -from PyCriCodecs import * -# Extraction: -usmObj = USM("filename.cpk") # or bytes, you can add a key by key="KEYINHEXGOESHERE" -usmObj.extract() # extracts all USM contents in the current directory. 
You can add a directory with extract(dirname = "Example")
-
-# You can also demux the USM internally and manage with the output bytes all you want.
-usmObj.demux() # Then you have access to output property.
-usmObj.output # This is a dict containing all chunks in the USM, each key has a value of a list with bytearrays.
-
-usmObj.get_metadata() # Not for the user specifically, but if you want to look at the info inside, this is one way.
-
-# Building:
-# Needs at least a video to be able to build one USM file, you can add audio pretty easily too.
-usmObj = USMBuilder("filename.ivf", "filename.wav", key=0xKEYGOESINHERE, audio_codec="adx", encryptAudio=True) # Basic USM
-# You can add a list of audio paths/filenames as well instead of filenames, and that will be added into audio streams in order.
-usmObj.build() # Due to bad code, this is heavy on performance and will take some seconds based of the input files.
-usmbytes = usmObj.get_usm() # Will return the USM file as bytes.
-```
-##### For ACB or AWB extraction:
-```python
-from PyCriCodecs import *
-# ACB Extraction:
-acbObj = ACB("filename.acb") # It will attempt to open "filename.awb" as well if there are no sub-banks in the ACB.
-acbObj.extract(dirname="dirname", decode=True, key=key) # You can turn off decoding by decode=False.
-# AWB Extraction:
-awbObj = AWB("filename.awb")
-# You can either loop through the audios inside with:
-for file in awbObj.getfiles():
-    file # file bytes.
-    open("file.hca or anything", wb).write(file)
-
-# or you can call the extract function, not advised.
-awbObj.extract()
-```
-
-Check the [Wiki](https://github.com/LittleChungi/PyCriCodecs/wiki/Docs-and-Thoughts) for my thoughts, plans, more options, and some details as well for documentation.
-
-## TODO List
-- Add ACB building.
-- Add correct ACB extraction.
-
-### Currently Known Bugs
-- USM Building is not stable, and currently does not work at all because I changed the ADX api.
+A continuation of @Youjose's work on Criware formats. Features are still in flux and subject to change; when in doubt, refer to the [original repo](https://github.com/Youjose/PyCriCodecs) for more information.
+
+# Installation
+This is not available on PyPI yet. Meanwhile, you can install it manually from source.
+```bash
+pip install -U git+https://github.com/mos9527/PyCriCodecs.git
+```
+
+For USM features, you need `ffmpeg` installed and available in your PATH. See also https://github.com/kkroening/ffmpeg-python?tab=readme-ov-file#installing-ffmpeg
+
+## Features
+If not otherwise mentioned, all features marked with [x] are considered working and have been verified against official tools.
+
+### ACB Cue sheets (also AWB)
+- [x] Editing & Saving (Scripting APIs; helper functions TODO. See examples in [Tests](https://github.com/mos9527/PyCriCodecs/tree/main/Tests))
+### USM Sofdec2 (Encode & Decode)
+#### Audio Stream
+- [x] HCA
+- [x] ADX
+#### Video Stream
+**NOTE**: You will most likely want to tweak these encode settings for your content.
+- [x] Sofdec Prime (MPEG1, from `.mp4` container)
+  - Prepare source file with: `ffmpeg -i <input> -c:v mpeg1video -an <output>.mp4`
+- [x] H264 (from `.h264` raw container)
+  - Prepare source file with: `ffmpeg -i <input> -c:v libx264 -an <output>.h264`
+- [x] VP9 (from `.ivf` container)
+  - Prepare source file with: `ffmpeg -i <input> -c:v libvpx-vp9 -an <output>.ivf`
+### HCA Audio Codec
+- [x] Decoding (up to version 3.0)
+- [x] Encoding (up to version 3.0)
+### ADX Audio Codec
+- [x] Decoding
+- [x] Encoding
+### CPK
+- [x] Unpacking
+- [x] Packing
+
+## Roadmap
+- [ ] ACB Extraction (Massive TODO.
see also https://github.com/mos9527/PyCriCodecs/blob/main/Research/ACBSchema.py)
+- [ ] Interface for encode tasks (CLI then maybe GUI?)
+- [ ] Documentation
+- [ ] C/C++ port + FFI
+## Currently Known Bugs
+- USM seeking does not work, though most games don't use it anyway.
 - Not important, and might not fix: ADX encoding and decoding at higher bitdepths (11-15) adds popping noise.
 - Some CPK's that has the same filename for every file in the entry will overwrite each other.
 - Probably many more I am unaware of, report if you find any.
 
 # Credits
-- [vgmstream](https://github.com/vgmstream/vgmstream) for HCA code.
-- [VGAudio](https://github.com/Thealexbarney/VGAudio) for ADX codec, and HCA encoding, both of which I ported into C++ but modified the ADX lib greatly.
-- [K0lb3](https://github.com/K0lb3) for helping a lot with python and Cpython, as well as helping me writing some of the code.
-- [bnnm](https://github.com/bnnm) for his various contributions on audio formats, helped me a lot with adding ADX and HCA support.
-- [Nyagamon](https://github.com/Nyagamon) for a lot of what he did for criware formats.
-- [donmai](https://github.com/donmai-me) and his [writeup](https://listed.to/@donmai/24921/criware-s-usm-format-part-1) of CriWare's UTF format.
-- 9th for also helping me with some python knowledge.
+- https://github.com/Youjose/PyCriCodecs
+- https://github.com/Mikewando/PyCriCodecs ([PR#1 on USM](https://github.com/mos9527/PyCriCodecs/pull/1))
+- https://github.com/donmai-me/WannaCRI
+- https://github.com/vgmstream/vgmstream
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e391fbe..e7fde4d 100644
--- a/setup.py
+++ b/setup.py
@@ -11,5 +11,8 @@
         [os.path.join("CriCodecs", "CriCodecs.cpp")],
         include_dirs=[os.path.realpath("CriCodecs")],
         extra_compile_args=["-std=c++11", "-O3"]
-    )]
+    )],
+    install_requires=[
+        'ffmpeg-python'
+    ]
 )
\ No newline at end of file
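
Since `setup.py` now declares `ffmpeg-python` as a dependency, the README's source-preparation commands can also be driven from Python instead of the shell. A minimal sketch — the helper name and file paths are illustrative, not library API, and it assumes the `ffmpeg` binary is on PATH:

```python
import ffmpeg  # provided by the ffmpeg-python package

def prepare_vp9_source(src: str, dst: str = "video.ivf") -> str:
    """Transcode any ffmpeg-readable input into the VP9 .ivf stream USMBuilder expects."""
    (
        ffmpeg.input(src)
        .output(dst, vcodec="libvpx-vp9", an=None)  # an=None renders as the bare -an flag
        .overwrite_output()
        .run()
    )
    return dst
```

Swapping `vcodec` for `mpeg1video` or `libx264` (and the output extension accordingly) covers the other two supported video stream types.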
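As a quick orientation to the reworked `utf.py` API: `UTF(..., recursive=True)` expands nested `@UTF` blobs into `(table_name, dictarray)` tuples, and `UTFBuilder`'s recursive preprocessing packs them back into bytes. A minimal round-trip sketch, assuming a plain unencrypted table (`sample.acb` is a hypothetical input file):

```python
from PyCriCodecs import UTF, UTFBuilder

data = open("sample.acb", "rb").read()  # hypothetical input; ACB files are @UTF tables
table = UTF(data, recursive=True)       # nested tables become (name, dictarray) tuples
# Each row in table.dictarray is a dict of column -> (UTFTypeValues, value)
print(table.table_name, len(table.dictarray))
rebuilt = UTFBuilder(table.dictarray, table_name=table.table_name).bytes()
open("rebuilt.acb", "wb").write(rebuilt)
```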
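Both the `@EUTF` decryption in `UTF.__init__` and the `encrypt=True` path in `UTFBuilder.bytes()` apply the same XOR keystream: seed `0x655F`, multiplier `0x4115`, truncated mod 2**32. Since XOR is its own inverse, one helper covers both directions — a standalone sketch of that scheme (the function name is illustrative, not part of the library):

```python
def utf_xor(data: bytes) -> bytes:
    """Apply the @UTF table XOR keystream; applying it twice is a no-op."""
    m, t = 0x655F, 0x4115  # seed and multiplier, as used in utf.py
    out = bytearray(data)
    for i in range(len(out)):
        out[i] ^= m & 0xFF
        m = (m * t) & 0xFFFFFFFF
    return bytes(out)

assert utf_xor(utf_xor(b"@UTF\x00sample")) == b"@UTF\x00sample"
```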