diff --git a/pythonlib/formats/rebirth/btlsub_to_hdr.py b/pythonlib/formats/rebirth/btlsub_to_hdr.py index 3b21ce4..5f6fe91 100644 --- a/pythonlib/formats/rebirth/btlsub_to_hdr.py +++ b/pythonlib/formats/rebirth/btlsub_to_hdr.py @@ -4,82 +4,12 @@ from typing import TextIO import urllib.error import urllib.request from io import StringIO -import re from dataclasses import dataclass -import string - +from ..srt import TimeStamp, str_to_timestamp +from .text_util import text_to_cstr, indent_lines URL = "https://docs.google.com/spreadsheets/d/1-XwzS7F0SaLlXwv1KS6RcTEYYORH2DDb1bMRy5VM5oo/gviz/tq?tqx=out:csv&sheet=Subs&range=A:M" - -NAMES = { - "Veigue": 1, - "Mao": 2, - "Eugene": 3, - "Annie": 4, - "Tytree": 5, - "Hilda": 6, - "Claire": 7, - "Agarte": 8, - "Annie (NPC)": 9, - "Leader": 0x1FFF, -} - -COLORS = { - "Blue": 1, - "Red": 2, - "Purple": 3, - "Green": 4, - "Cyan": 5, - "Yellow": 6, - "White": 7, - "Grey": 8, - "Black": 9, -} - -ITALICS = { - "/Italic": 0, - "Italic": 10, -} - -TAGS = { - "nl": 0x1, - "cr": 0x2, - "var": 0x4, - "color": 0x5, - "scale": 0x6, - "speed": 0x7, - "italic": 0x8, - "nmb": 0x9, - "ptr": 0xA, - "name": 0xB, - "item": 0xC, - "icon": 0xD, - "font": 0xE, - "voice": 0xF, - "unk13": 0x13, - "unk14": 0x14, - "unk15": 0x15, - "unk16": 0x16, - "unk17": 0x17, - "unk18": 0x18, - "unk19": 0x19, - "unk1A": 0x1A, -} - -FRIENDLY_TAGS = dict() -FRIENDLY_TAGS.update(NAMES) -FRIENDLY_TAGS.update(COLORS) -FRIENDLY_TAGS.update(ITALICS) - -INDENT_CHAR = " " - -COMMON_TAG = r"(<[\w/]+:?\w+>)" -HEX_TAG = r"(\{[0-9A-F]{2}\})" -PRINTABLE_CHARS = "".join( - (string.digits, string.ascii_letters, string.punctuation, " ") -) - SUBTITLE_TYPES = [ "TYPE_NORMAL", "TYPE_BOTTOM", @@ -87,14 +17,6 @@ SUBTITLE_TYPES = [ ] -@dataclass -class TimeStamp: - hours: int - minutes: int - seconds: int - milis: int - - @dataclass class SubEntry: bd_file: int @@ -107,91 +29,6 @@ class SubEntry: text: str -# TODO: Perhaps move this -rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)" -rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)" - - -def str_to_timestamp(time_stamp: str) -> TimeStamp: - colon_cnt = time_stamp.count(":") - if colon_cnt == 2: - times = [int(s) for s in re.findall(rTIME_STAMP, time_stamp)[0]] - hours = times[0] - minutes = times[1] - seconds = times[2] - milis = times[3] - if colon_cnt == 1: - times = [int(s) for s in re.findall(rTIME_STAMP_NO_HOUR, time_stamp)[0]] - hours = 0 - minutes = times[0] - seconds = times[1] - milis = times[2] - if colon_cnt == 0 or time_stamp.isspace(): - hours = 0 - minutes = 0 - seconds = 0 - milis = 0 - - return TimeStamp(hours, minutes, seconds, milis) - - -def indent_lines(lines: list[str], level: int) -> list[str]: - new_lines = list() - for line in lines: - new_lines.append(f"{INDENT_CHAR * level}{line}") - return new_lines - - -def indent_line(line: str, level: int) -> str: - return f"{INDENT_CHAR * level}{line}" - - -def text_to_cstr(text: str, is_name: bool = False) -> str: - output = "" - multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)" - tokens = [sh for sh in re.split(multi_regex, text) if sh] - - for token in tokens: - # Hex literals - if re.match(HEX_TAG, token): - output += f" \\x{int(token[1:3], 16):02X}" - - # Tags - elif re.match(COMMON_TAG, token): - tag, param, *_ = token[1:-1].split(":") + [None] - - # (In)Sanity check - if "unk" in tag.lower(): - raise ValueError( - f"Don't use sce tags, makes no sense!\nProblem text -> {text}" - ) - - if param is not None: - param_bytes = int(param, 16).to_bytes(4, byteorder="little") - raw = ",".join([f"{b:02X}" for b in param_bytes]) - output += f" {tag.upper()}({raw}) " - else: - if is_name and tag in NAMES: - ntag = tag.replace("(", "").replace(")", "") - output += f" NAME({ntag.upper()}) " - elif tag in TAGS: - output += f'"\\x{TAGS[tag]:02X}"' - elif tag in FRIENDLY_TAGS: - if tag == "/Italic": - output += " NO_ITALIC " - else: - output += f" {tag.upper()} " - elif token == "\n": - output += " NL " - else: - if is_name and token != "&": - output += f'NAME("{token}")' - else: - output += f'"{token}"' - - return output.strip() - - def row_to_subentry(row: dict) -> SubEntry: type = int(row["type"]) priority = int(row["priority"]) diff --git a/pythonlib/formats/rebirth/text_util.py b/pythonlib/formats/rebirth/text_util.py new file mode 100644 index 0000000..641a077 --- /dev/null +++ b/pythonlib/formats/rebirth/text_util.py @@ -0,0 +1,133 @@ +import re +import string + +NAMES = { + "Veigue": 1, + "Mao": 2, + "Eugene": 3, + "Annie": 4, + "Tytree": 5, + "Hilda": 6, + "Claire": 7, + "Agarte": 8, + "Annie (NPC)": 9, + "Leader": 0x1FFF, +} + +COLORS = { + "Blue": 1, + "Red": 2, + "Purple": 3, + "Green": 4, + "Cyan": 5, + "Yellow": 6, + "White": 7, + "Grey": 8, + "Black": 9, +} + +ITALICS = { + "/Italic": 0, + "Italic": 10, +} + +TAGS = { + "nl": 0x1, + "cr": 0x2, + "var": 0x4, + "color": 0x5, + "scale": 0x6, + "speed": 0x7, + "italic": 0x8, + "nmb": 0x9, + "ptr": 0xA, + "name": 0xB, + "item": 0xC, + "icon": 0xD, + "font": 0xE, + "voice": 0xF, + "unk13": 0x13, + "unk14": 0x14, + "unk15": 0x15, + "unk16": 0x16, + "unk17": 0x17, + "unk18": 0x18, + "unk19": 0x19, + "unk1A": 0x1A, +} + +FRIENDLY_TAGS = dict() +FRIENDLY_TAGS.update(NAMES) +FRIENDLY_TAGS.update(COLORS) +FRIENDLY_TAGS.update(ITALICS) + +INDENT_CHAR = " " + +COMMON_TAG = r"(<[\w/]+:?\w+>)" +HEX_TAG = r"(\{[0-9A-F]{2}\})" +PRINTABLE_CHARS = "".join( + (string.digits, string.ascii_letters, string.punctuation, " ") +) + + +def indent_lines(lines: list[str], level: int) -> list[str]: + new_lines = list() + for line in lines: + new_lines.append(f"{INDENT_CHAR * level}{line}") + return new_lines + + +def indent_line(line: str, level: int) -> str: + return f"{INDENT_CHAR * level}{line}" + + +def text_to_cstr(text: str, is_name: bool = False) -> str: + output = "" + multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)" + r"|(ー)" + tokens = [sh for sh in re.split(multi_regex, text) if sh] + + for token in tokens: + # Hex literals + if re.match(HEX_TAG, token): + output += f" \\x{int(token[1:3], 16):02X}" + + # Tags + elif re.match(COMMON_TAG, token): + tag, param, *_ = token[1:-1].split(":") + [None] + + # (In)Sanity check + if "unk" in tag.lower(): + raise ValueError( + f"Don't use sce tags, makes no sense!\nProblem text -> {text}" + ) + + if param is not None: + param_bytes = int(param, 16).to_bytes(4, byteorder="little") + raw = ",".join([f"{b:02X}" for b in param_bytes]) + output += f" {tag.upper()}({raw}) " + else: + if is_name and tag in NAMES: + ntag = tag.replace("(", "").replace(")", "") + output += f" NAME({ntag.upper()}) " + elif tag in TAGS: + output += f'"\\x{TAGS[tag]:02X}"' + elif tag in FRIENDLY_TAGS: + if tag == "/Italic": + output += " NO_ITALIC " + else: + output += f" {tag.upper()} " + elif token == "\n": + output += " NL\n" + elif token == "ー": + output += " EM_DASH " + else: + if is_name: + if token == "&": + output += '" and "' + else: + names = [f'NAME("{x}")' for x in token.split("&")] + output += '" and "'.join(names) + else: + output += f'"{token}"' + + return output.strip() diff --git a/pythonlib/formats/srt.py b/pythonlib/formats/srt.py new file mode 100644 index 0000000..2e7c642 --- /dev/null +++ b/pythonlib/formats/srt.py @@ -0,0 +1,61 @@ +from itertools import groupby +from pathlib import Path +import re +from dataclasses import dataclass + +@dataclass +class TimeStamp: + hours: int + minutes: int + seconds: int + milis: int + +@dataclass +class SrtSub: + number: int + start: TimeStamp + end: TimeStamp + content: str + +rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)" +rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)" + +def str_to_timestamp(time_stamp: str) -> TimeStamp: + colon_cnt = time_stamp.count(":") + if colon_cnt == 2: + times = [int(s) for s in re.findall(rTIME_STAMP, time_stamp)[0]] + hours = times[0] + minutes = times[1] + seconds = times[2] + milis = times[3] + if colon_cnt == 1: + times = [int(s) for s in re.findall(rTIME_STAMP_NO_HOUR, time_stamp)[0]] + hours = 0 + minutes = times[0] + seconds = times[1] + milis = times[2] + if colon_cnt == 0 or time_stamp.isspace(): + hours = 0 + minutes = 0 + seconds = 0 + milis = 0 + + return TimeStamp(hours, minutes, seconds, milis) + +def get_subs(filename: Path) -> list[SrtSub]: + # simple srt parser from: https://stackoverflow.com/a/23620587 + # "chunk" our input file, delimited by blank lines + with open(filename, encoding="utf-8-sig") as f: + res = [list(g) for b, g in groupby(f, lambda x: bool(x.strip())) if b] + + subs = list() + + for sub in res: + assert len(sub) >= 3, "Invalid subtitle entry in file: %s" % filename + sub = [x.strip() for x in sub] + number = sub[0] + start, end = [str_to_timestamp(t) for t in sub[1].split(" --> ")] + content = "\n".join(sub[2:]) + subs.append(SrtSub(int(number), start, end, content)) + + return subs