Rework subs

2026-02-13 15:25:50 -08:00 · 2024-12-05 19:49:39 -05:00
parent 74501bebd1
commit 6ad3c6d7ad
3 changed files with 196 additions and 165 deletions
--- a/pythonlib/formats/rebirth/btlsub_to_hdr.py
+++ b/pythonlib/formats/rebirth/btlsub_to_hdr.py
@@ -4,82 +4,12 @@ from typing import TextIO
 import urllib.error
 import urllib.request
 from io import StringIO
-import re
 from dataclasses import dataclass
-import string
-
+from ..srt import TimeStamp, str_to_timestamp
+from .text_util import text_to_cstr, indent_lines

 URL = "https://docs.google.com/spreadsheets/d/1-XwzS7F0SaLlXwv1KS6RcTEYYORH2DDb1bMRy5VM5oo/gviz/tq?tqx=out:csv&sheet=Subs&range=A:M"

-
-NAMES = {
-    "Veigue": 1,
-    "Mao": 2,
-    "Eugene": 3,
-    "Annie": 4,
-    "Tytree": 5,
-    "Hilda": 6,
-    "Claire": 7,
-    "Agarte": 8,
-    "Annie (NPC)": 9,
-    "Leader": 0x1FFF,
-}
-
-COLORS = {
-    "Blue": 1,
-    "Red": 2,
-    "Purple": 3,
-    "Green": 4,
-    "Cyan": 5,
-    "Yellow": 6,
-    "White": 7,
-    "Grey": 8,
-    "Black": 9,
-}
-
-ITALICS = {
-    "/Italic": 0,
-    "Italic": 10,
-}
-
-TAGS = {
-    "nl": 0x1,
-    "cr": 0x2,
-    "var": 0x4,
-    "color": 0x5,
-    "scale": 0x6,
-    "speed": 0x7,
-    "italic": 0x8,
-    "nmb": 0x9,
-    "ptr": 0xA,
-    "name": 0xB,
-    "item": 0xC,
-    "icon": 0xD,
-    "font": 0xE,
-    "voice": 0xF,
-    "unk13": 0x13,
-    "unk14": 0x14,
-    "unk15": 0x15,
-    "unk16": 0x16,
-    "unk17": 0x17,
-    "unk18": 0x18,
-    "unk19": 0x19,
-    "unk1A": 0x1A,
-}
-
-FRIENDLY_TAGS = dict()
-FRIENDLY_TAGS.update(NAMES)
-FRIENDLY_TAGS.update(COLORS)
-FRIENDLY_TAGS.update(ITALICS)
-
-INDENT_CHAR = "    "
-
-COMMON_TAG = r"(<[\w/]+:?\w+>)"
-HEX_TAG = r"(\{[0-9A-F]{2}\})"
-PRINTABLE_CHARS = "".join(
-    (string.digits, string.ascii_letters, string.punctuation, " ")
-)
-
 SUBTITLE_TYPES = [
    "TYPE_NORMAL",
    "TYPE_BOTTOM",
@@ -87,14 +17,6 @@ SUBTITLE_TYPES = [
 ]


-@dataclass
-class TimeStamp:
-    hours: int
-    minutes: int
-    seconds: int
-    milis: int
-
-
@dataclass
 class SubEntry:
    bd_file: int
@@ -107,91 +29,6 @@ class SubEntry:
    text: str


-# TODO: Perhaps move this
-rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)"
-rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)"
-
-
-def str_to_timestamp(time_stamp: str) -> TimeStamp:
-    colon_cnt = time_stamp.count(":")
-    if colon_cnt == 2:
-        times = [int(s) for s in re.findall(rTIME_STAMP, time_stamp)[0]]
-        hours = times[0]
-        minutes = times[1]
-        seconds = times[2]
-        milis = times[3]
-    if colon_cnt == 1:
-        times = [int(s) for s in re.findall(rTIME_STAMP_NO_HOUR, time_stamp)[0]]
-        hours = 0
-        minutes = times[0]
-        seconds = times[1]
-        milis = times[2]
-    if colon_cnt == 0 or time_stamp.isspace():
-        hours = 0
-        minutes = 0
-        seconds = 0
-        milis = 0
-
-    return TimeStamp(hours, minutes, seconds, milis)
-
-
-def indent_lines(lines: list[str], level: int) -> list[str]:
-    new_lines = list()
-    for line in lines:
-        new_lines.append(f"{INDENT_CHAR * level}{line}")
-    return new_lines
-
-
-def indent_line(line: str, level: int) -> str:
-    return f"{INDENT_CHAR * level}{line}"
-
-
-def text_to_cstr(text: str, is_name: bool = False) -> str:
-    output = ""
-    multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)"
-    tokens = [sh for sh in re.split(multi_regex, text) if sh]
-
-    for token in tokens:
-        # Hex literals
-        if re.match(HEX_TAG, token):
-            output += f" \\x{int(token[1:3], 16):02X}"
-
-        # Tags
-        elif re.match(COMMON_TAG, token):
-            tag, param, *_ = token[1:-1].split(":") + [None]
-
-            # (In)Sanity check
-            if "unk" in tag.lower():
-                raise ValueError(
-                    f"Don't use sce tags, makes no sense!\nProblem text -> {text}"
-                )
-
-            if param is not None:
-                param_bytes = int(param, 16).to_bytes(4, byteorder="little")
-                raw = ",".join([f"{b:02X}" for b in param_bytes])
-                output += f" {tag.upper()}({raw}) "
-            else:
-                if is_name and tag in NAMES:
-                    ntag = tag.replace("(", "").replace(")", "")
-                    output += f" NAME({ntag.upper()}) "
-                elif tag in TAGS:
-                    output += f'"\\x{TAGS[tag]:02X}"'
-                elif tag in FRIENDLY_TAGS:
-                    if tag == "/Italic":
-                        output += " NO_ITALIC "
-                    else:
-                        output += f" {tag.upper()} "
-        elif token == "\n":
-            output += " NL "
-        else:
-            if is_name and token != "&":
-                output += f'NAME("{token}")'
-            else: 
-                output += f'"{token}"'
-
-    return output.strip()
-
-
 def row_to_subentry(row: dict) -> SubEntry:
    type = int(row["type"])
    priority = int(row["priority"])
--- a/pythonlib/formats/rebirth/text_util.py
+++ b/pythonlib/formats/rebirth/text_util.py
@@ -0,0 +1,133 @@
+import re
+import string
+
+NAMES = {  # tag names text_to_cstr renders as NAME(...) when is_name is set
+    "Veigue": 1,  # NOTE(review): the numeric values are not read in this module — confirm their meaning
+    "Mao": 2,
+    "Eugene": 3,
+    "Annie": 4,
+    "Tytree": 5,
+    "Hilda": 6,
+    "Claire": 7,
+    "Agarte": 8,
+    "Annie (NPC)": 9,  # NOTE(review): COMMON_TAG only admits \w and "/", so this key cannot match a <tag>
+    "Leader": 0x1FFF,
+}
+
+COLORS = {  # tag names text_to_cstr emits upper-cased as bare identifiers
+    "Blue": 1,  # NOTE(review): values are not read in this module — confirm their meaning
+    "Red": 2,
+    "Purple": 3,
+    "Green": 4,
+    "Cyan": 5,
+    "Yellow": 6,
+    "White": 7,
+    "Grey": 8,
+    "Black": 9,
+}
+
+ITALICS = {  # only the keys are consulted by text_to_cstr
+    "/Italic": 0,  # emitted as NO_ITALIC
+    "Italic": 10,  # emitted as ITALIC
+}
+
+TAGS = {  # tag name -> byte value; a param-less <tag> is emitted as the quoted escape "\xNN"
+    "nl": 0x1,
+    "cr": 0x2,
+    "var": 0x4,
+    "color": 0x5,
+    "scale": 0x6,
+    "speed": 0x7,
+    "italic": 0x8,  # lower-case key: distinct from the "Italic" entry in ITALICS
+    "nmb": 0x9,
+    "ptr": 0xA,
+    "name": 0xB,
+    "item": 0xC,
+    "icon": 0xD,
+    "font": 0xE,
+    "voice": 0xF,
+    "unk13": 0x13,  # "unk*" keys are unreachable via text_to_cstr, which raises on such tags
+    "unk14": 0x14,
+    "unk15": 0x15,
+    "unk16": 0x16,
+    "unk17": 0x17,
+    "unk18": 0x18,
+    "unk19": 0x19,
+    "unk1A": 0x1A,
+}
+
+# Union of the name, colour and italic tables: the tag names text_to_cstr
+# turns into bare identifiers instead of byte escapes. On a duplicate key
+# the later table wins.
+FRIENDLY_TAGS = {**NAMES, **COLORS, **ITALICS}
+
+INDENT_CHAR = "    "  # one indentation unit (four spaces)
+
+COMMON_TAG = r"(<[\w/]+:?\w+>)"  # <tag> or <tag:param>; captured so re.split keeps it
+HEX_TAG = r"(\{[0-9A-F]{2}\})"  # {XX} with exactly two upper-case hex digits
+# Digits, ASCII letters, punctuation and the space character, in that order.
+PRINTABLE_CHARS = string.digits + string.ascii_letters
+PRINTABLE_CHARS += string.punctuation + " "
+
+
+def indent_lines(lines: list[str], level: int) -> list[str]:
+    """Return a new list with every entry of `lines` indented.
+
+    Each entry is prefixed with `level` repetitions of INDENT_CHAR."""
+    return [f"{INDENT_CHAR * level}{entry}" for entry in lines]
+
+
+def indent_line(line: str, level: int) -> str:  # single-string counterpart of indent_lines
+    return f"{INDENT_CHAR * level}{line}"  # `level` copies of INDENT_CHAR, then `line`
+
+
+def text_to_cstr(text: str, is_name: bool = False) -> str:
+    """Turn `text` into quoted string fragments and bare identifiers.
+
+    With `is_name` set, speaker tags and plain text are wrapped in NAME(...).
+    Raises ValueError for any tag whose name contains "unk".
+    """
+    output = ""
+    multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)" + r"|(ー)"
+    tokens = [sh for sh in re.split(multi_regex, text) if sh]  # `if sh` drops the None / "" pieces re.split yields
+    for token in tokens:
+        # Hex literals ({XX}). NOTE(review): emitted outside quotes, unlike the TAGS branch below — confirm intended
+        if re.match(HEX_TAG, token):
+            output += f" \\x{int(token[1:3], 16):02X}"
+        # Tags: <tag> or <tag:param>
+        elif re.match(COMMON_TAG, token):
+            tag, param, *_ = token[1:-1].split(":") + [None]  # param stays None when the tag has no ":"
+            # (In)Sanity check: any tag name containing "unk" is rejected
+            if "unk" in tag.lower():
+                raise ValueError(
+                    f"Don't use sce tags, makes no sense!\nProblem text -> {text}"
+                )
+            if param is not None:
+                param_bytes = int(param, 16).to_bytes(4, byteorder="little")  # param is read as hex
+                raw = ",".join([f"{b:02X}" for b in param_bytes])
+                output += f" {tag.upper()}({raw}) "
+            else:  # NOTE(review): a param-less tag found in none of the tables adds nothing to output
+                if is_name and tag in NAMES:
+                    ntag = tag.replace("(", "").replace(")", "")  # NOTE(review): COMMON_TAG cannot match "(" / ")", so this looks like a no-op
+                    output += f" NAME({ntag.upper()}) "
+                elif tag in TAGS:
+                    output += f'"\\x{TAGS[tag]:02X}"'
+                elif tag in FRIENDLY_TAGS:
+                    if tag == "/Italic":
+                        output += " NO_ITALIC "
+                    else:
+                        output += f" {tag.upper()} "
+        elif token == "\n":
+            output += " NL\n"
+        elif token == "ー":
+            output += " EM_DASH "
+        else:
+            if is_name:
+                if token == "&":  # a lone "&" token, e.g. between two speaker tags
+                    output += '" and "'
+                else:
+                    names = [f'NAME("{x}")' for x in token.split("&")]  # pieces are not stripped of whitespace
+                    output += '" and "'.join(names)
+            else:
+                output += f'"{token}"'  # NOTE(review): `"` and `\` inside token are not escaped
+    return output.strip()
--- a/pythonlib/formats/srt.py
+++ b/pythonlib/formats/srt.py
@@ -0,0 +1,61 @@
+from itertools import groupby
+from pathlib import Path
+import re
+from dataclasses import dataclass
+
+@dataclass
+class TimeStamp:  # plain value holder returned by str_to_timestamp
+    hours: int  # 0 when the parsed text has no hour field
+    minutes: int
+    seconds: int
+    milis: int  # digits after "." or "," as an int, unscaled: "1.5" gives 5
+
+@dataclass
+class SrtSub:  # one entry of an .srt file, as built by get_subs
+    number: int  # first line of the entry, converted with int()
+    start: TimeStamp  # left side of the " --> " timing line
+    end: TimeStamp  # right side of the " --> " timing line
+    content: str  # remaining lines, each stripped, joined with "\n"
+
+rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)"  # H:M:S, then "." or ",", then fraction digits
+rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)"  # M:S, then "." or ",", then fraction digits
+
+def str_to_timestamp(time_stamp: str) -> TimeStamp:
+    """Parse "H:M:S,f" or "M:S,f" (fraction after "." or ",") into a TimeStamp.
+
+    Text without any ":" (such as an empty string) gives an all-zero TimeStamp.
+    The fraction digits are stored unscaled in `milis`: "1.5" gives 5, not 500.
+
+    Raises ValueError for more than two ":" or when the fields cannot be
+    found, instead of failing with UnboundLocalError / IndexError.
+    """
+    colon_cnt = time_stamp.count(":")
+    if colon_cnt == 0:
+        return TimeStamp(0, 0, 0, 0)  # also covers whitespace-only input
+    pattern = {1: rTIME_STAMP_NO_HOUR, 2: rTIME_STAMP}.get(colon_cnt)
+    # search() takes the first match anywhere in the text, like findall()[0].
+    match = re.search(pattern, time_stamp) if pattern else None
+    if match is None:
+        raise ValueError(f"Invalid time stamp: {time_stamp!r}")
+    fields = [int(s) for s in match.groups()]
+    if colon_cnt == 1:
+        fields.insert(0, 0)  # "M:S,f" carries no hour field
+    return TimeStamp(*fields)
+
+def get_subs(filename: Path) -> list[SrtSub]:
+    """Parse the SRT file `filename` into SrtSub entries, in file order.
+
+    Raises ValueError if an entry has under three lines or a bad timing line.
+    """
+    # Chunking from https://stackoverflow.com/a/23620587: blank lines delimit entries.
+    with open(filename, encoding="utf-8-sig") as f:
+        chunks = [list(g) for b, g in groupby(f, lambda x: bool(x.strip())) if b]
+    subs = []
+    for chunk in chunks:
+        lines = [x.strip() for x in chunk]
+        # Raise rather than assert: assert statements are skipped under `python -O`.
+        if len(lines) < 3 or lines[1].count(" --> ") != 1:
+            raise ValueError("Invalid subtitle entry in file: %s" % filename)
+        start, end = (str_to_timestamp(t) for t in lines[1].split(" --> "))
+        subs.append(SrtSub(int(lines[0]), start, end, "\n".join(lines[2:])))
+    return subs