Rework subs

This commit is contained in:
Mc-muffin
2024-12-05 19:49:39 -05:00
parent 74501bebd1
commit 6ad3c6d7ad
3 changed files with 196 additions and 165 deletions

View File

@@ -4,82 +4,12 @@ from typing import TextIO
import urllib.error
import urllib.request
from io import StringIO
import re
from dataclasses import dataclass
import string
from ..srt import TimeStamp, str_to_timestamp
from .text_util import text_to_cstr, indent_lines
# Google Sheets "gviz" query URL; its parameters request the "Subs" sheet,
# range A:M, as CSV output.
URL = "https://docs.google.com/spreadsheets/d/1-XwzS7F0SaLlXwv1KS6RcTEYYORH2DDb1bMRy5VM5oo/gviz/tq?tqx=out:csv&sheet=Subs&range=A:M"

# Name tags. The nine listed names are numbered 1..9 in listing order;
# "Leader" carries its own value.
NAMES = {
    name: value
    for value, name in enumerate(
        (
            "Veigue",
            "Mao",
            "Eugene",
            "Annie",
            "Tytree",
            "Hilda",
            "Claire",
            "Agarte",
            "Annie (NPC)",
        ),
        start=1,
    )
}
NAMES["Leader"] = 0x1FFF

# Color tags, numbered 1..9 in listing order.
COLORS = {
    color: value
    for value, color in enumerate(
        (
            "Blue",
            "Red",
            "Purple",
            "Green",
            "Cyan",
            "Yellow",
            "White",
            "Grey",
            "Black",
        ),
        start=1,
    )
}

# Italic on/off tags.
ITALICS = {"/Italic": 0, "Italic": 10}

# Lower-case tag names mapped to their one-byte codes.
TAGS = {
    "nl": 0x01,
    "cr": 0x02,
    "var": 0x04,
    "color": 0x05,
    "scale": 0x06,
    "speed": 0x07,
    "italic": 0x08,
    "nmb": 0x09,
    "ptr": 0x0A,
    "name": 0x0B,
    "item": 0x0C,
    "icon": 0x0D,
    "font": 0x0E,
    "voice": 0x0F,
    # "unk13" .. "unk1A": the key suffix is the code written in upper-case hex.
    **{f"unk{code:X}": code for code in range(0x13, 0x1B)},
}

# Every friendly tag in one lookup table (names, then colors, then italics).
FRIENDLY_TAGS = {**NAMES, **COLORS, **ITALICS}

INDENT_CHAR = " "

# Angle-bracketed tag with an optional ":param" part, e.g. <tag> or <tag:1F>.
# NOTE(review): the character class has no space or parentheses, so a tag
# such as <Annie (NPC)> would not match - confirm whether that is intended.
COMMON_TAG = r"(<[\w/]+:?\w+>)"
# Two upper-case hex digits in braces, e.g. {0A}.
HEX_TAG = r"(\{[0-9A-F]{2}\})"

PRINTABLE_CHARS = (
    string.digits + string.ascii_letters + string.punctuation + " "
)
SUBTITLE_TYPES = [
"TYPE_NORMAL",
"TYPE_BOTTOM",
@@ -87,14 +17,6 @@ SUBTITLE_TYPES = [
]
@dataclass
class TimeStamp:
    """A subtitle time split into hour, minute, second and fraction parts."""

    hours: int
    minutes: int
    seconds: int
    # Fractional part of the second; the field name's spelling is part of
    # the public interface, so it is kept as-is.
    milis: int
@dataclass
class SubEntry:
bd_file: int
@@ -107,91 +29,6 @@ class SubEntry:
text: str
# TODO: Perhaps move this
rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)"
rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)"


def str_to_timestamp(time_stamp: str) -> TimeStamp:
    """Parse ``H:M:S,f`` or ``M:S,f`` text into a ``TimeStamp``.

    The number of ``:`` characters selects the layout:

    * none - the text is treated as "no time" and an all-zero
      ``TimeStamp`` is returned (this covers empty and blank strings);
    * one - ``minutes:seconds`` followed by ``.`` or ``,`` and the fraction,
      with hours set to 0;
    * two - ``hours:minutes:seconds`` followed by ``.`` or ``,`` and the
      fraction.

    The fraction digits are converted with ``int()`` exactly as written;
    no scaling to milliseconds is applied.

    Raises:
        ValueError: if the text has more than two colons, or has one or two
            colons but does not contain a timestamp in the expected layout.
    """
    colon_cnt = time_stamp.count(":")
    if colon_cnt == 0:
        return TimeStamp(0, 0, 0, 0)

    if colon_cnt == 1:
        pattern = rTIME_STAMP_NO_HOUR
    elif colon_cnt == 2:
        pattern = rTIME_STAMP
    else:
        # Without this branch the fields would never be assigned and the
        # function would fail with an UnboundLocalError.
        raise ValueError(f"Unsupported timestamp format: {time_stamp!r}")

    # First occurrence anywhere in the text, like findall(...)[0] would give.
    found = re.search(pattern, time_stamp)
    if found is None:
        raise ValueError(f"Malformed timestamp: {time_stamp!r}")

    parts = [int(group) for group in found.groups()]
    if colon_cnt == 1:
        # The short layout carries no hour field.
        parts.insert(0, 0)
    hours, minutes, seconds, milis = parts
    return TimeStamp(hours, minutes, seconds, milis)
def indent_lines(lines: list[str], level: int) -> list[str]:
    """Return a new list with every line prefixed by ``level`` indent units.

    One indent unit is ``INDENT_CHAR``; the input list is not modified.
    """
    prefix = INDENT_CHAR * level
    return [f"{prefix}{line}" for line in lines]
def indent_line(line: str, level: int) -> str:
    """Return ``line`` prefixed by ``level`` repetitions of ``INDENT_CHAR``."""
    prefix = INDENT_CHAR * level
    return "{}{}".format(prefix, line)
def text_to_cstr(text: str, is_name: bool = False) -> str:
    """Turn marked-up text into a run of C string literals and macro tokens.

    The text is split on ``{XX}`` hex tags, ``<tag>`` / ``<tag:param>`` tags
    and newlines, and each piece is rendered as follows:

    * ``{XX}`` becomes `` \\xXX``;
    * ``<tag:param>`` becomes `` TAG(b0,b1,b2,b3) `` where ``param`` is read
      as hexadecimal and written as four little-endian bytes;
    * ``<tag>`` becomes `` NAME(TAG) `` when ``is_name`` is set and the tag
      is in ``NAMES``, a quoted ``"\\xNN"`` byte when it is in ``TAGS``,
      `` TAG `` (or `` NO_ITALIC `` for ``/Italic``) when it is in
      ``FRIENDLY_TAGS``, and is dropped otherwise;
    * a newline becomes `` NL ``;
    * any other text becomes ``"text"``, or ``NAME("text")`` when
      ``is_name`` is set and the text is not a lone ``&``.

    The concatenated result is returned with surrounding whitespace removed.

    Raises:
        ValueError: if a tag name contains "unk" (in any letter case), or
            if a tag parameter is not valid hexadecimal.
    """
    splitter = "|".join((HEX_TAG, COMMON_TAG, r"(\n)"))
    chunks = []

    # re.split yields None or "" for groups that did not take part in a
    # match; filter(None, ...) discards both.
    for token in filter(None, re.split(splitter, text)):
        # Hex literals
        if re.match(HEX_TAG, token):
            chunks.append(f" \\x{int(token[1:3], 16):02X}")
            continue

        # Tags
        if re.match(COMMON_TAG, token):
            fields = token[1:-1].split(":")
            tag = fields[0]
            param = fields[1] if len(fields) > 1 else None

            # (In)Sanity check
            if "unk" in tag.lower():
                raise ValueError(
                    f"Don't use sce tags, makes no sense!\nProblem text -> {text}"
                )

            if param is not None:
                param_bytes = int(param, 16).to_bytes(4, byteorder="little")
                raw = ",".join(f"{b:02X}" for b in param_bytes)
                chunks.append(f" {tag.upper()}({raw}) ")
            elif is_name and tag in NAMES:
                ntag = tag.replace("(", "").replace(")", "")
                chunks.append(f" NAME({ntag.upper()}) ")
            elif tag in TAGS:
                chunks.append(f'"\\x{TAGS[tag]:02X}"')
            elif tag in FRIENDLY_TAGS:
                if tag == "/Italic":
                    chunks.append(" NO_ITALIC ")
                else:
                    chunks.append(f" {tag.upper()} ")
            # Tags found in none of the tables produce no output.
            continue

        if token == "\n":
            chunks.append(" NL ")
        elif is_name and token != "&":
            chunks.append(f'NAME("{token}")')
        else:
            chunks.append(f'"{token}"')

    return "".join(chunks).strip()
def row_to_subentry(row: dict) -> SubEntry:
type = int(row["type"])
priority = int(row["priority"])

View File

@@ -0,0 +1,133 @@
import re
import string
# Name tags. The nine listed names are numbered 1..9 in listing order;
# "Leader" carries its own value.
NAMES = {
    name: value
    for value, name in enumerate(
        (
            "Veigue",
            "Mao",
            "Eugene",
            "Annie",
            "Tytree",
            "Hilda",
            "Claire",
            "Agarte",
            "Annie (NPC)",
        ),
        start=1,
    )
}
NAMES["Leader"] = 0x1FFF

# Color tags, numbered 1..9 in listing order.
COLORS = {
    color: value
    for value, color in enumerate(
        (
            "Blue",
            "Red",
            "Purple",
            "Green",
            "Cyan",
            "Yellow",
            "White",
            "Grey",
            "Black",
        ),
        start=1,
    )
}

# Italic on/off tags.
ITALICS = {"/Italic": 0, "Italic": 10}

# Lower-case tag names mapped to their one-byte codes.
TAGS = {
    "nl": 0x01,
    "cr": 0x02,
    "var": 0x04,
    "color": 0x05,
    "scale": 0x06,
    "speed": 0x07,
    "italic": 0x08,
    "nmb": 0x09,
    "ptr": 0x0A,
    "name": 0x0B,
    "item": 0x0C,
    "icon": 0x0D,
    "font": 0x0E,
    "voice": 0x0F,
    # "unk13" .. "unk1A": the key suffix is the code written in upper-case hex.
    **{f"unk{code:X}": code for code in range(0x13, 0x1B)},
}

# Every friendly tag in one lookup table (names, then colors, then italics).
FRIENDLY_TAGS = {**NAMES, **COLORS, **ITALICS}

INDENT_CHAR = " "

# Angle-bracketed tag with an optional ":param" part, e.g. <tag> or <tag:1F>.
# NOTE(review): the character class has no space or parentheses, so a tag
# such as <Annie (NPC)> would not match - confirm whether that is intended.
COMMON_TAG = r"(<[\w/]+:?\w+>)"
# Two upper-case hex digits in braces, e.g. {0A}.
HEX_TAG = r"(\{[0-9A-F]{2}\})"

PRINTABLE_CHARS = (
    string.digits + string.ascii_letters + string.punctuation + " "
)
def indent_lines(lines: list[str], level: int) -> list[str]:
    """Return a new list with every line prefixed by ``level`` indent units.

    One indent unit is ``INDENT_CHAR``; the input list is not modified.
    """
    prefix = INDENT_CHAR * level
    return [f"{prefix}{line}" for line in lines]
def indent_line(line: str, level: int) -> str:
    """Return ``line`` prefixed by ``level`` repetitions of ``INDENT_CHAR``."""
    prefix = INDENT_CHAR * level
    return "{}{}".format(prefix, line)
def text_to_cstr(text: str, is_name: bool = False) -> str:
    """Turn marked-up text into a run of C string literals and macro tokens.

    The text is split on ``{XX}`` hex tags, ``<tag>`` / ``<tag:param>`` tags,
    newlines and the "ー" character, and each piece is rendered as follows:

    * ``{XX}`` becomes `` \\xXX``;
    * ``<tag:param>`` becomes `` TAG(b0,b1,b2,b3) `` where ``param`` is read
      as hexadecimal and written as four little-endian bytes;
    * ``<tag>`` becomes `` NAME(TAG) `` when ``is_name`` is set and the tag
      is in ``NAMES``, a quoted ``"\\xNN"`` byte when it is in ``TAGS``,
      `` TAG `` (or `` NO_ITALIC `` for ``/Italic``) when it is in
      ``FRIENDLY_TAGS``, and is dropped otherwise;
    * a newline becomes `` NL`` followed by a real newline;
    * "ー" becomes `` EM_DASH ``;
    * any other text becomes ``"text"``. When ``is_name`` is set it is
      instead split on ``&`` into ``NAME("...")`` items joined by the
      literal ``" and "``; a lone ``&`` becomes just ``" and "``.

    The concatenated result is returned with surrounding whitespace removed.

    Raises:
        ValueError: if a tag name contains "unk" (in any letter case), or
            if a tag parameter is not valid hexadecimal.
    """
    # The splitter isolates this character so it can be emitted as the
    # EM_DASH macro. Using one name for both the pattern and the comparison
    # keeps them in agreement; comparing against an empty string could never
    # succeed because empty tokens are filtered out below.
    dash = "ー"
    multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)" + f"|({dash})"

    # re.split yields None or "" for groups that did not take part in a
    # match; both are falsy and are dropped here.
    tokens = [piece for piece in re.split(multi_regex, text) if piece]

    parts = []
    for token in tokens:
        # Hex literals
        if re.match(HEX_TAG, token):
            parts.append(f" \\x{int(token[1:3], 16):02X}")

        # Tags
        elif re.match(COMMON_TAG, token):
            tag, param, *_ = token[1:-1].split(":") + [None]

            # (In)Sanity check
            if "unk" in tag.lower():
                raise ValueError(
                    f"Don't use sce tags, makes no sense!\nProblem text -> {text}"
                )

            if param is not None:
                param_bytes = int(param, 16).to_bytes(4, byteorder="little")
                raw = ",".join(f"{b:02X}" for b in param_bytes)
                parts.append(f" {tag.upper()}({raw}) ")
            elif is_name and tag in NAMES:
                ntag = tag.replace("(", "").replace(")", "")
                parts.append(f" NAME({ntag.upper()}) ")
            elif tag in TAGS:
                parts.append(f'"\\x{TAGS[tag]:02X}"')
            elif tag in FRIENDLY_TAGS:
                if tag == "/Italic":
                    parts.append(" NO_ITALIC ")
                else:
                    parts.append(f" {tag.upper()} ")
            # Tags found in none of the tables produce no output.

        elif token == "\n":
            parts.append(" NL\n")
        elif token == dash:
            parts.append(" EM_DASH ")
        elif not is_name:
            parts.append(f'"{token}"')
        elif token == "&":
            parts.append('" and "')
        else:
            names = [f'NAME("{x}")' for x in token.split("&")]
            parts.append('" and "'.join(names))

    return "".join(parts).strip()

61
pythonlib/formats/srt.py Normal file
View File

@@ -0,0 +1,61 @@
from itertools import groupby
from pathlib import Path
import re
from dataclasses import dataclass
@dataclass
class TimeStamp:
    """A subtitle time split into hour, minute, second and fraction parts."""

    hours: int
    minutes: int
    seconds: int
    # Fractional part of the second; the field name's spelling is part of
    # the public interface, so it is kept as-is.
    milis: int
@dataclass
class SrtSub:
    """One numbered subtitle entry: start time, end time and its text."""

    number: int
    start: TimeStamp
    end: TimeStamp
    # Text of the entry; multi-line text is held as one string.
    content: str
rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)"
rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)"


def str_to_timestamp(time_stamp: str) -> TimeStamp:
    """Parse ``H:M:S,f`` or ``M:S,f`` text into a ``TimeStamp``.

    The number of ``:`` characters selects the layout:

    * none - the text is treated as "no time" and an all-zero
      ``TimeStamp`` is returned (this covers empty and blank strings);
    * one - ``minutes:seconds`` followed by ``.`` or ``,`` and the fraction,
      with hours set to 0;
    * two - ``hours:minutes:seconds`` followed by ``.`` or ``,`` and the
      fraction.

    The fraction digits are converted with ``int()`` exactly as written;
    no scaling to milliseconds is applied.

    Raises:
        ValueError: if the text has more than two colons, or has one or two
            colons but does not contain a timestamp in the expected layout.
    """
    colon_cnt = time_stamp.count(":")
    if colon_cnt == 0:
        return TimeStamp(0, 0, 0, 0)

    if colon_cnt == 1:
        pattern = rTIME_STAMP_NO_HOUR
    elif colon_cnt == 2:
        pattern = rTIME_STAMP
    else:
        # Without this branch the fields would never be assigned and the
        # function would fail with an UnboundLocalError.
        raise ValueError(f"Unsupported timestamp format: {time_stamp!r}")

    # First occurrence anywhere in the text, like findall(...)[0] would give.
    found = re.search(pattern, time_stamp)
    if found is None:
        raise ValueError(f"Malformed timestamp: {time_stamp!r}")

    parts = [int(group) for group in found.groups()]
    if colon_cnt == 1:
        # The short layout carries no hour field.
        parts.insert(0, 0)
    hours, minutes, seconds, milis = parts
    return TimeStamp(hours, minutes, seconds, milis)
def get_subs(filename: Path) -> list[SrtSub]:
    """Read an SRT file and return its entries as ``SrtSub`` objects.

    The file is cut into chunks at blank lines. In each chunk the first
    line is the entry number, the second is ``start --> end`` and every
    remaining line is text, joined back together with newlines. All lines
    are stripped of surrounding whitespace.

    Raises:
        ValueError: if a chunk has fewer than three lines, the timing line
            does not split into exactly two parts on " --> ", or the entry
            number is not an integer.
        OSError: if the file cannot be opened.
    """
    # simple srt parser from: https://stackoverflow.com/a/23620587
    # "chunk" our input file, delimited by blank lines
    # ("utf-8-sig" also drops a leading byte-order mark, if any)
    with open(filename, encoding="utf-8-sig") as f:
        chunks = [
            list(group)
            for has_text, group in groupby(f, lambda line: bool(line.strip()))
            if has_text
        ]

    subs = []
    for chunk in chunks:
        # Raised explicitly rather than asserted so the check still runs
        # when Python is started with -O.
        if len(chunk) < 3:
            raise ValueError(f"Invalid subtitle entry in file: {filename}")

        number, timing, *body = (line.strip() for line in chunk)
        start, end = [str_to_timestamp(t) for t in timing.split(" --> ")]
        subs.append(SrtSub(int(number), start, end, "\n".join(body)))

    return subs