You've already forked PythonLib
mirror of
https://github.com/lifebottle/PythonLib.git
synced 2026-02-13 15:25:50 -08:00
Rework subs
This commit is contained in:
@@ -4,82 +4,12 @@ from typing import TextIO
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from io import StringIO
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
import string
|
||||
|
||||
from ..srt import TimeStamp, str_to_timestamp
|
||||
from .text_util import text_to_cstr, indent_lines
|
||||
|
||||
URL = "https://docs.google.com/spreadsheets/d/1-XwzS7F0SaLlXwv1KS6RcTEYYORH2DDb1bMRy5VM5oo/gviz/tq?tqx=out:csv&sheet=Subs&range=A:M"
|
||||
|
||||
|
||||
# Character-name tags. Only membership is checked by the code visible in
# this file (``tag in NAMES``); the integer values are not read here.
NAMES = {
    "Veigue": 1,
    "Mao": 2,
    "Eugene": 3,
    "Annie": 4,
    "Tytree": 5,
    "Hilda": 6,
    "Claire": 7,
    "Agarte": 8,
    "Annie (NPC)": 9,
    "Leader": 0x1FFF,
}

# Color tags, keyed by the capitalised name used inside ``<...>``.
COLORS = {
    "Blue": 1,
    "Red": 2,
    "Purple": 3,
    "Green": 4,
    "Cyan": 5,
    "Yellow": 6,
    "White": 7,
    "Grey": 8,
    "Black": 9,
}

# Italic on/off tags; "/Italic" is special-cased by text_to_cstr.
ITALICS = {
    "/Italic": 0,
    "Italic": 10,
}

# Lower-case control tags and the byte value text_to_cstr emits for each
# (as a "\xNN" escape).
TAGS = {
    "nl": 0x1,
    "cr": 0x2,
    "var": 0x4,
    "color": 0x5,
    "scale": 0x6,
    "speed": 0x7,
    "italic": 0x8,
    "nmb": 0x9,
    "ptr": 0xA,
    "name": 0xB,
    "item": 0xC,
    "icon": 0xD,
    "font": 0xE,
    "voice": 0xF,
    "unk13": 0x13,
    "unk14": 0x14,
    "unk15": 0x15,
    "unk16": 0x16,
    "unk17": 0x17,
    "unk18": 0x18,
    "unk19": 0x19,
    "unk1A": 0x1A,
}

# Names, colors and italics merged into one lookup table (the three
# source tables share no keys, so nothing is overwritten).
FRIENDLY_TAGS = {**NAMES, **COLORS, **ITALICS}

# Repeated once per indentation level by indent_line / indent_lines.
INDENT_CHAR = " "

# Each pattern is a single capture group so that re.split keeps the
# matched tag as its own token.
COMMON_TAG = r"(<[\w/]+:?\w+>)"
HEX_TAG = r"(\{[0-9A-F]{2}\})"
PRINTABLE_CHARS = "".join(
    (string.digits, string.ascii_letters, string.punctuation, " ")
)
|
||||
|
||||
SUBTITLE_TYPES = [
|
||||
"TYPE_NORMAL",
|
||||
"TYPE_BOTTOM",
|
||||
@@ -87,14 +17,6 @@ SUBTITLE_TYPES = [
|
||||
]
|
||||
|
||||
|
||||
@dataclass
class TimeStamp:
    """A subtitle time split into its four numeric fields.

    ``milis`` holds the integer value of the digits that follow the
    ``.``/``,`` separator exactly as parsed by ``str_to_timestamp``; it is
    not rescaled to three digits.
    """

    hours: int
    minutes: int
    seconds: int
    milis: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class SubEntry:
|
||||
bd_file: int
|
||||
@@ -107,91 +29,6 @@ class SubEntry:
|
||||
text: str
|
||||
|
||||
|
||||
# TODO: Perhaps move this
rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)"
rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)"


def str_to_timestamp(time_stamp: str) -> TimeStamp:
    """Parse ``H:M:S,ms`` or ``M:S,ms`` text into a ``TimeStamp``.

    The number of ``:`` characters selects the layout:

    * 0 colons (including empty or whitespace-only text) -> all-zero stamp
    * 1 colon  -> ``minutes:seconds[.,]milis`` with ``hours`` set to 0
    * 2 colons -> ``hours:minutes:seconds[.,]milis``

    The first match found anywhere in the string is used.

    Raises:
        ValueError: if the text has more than two colons or does not
            contain a timestamp in the selected layout. (Previously these
            inputs surfaced as UnboundLocalError / IndexError.)
    """
    colon_cnt = time_stamp.count(":")

    # Nothing to parse: blank cells and colon-less text map to 0:00:00.000.
    if colon_cnt == 0:
        return TimeStamp(0, 0, 0, 0)

    if colon_cnt == 1:
        pattern = rTIME_STAMP_NO_HOUR
    elif colon_cnt == 2:
        pattern = rTIME_STAMP
    else:
        raise ValueError(f"Too many ':' in timestamp: {time_stamp!r}")

    match = re.search(pattern, time_stamp)
    if match is None:
        raise ValueError(f"Malformed timestamp: {time_stamp!r}")

    fields = [int(group) for group in match.groups()]
    if colon_cnt == 1:
        # The short layout has no hours field.
        fields.insert(0, 0)

    return TimeStamp(*fields)
|
||||
|
||||
|
||||
def indent_lines(lines: list[str], level: int) -> list[str]:
    """Return a new list with each line prefixed by ``INDENT_CHAR * level``.

    The input list is not modified.
    """
    prefix = INDENT_CHAR * level
    return [f"{prefix}{line}" for line in lines]
|
||||
|
||||
|
||||
def indent_line(line: str, level: int) -> str:
    """Return ``line`` prefixed by ``INDENT_CHAR`` repeated ``level`` times."""
    return f"{INDENT_CHAR * level}{line}"
|
||||
|
||||
|
||||
def text_to_cstr(text: str, is_name: bool = False) -> str:
    """Convert tagged subtitle text into a sequence of string/macro tokens.

    ``text`` is split on ``{XX}`` hex literals, ``<tag>`` / ``<tag:param>``
    tags and newlines; every piece is translated and the pieces are
    concatenated, with surrounding whitespace stripped from the result.

    Args:
        text: Source text, possibly containing tags.
        is_name: When true, character-name tags become ``NAME(...)`` and
            plain text other than ``"&"`` is wrapped as ``NAME("...")``.

    Returns:
        The converted text.

    Raises:
        ValueError: if a tag name contains ``unk`` (any case), or if a tag
            parameter is not valid hexadecimal (raised by ``int``).
        OverflowError: if a tag parameter does not fit in 4 bytes.
    """
    multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)"
    # re.split yields None for groups that did not take part in a match and
    # "" between adjacent matches; neither carries any text.
    tokens = [piece for piece in re.split(multi_regex, text) if piece]

    parts = []
    for token in tokens:
        # Hex literals
        if re.match(HEX_TAG, token):
            parts.append(f" \\x{int(token[1:3], 16):02X}")

        # Tags
        elif re.match(COMMON_TAG, token):
            # Padding with None makes `param` None when the tag has no ":".
            tag, param, *_ = token[1:-1].split(":") + [None]

            # (In)Sanity check
            if "unk" in tag.lower():
                raise ValueError(
                    f"Don't use sce tags, makes no sense!\nProblem text -> {text}"
                )

            if param is not None:
                # Parameter is hex text, emitted as 4 little-endian bytes.
                param_bytes = int(param, 16).to_bytes(4, byteorder="little")
                raw = ",".join([f"{b:02X}" for b in param_bytes])
                parts.append(f" {tag.upper()}({raw}) ")
            elif is_name and tag in NAMES:
                ntag = tag.replace("(", "").replace(")", "")
                parts.append(f" NAME({ntag.upper()}) ")
            elif tag in TAGS:
                parts.append(f'"\\x{TAGS[tag]:02X}"')
            elif tag in FRIENDLY_TAGS:
                if tag == "/Italic":
                    parts.append(" NO_ITALIC ")
                else:
                    parts.append(f" {tag.upper()} ")
            # NOTE(review): a parameterless tag found in none of the tables
            # produces no output at all -- confirm this is intended.

        elif token == "\n":
            parts.append(" NL ")

        # Plain text
        elif is_name and token != "&":
            parts.append(f'NAME("{token}")')
        else:
            parts.append(f'"{token}"')

    return "".join(parts).strip()
|
||||
|
||||
|
||||
def row_to_subentry(row: dict) -> SubEntry:
|
||||
type = int(row["type"])
|
||||
priority = int(row["priority"])
|
||||
|
||||
133
pythonlib/formats/rebirth/text_util.py
Normal file
133
pythonlib/formats/rebirth/text_util.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import re
|
||||
import string
|
||||
|
||||
# Character-name tags. Only membership is checked by the code visible in
# this file (``tag in NAMES``); the integer values are not read here.
NAMES = {
    "Veigue": 1,
    "Mao": 2,
    "Eugene": 3,
    "Annie": 4,
    "Tytree": 5,
    "Hilda": 6,
    "Claire": 7,
    "Agarte": 8,
    "Annie (NPC)": 9,
    "Leader": 0x1FFF,
}

# Color tags, keyed by the capitalised name used inside ``<...>``.
COLORS = {
    "Blue": 1,
    "Red": 2,
    "Purple": 3,
    "Green": 4,
    "Cyan": 5,
    "Yellow": 6,
    "White": 7,
    "Grey": 8,
    "Black": 9,
}

# Italic on/off tags; "/Italic" is special-cased by text_to_cstr.
ITALICS = {
    "/Italic": 0,
    "Italic": 10,
}

# Lower-case control tags and the byte value text_to_cstr emits for each
# (as a "\xNN" escape).
TAGS = {
    "nl": 0x1,
    "cr": 0x2,
    "var": 0x4,
    "color": 0x5,
    "scale": 0x6,
    "speed": 0x7,
    "italic": 0x8,
    "nmb": 0x9,
    "ptr": 0xA,
    "name": 0xB,
    "item": 0xC,
    "icon": 0xD,
    "font": 0xE,
    "voice": 0xF,
    "unk13": 0x13,
    "unk14": 0x14,
    "unk15": 0x15,
    "unk16": 0x16,
    "unk17": 0x17,
    "unk18": 0x18,
    "unk19": 0x19,
    "unk1A": 0x1A,
}

# Names, colors and italics merged into one lookup table (the three
# source tables share no keys, so nothing is overwritten).
FRIENDLY_TAGS = {**NAMES, **COLORS, **ITALICS}

# Repeated once per indentation level by indent_line / indent_lines.
INDENT_CHAR = " "

# Each pattern is a single capture group so that re.split keeps the
# matched tag as its own token.
COMMON_TAG = r"(<[\w/]+:?\w+>)"
HEX_TAG = r"(\{[0-9A-F]{2}\})"
PRINTABLE_CHARS = "".join(
    (string.digits, string.ascii_letters, string.punctuation, " ")
)
|
||||
|
||||
|
||||
def indent_lines(lines: list[str], level: int) -> list[str]:
    """Return a new list with each line prefixed by ``INDENT_CHAR * level``.

    The input list is not modified.
    """
    prefix = INDENT_CHAR * level
    return [f"{prefix}{line}" for line in lines]
|
||||
|
||||
|
||||
def indent_line(line: str, level: int) -> str:
    """Return ``line`` prefixed by ``INDENT_CHAR`` repeated ``level`` times."""
    return f"{INDENT_CHAR * level}{line}"
|
||||
|
||||
|
||||
def text_to_cstr(text: str, is_name: bool = False) -> str:
    """Convert tagged subtitle text into a sequence of string/macro tokens.

    ``text`` is split on ``{XX}`` hex literals, ``<tag>`` / ``<tag:param>``
    tags, newlines and the ``ー`` character; every piece is translated and
    the pieces are concatenated, with surrounding whitespace stripped from
    the result.

    Args:
        text: Source text, possibly containing tags.
        is_name: When true, character-name tags become ``NAME(...)``, a
            lone ``"&"`` becomes ``" and "``, and any other plain text is
            split on ``"&"`` with each part wrapped as ``NAME("...")``.

    Returns:
        The converted text.

    Raises:
        ValueError: if a tag name contains ``unk`` (any case), or if a tag
            parameter is not valid hexadecimal (raised by ``int``).
        OverflowError: if a tag parameter does not fit in 4 bytes.
    """
    multi_regex = HEX_TAG + "|" + COMMON_TAG + r"|(\n)" + r"|(ー)"
    # re.split yields None for groups that did not take part in a match and
    # "" between adjacent matches; neither carries any text.
    tokens = [piece for piece in re.split(multi_regex, text) if piece]

    parts = []
    for token in tokens:
        # Hex literals
        if re.match(HEX_TAG, token):
            parts.append(f" \\x{int(token[1:3], 16):02X}")

        # Tags
        elif re.match(COMMON_TAG, token):
            # Padding with None makes `param` None when the tag has no ":".
            tag, param, *_ = token[1:-1].split(":") + [None]

            # (In)Sanity check
            if "unk" in tag.lower():
                raise ValueError(
                    f"Don't use sce tags, makes no sense!\nProblem text -> {text}"
                )

            if param is not None:
                # Parameter is hex text, emitted as 4 little-endian bytes.
                param_bytes = int(param, 16).to_bytes(4, byteorder="little")
                raw = ",".join([f"{b:02X}" for b in param_bytes])
                parts.append(f" {tag.upper()}({raw}) ")
            elif is_name and tag in NAMES:
                ntag = tag.replace("(", "").replace(")", "")
                parts.append(f" NAME({ntag.upper()}) ")
            elif tag in TAGS:
                parts.append(f'"\\x{TAGS[tag]:02X}"')
            elif tag in FRIENDLY_TAGS:
                if tag == "/Italic":
                    parts.append(" NO_ITALIC ")
                else:
                    parts.append(f" {tag.upper()} ")
            # NOTE(review): a parameterless tag found in none of the tables
            # produces no output at all -- confirm this is intended.

        elif token == "\n":
            # Keep a real line break in the output after the NL token.
            parts.append(" NL\n")

        elif token == "ー":
            parts.append(" EM_DASH ")

        # Plain text
        elif not is_name:
            parts.append(f'"{token}"')
        elif token == "&":
            parts.append('" and "')
        else:
            # "A&B" -> NAME("A")" and "NAME("B"); whitespace around each
            # part is kept as-is.
            names = [f'NAME("{x}")' for x in token.split("&")]
            parts.append('" and "'.join(names))

    return "".join(parts).strip()
|
||||
61
pythonlib/formats/srt.py
Normal file
61
pythonlib/formats/srt.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from itertools import groupby
|
||||
from pathlib import Path
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class TimeStamp:
    """A subtitle time split into its four numeric fields.

    ``milis`` holds the integer value of the digits that follow the
    ``.``/``,`` separator exactly as parsed by ``str_to_timestamp``; it is
    not rescaled to three digits.
    """

    hours: int
    minutes: int
    seconds: int
    milis: int
|
||||
|
||||
@dataclass
class SrtSub:
    """One subtitle entry as read from an SRT file by ``get_subs``.

    ``number`` is the entry's first line converted to ``int``, ``start`` and
    ``end`` come from the ``start --> end`` timing line, and ``content`` is
    the remaining lines joined with ``"\\n"``.
    """

    number: int
    start: TimeStamp
    end: TimeStamp
    content: str
|
||||
|
||||
rTIME_STAMP = r"(\d+):(\d+):(\d+)[.,](\d+)"
rTIME_STAMP_NO_HOUR = r"(\d+):(\d+)[.,](\d+)"


def str_to_timestamp(time_stamp: str) -> TimeStamp:
    """Parse ``H:M:S,ms`` or ``M:S,ms`` text into a ``TimeStamp``.

    The number of ``:`` characters selects the layout:

    * 0 colons (including empty or whitespace-only text) -> all-zero stamp
    * 1 colon  -> ``minutes:seconds[.,]milis`` with ``hours`` set to 0
    * 2 colons -> ``hours:minutes:seconds[.,]milis``

    The first match found anywhere in the string is used.

    Raises:
        ValueError: if the text has more than two colons or does not
            contain a timestamp in the selected layout. (Previously these
            inputs surfaced as UnboundLocalError / IndexError.)
    """
    colon_cnt = time_stamp.count(":")

    # Nothing to parse: blank cells and colon-less text map to 0:00:00.000.
    if colon_cnt == 0:
        return TimeStamp(0, 0, 0, 0)

    if colon_cnt == 1:
        pattern = rTIME_STAMP_NO_HOUR
    elif colon_cnt == 2:
        pattern = rTIME_STAMP
    else:
        raise ValueError(f"Too many ':' in timestamp: {time_stamp!r}")

    match = re.search(pattern, time_stamp)
    if match is None:
        raise ValueError(f"Malformed timestamp: {time_stamp!r}")

    fields = [int(group) for group in match.groups()]
    if colon_cnt == 1:
        # The short layout has no hours field.
        fields.insert(0, 0)

    return TimeStamp(*fields)
|
||||
|
||||
def get_subs(filename: Path) -> list[SrtSub]:
    """Read an SRT file and return its entries in file order.

    The file is opened as ``utf-8-sig`` (a leading BOM is dropped) and cut
    into entries at blank lines. In each entry the first line is the
    number, the second is the ``start --> end`` timing line and all
    remaining lines form the content.

    Raises:
        ValueError: if an entry has fewer than three lines, its number is
            not an integer, or its timing line does not split into exactly
            two parts on ``" --> "``.
    """
    # simple srt parser from: https://stackoverflow.com/a/23620587
    # "chunk" our input file, delimited by blank lines
    with open(filename, encoding="utf-8-sig") as f:
        chunks = [
            list(group)
            for has_text, group in groupby(f, lambda x: bool(x.strip()))
            if has_text
        ]

    subs = []
    for chunk in chunks:
        # Explicit check instead of `assert`, which is removed under -O.
        if len(chunk) < 3:
            raise ValueError(f"Invalid subtitle entry in file: {filename}")

        number, timing, *text_lines = (line.strip() for line in chunk)
        start, end = [str_to_timestamp(t) for t in timing.split(" --> ")]
        content = "\n".join(text_lines)
        subs.append(SrtSub(int(number), start, end, content))

    return subs
|
||||
Reference in New Issue
Block a user