From 075c5e1ef6957d3f2eba696b02d4b54f845b5e2b Mon Sep 17 00:00:00 2001 From: Mc-muffin <8714476+Mc-muffin@users.noreply.github.com> Date: Sat, 10 Jun 2023 23:38:14 -0500 Subject: [PATCH] Move transfer stuff --- pythonlib/games/ToolsTOR.py | 104 +-------------------------- text_transfer.py | 140 ++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 103 deletions(-) create mode 100644 text_transfer.py diff --git a/pythonlib/games/ToolsTOR.py b/pythonlib/games/ToolsTOR.py index fd4e19d..9d737b6 100644 --- a/pythonlib/games/ToolsTOR.py +++ b/pythonlib/games/ToolsTOR.py @@ -84,110 +84,8 @@ class ToolsTOR(ToolsTales): # byteCode self.story_byte_code = b"\xF8" self.string_opcode = InstructionType.STRING - self.list_status_insertion = ['Done', 'Proofreading', 'Editing'] - + self.list_status_insertion: list[str] = ['Done', 'Proofreading', 'Editing'] - # Replace n occurences of a string starting from the right - def rreplace(self, s, old, new, occurrence): - li = s.rsplit(old, occurrence) - return new.join(li) - - def add_line_break(self, text): - temp = "" - currentLineSize = 0 - - text_size = len(text) - max_size = 32 - split_space = text.split(" ") - - for word in split_space: - currentLineSize += (len(word) + 1) - - if currentLineSize <= max_size: - temp = temp + word + ' ' - - else: - temp = temp + '\n' + word + ' ' - currentLineSize = 0 - - temp = temp.replace(" \n", "\n") - temp = self.rreplace(temp, " ", "", 1) - - return temp - def clean_text(self, text): - text = re.sub(r"\n ", "\n", text) - text = re.sub(r"\n", "", text) - text = re.sub(r"(<\w+:?\w+>)", "", text) - text = re.sub(r"\[\w+=*\w+\]", "", text) - text = re.sub(r" ", "", text) - text = re.sub(u'\u3000', '', text) - text = re.sub(r" ", "", text) - return text - - # Extract/Transform Lauren translation - def extract_Lauren_Translation(self): - - # Load Lauren's googlesheet data inside a dataframe - df = self.extract_Google_Sheets("1-XwzS7F0SaLlXwv1KS6RcTEYYORH2DDb1bMRy5VM5oo", "Story") - - # 1) Make some renaming and transformations - df = df.rename(columns={"KEY": "File", "Japanese": "JapaneseText", "Lauren's Script": "EnglishText"}) - - # 2) Filter only relevant rows and columns from the googlesheet - df = df.loc[(df['EnglishText'] != "") & (df['JapaneseText'] != ""), :] - df = df[['File', 'JapaneseText', 'EnglishText']] - - # 3) Make some transformations to the JapaneseText so we can better match with XML - df['File'] = df['File'].apply(lambda x: x.split("_")[0] + ".xml") - df['JapaneseText'] = df['JapaneseText'].apply(lambda x: self.clean_text(x)) - return df - - # Transfer Lauren translation - def transfer_Lauren_Translation(self): - - df_lauren = self.extract_Lauren_Translation() - - # Distinct list of XMLs file - xml_files = list(set(df_lauren['File'].tolist())) - - for file in xml_files: - cond = df_lauren['File'] == file - lauren_translations = dict(df_lauren[cond][['JapaneseText', 'EnglishText']].values) - file_path = self.story_XML_new + 'XML/' + file - - if os.path.exists(file_path): - tree = etree.parse(file_path) - root = tree.getroot() - need_save = False - - for key, item in lauren_translations.items(): - - for entry_node in root.iter("Entry"): - xml_jap = entry_node.find("JapaneseText").text or '' - xml_eng = entry_node.find("EnglishText").text or '' - xml_jap_cleaned = self.clean_text(xml_jap) - - if key == xml_jap_cleaned: - item = self.add_line_break(item) - - if xml_eng != item: - entry_node.find("EnglishText").text = item - need_save = True - - if entry_node.find("Status").text == "To Do": - entry_node.find("Status").text = "Editing" - - # else: - # print("File: {} - {}".format(file, key)) - - if need_save: - txt = etree.tostring(root, encoding="UTF-8", pretty_print=True, xml_declaration=False) - - with open(file_path, 'wb') as xml_file: - xml_file.write(txt) - - else: - print("File {} skipped because file is not found".format(file)) # Extract the story files def extract_all_story(self, replace=False) -> None: diff --git a/text_transfer.py b/text_transfer.py new file mode 100644 index 0000000..422ad21 --- /dev/null +++ b/text_transfer.py @@ -0,0 +1,140 @@ +import re +import pandas as pd +import os +import pygsheets +import lxml.etree as etree +from requests import HTTPError + +# Replace n occurences of a string starting from the right +def rreplace(s, old, new, occurrence): + li = s.rsplit(old, occurrence) + return new.join(li) + +def add_line_break(text): + temp = "" + currentLineSize = 0 + + text_size = len(text) + max_size = 32 + split_space = text.split(" ") + + for word in split_space: + currentLineSize += (len(word) + 1) + + if currentLineSize <= max_size: + temp = temp + word + ' ' + + else: + temp = temp + '\n' + word + ' ' + currentLineSize = 0 + + temp = temp.replace(" \n", "\n") + temp = rreplace(temp, " ", "", 1) + + return temp +def clean_text(text): + text = re.sub(r"\n ", "\n", text) + text = re.sub(r"\n", "", text) + text = re.sub(r"(<\w+:?\w+>)", "", text) + text = re.sub(r"\[\w+=*\w+\]", "", text) + text = re.sub(r" ", "", text) + text = re.sub(u'\u3000', '', text) + text = re.sub(r" ", "", text) + return text + + +def extract_Google_Sheets(googlesheet_id, sheet_name): + + creds_path = r"..\gsheet.json" + + if os.path.exists(creds_path): + + try: + gc = pygsheets.authorize(service_file=creds_path) + sh = gc.open_by_key(googlesheet_id) + sheets = sh.worksheets() + id_sheet = [ ele.index for ele in sheets if ele.title == sheet_name ] + + if len(id_sheet) > 0: + wks = sh[id_sheet[0]] + df = pd.DataFrame(wks.get_all_records()) + + if len(df) > 0: + return df + else: + print("Python didn't find any table with rows in this sheet") + + else: + print("{} was not found in the googlesheet {}".format(sheet_name, googlesheet_id)) + + except HTTPError as e: + print(e) + + else: + print("{} was not found to authenticate to Googlesheet API".format(creds_path)) + + +# Extract/Transform Lauren translation +def extract_Lauren_Translation(): + + # Load Lauren's googlesheet data inside a dataframe + df = extract_Google_Sheets("1-XwzS7F0SaLlXwv1KS6RcTEYYORH2DDb1bMRy5VM5oo", "Story") + + # 1) Make some renaming and transformations + df = df.rename(columns={"KEY": "File", "Japanese": "JapaneseText", "Lauren's Script": "EnglishText"}) + + # 2) Filter only relevant rows and columns from the googlesheet + df = df.loc[(df['EnglishText'] != "") & (df['JapaneseText'] != ""), :] + df = df[['File', 'JapaneseText', 'EnglishText']] + + # 3) Make some transformations to the JapaneseText so we can better match with XML + df['File'] = df['File'].apply(lambda x: x.split("_")[0] + ".xml") + df['JapaneseText'] = df['JapaneseText'].apply(lambda x: clean_text(x)) + return df + +# Transfer Lauren translation +def transfer_Lauren_Translation(): + + df_lauren = extract_Lauren_Translation() + + # Distinct list of XMLs file + xml_files = list(set(df_lauren['File'].tolist())) + + for file in xml_files: + cond = df_lauren['File'] == file + lauren_translations = dict(df_lauren[cond][['JapaneseText', 'EnglishText']].values) + file_path = self.story_XML_new + 'XML/' + file + + if os.path.exists(file_path): + tree = etree.parse(file_path) + root = tree.getroot() + need_save = False + + for key, item in lauren_translations.items(): + + for entry_node in root.iter("Entry"): + xml_jap = entry_node.find("JapaneseText").text or '' + xml_eng = entry_node.find("EnglishText").text or '' + xml_jap_cleaned = clean_text(xml_jap) + + if key == xml_jap_cleaned: + item = add_line_break(item) + + if xml_eng != item: + entry_node.find("EnglishText").text = item + need_save = True + + if entry_node.find("Status").text == "To Do": + entry_node.find("Status").text = "Editing" + + # else: + # print("File: {} - {}".format(file, key)) + + if need_save: + txt = etree.tostring(root, encoding="UTF-8", pretty_print=True, xml_declaration=False) + + with open(file_path, 'wb') as xml_file: + xml_file.write(txt) + + else: + print("File {} skipped because file is not found".format(file)) # Replace n occurences of a string starting from the right