Diddy-Kong-Racing/tools/python/split_data_regions.py

import os
import re
from bisect import bisect
from collections import OrderedDict

from file_util import FileUtil

# Path of the assembly data file whose glabel definitions are read by
# _get_glabels.
DATA_FILE_PATH = 'data/dkr.data.s'
# Matches a data label name: 'D_' followed by eight uppercase hex digits.
GLABEL_REGEX = r'D_[0-9A-F]{8}'
# Matches a 'glabel <name>' definition and captures the label name.
GLABEL_DEF_REGEX = r'glabel (%s)' % GLABEL_REGEX
# The two labels below mark where _get_glabels cuts the label list into
# sections.
RODATA_START = 'D_800E49DC'  # i.e. the end of .data
BSS_START = 'D_800E98D0'  # i.e. the end of .rodata
# List of labels that are not used in the file they are defined in.
# This throws off the splitter algorithm, so the troublesome ones
# must be individually blacklisted for now.
IGNORE_GLABELS = ['D_800E0001', 'D_800E63E0', 'D_800E94D0']

def _rom_offset(vaddr):
    """
    Returns the ROM offset of the corresponding virtual address given.
    Parameters:
        vaddr: can be a string or integer. If string, it is assumed to be in
            hex.
    """
    if type(vaddr) == str:
        vaddr = int(vaddr, 16)
    return vaddr - 0x7FFFF400

def _get_glabels():
    """
    Reads the data file and returns its glabel definitions as three lists, in
    file order: (.data labels, .rodata labels, .bss labels).

    Labels listed in IGNORE_GLABELS are dropped before splitting. The .rodata
    list begins at RODATA_START and the .bss list begins at BSS_START; a
    ValueError is raised if either marker label is missing.
    """
    text = FileUtil.get_text_from_file(DATA_FILE_PATH)
    kept = [
        name
        for name in re.findall(GLABEL_DEF_REGEX, text)
        if name not in IGNORE_GLABELS
    ]
    rodata_begin = kept.index(RODATA_START)
    bss_begin = kept.index(BSS_START)
    data_part = kept[:rodata_begin]
    rodata_part = kept[rodata_begin:bss_begin]
    bss_part = kept[bss_begin:]
    return data_part, rodata_part, bss_part

def _get_file_offset(file, contents):
    """
    Returns the ROM offset of the given file. Throws exception upon error.
    Parameters:
        file: filename. Must be a .c or .s file.
        contents: the contents of file.
    """
    if file.endswith('.c'):
        return _rom_offset(re.search('/\* RAM_POS: 0x([0-9A-F]{8}) \*/', contents)[1])
    elif file.endswith('.s'):
        return int(re.search('/\* ([0-9A-F]{6}) [0-9A-F]{8} [0-9A-F]{8} \*/', contents)[1], 16)
    else:
        raise exception('cannot find offset for file ' + file)

def _log_glabel_usage(glabels):
    """
    Scans every .c and .s file under the current directory and records where
    each glabel is referenced.
    Returns:
        usage: An ordered map (same order as glabels) from glabel names to a
            sorted list of the ROM offsets of all files that mention it.
        c_file_offsets: A list of (filename, ROM offset) tuples from all the c
            files used, sorted by ROM offset.
    Parameters:
        glabels: one of the lists returned by _get_glabels.
    """
    usage = OrderedDict((glabel, set()) for glabel in glabels)
    files = FileUtil.get_filenames_from_directory_recursive('.', ('.c', '.s'))
    c_file_offsets = []
    for file in files:
        contents = FileUtil.get_text_from_file(file)
        try:
            offset = _get_file_offset(file, contents)
        except Exception:
            # Deliberate best effort: a file whose ROM offset cannot be
            # determined is skipped. Only the offset lookup is guarded, and
            # unlike a bare `except:` this does not swallow
            # KeyboardInterrupt/SystemExit.
            continue
        if file.endswith('.c'):
            c_file_offsets.append((file, offset))
        # A set is used per glabel so that a file mentioning the same label
        # many times contributes its offset only once.
        for glabel in re.findall(GLABEL_REGEX, contents):
            if glabel in usage:
                usage[glabel].add(offset)
    for glabel in usage:
        usage[glabel] = sorted(usage[glabel])
    c_file_offsets.sort(key=lambda f: f[1])
    return usage, c_file_offsets

def _filter_glabel_usage(glabel_usage):
    """
    Returns a sorted (by ROM offset) list of (glabel name, ROM offset), where
        the ROM offset is the estimated location the glabel is defined at. Note
        that this is an estimate; the algorithm used is greedy and may
        overpredict.
    Parameters:
        glabel_usage: output from _log_glabel_usage.
    """
    filtered_usage = []
    cur_offset = min(glabel_usage[next(iter(glabel_usage))])
    for glabel in glabel_usage:
        usage = glabel_usage[glabel]
        valid_offsets = usage[bisect(usage, cur_offset):]
        if len(valid_offsets) > 0:
            cur_offset = valid_offsets[0]
        filtered_usage.append((glabel, cur_offset))
    return filtered_usage

def _split_glabel_files(glabel_usage, c_file_offsets):
    """
    Returns a sorted (by file offset) list of (file name, file offset, glabel name)
        for every file, where glabel name is the name of the first glabel that
        lives within the ROM address domain of the corresponding file.
    Parameters:
        glabel_usage: output from _filter_glabel_usage.
        c_file_offsets: output from _log_glabel_usage.
    """
    file_splits = []
    glabel_idx = 0
    for i in range(len(c_file_offsets)):
        file = c_file_offsets[i]
        while glabel_idx < len(glabel_usage) and glabel_usage[glabel_idx][1] < file[1]:
            glabel_idx += 1
        if glabel_idx < len(glabel_usage) and i < len(c_file_offsets) - 1:
            glabel = glabel_usage[glabel_idx]
            glabel_name = glabel[0] if glabel[1] < c_file_offsets[i + 1][1] else None
        else:
            glabel_name = None
        file_splits.append((file[0], file[1], glabel_name))
    return file_splits

def main():
    """
    Prints, for each of the .data, .rodata and .bss sections, one line per c
    file giving its name, its ROM offset and the first glabel estimated to
    belong to it (None if no glabel was assigned).
    """
    FileUtil.set_working_dir_to_project_base()
    section_names = ('.data', '.rodata', '.bss')
    for section_name, section_glabels in zip(section_names, _get_glabels()):
        raw_usage, c_files = _log_glabel_usage(section_glabels)
        estimated_usage = _filter_glabel_usage(raw_usage)
        splits = _split_glabel_files(estimated_usage, c_files)
        print('File splits for %s:' % section_name)
        for entry in splits:
            # entry is (file name, ROM offset, glabel name or None).
            print('%s (%06X): %s' % entry)
        print()

if __name__ == '__main__':
    main()