Source code for romsearch.modules.romparser

import copy
import os
import re
from packaging.version import Version

import romsearch
from ..util import (
    centred_string,
    left_aligned_string,
    setup_logger,
    get_file_time,
    load_yml,
    load_json,
    match_retool_search_terms,
    get_short_name,
    get_sanitized_version,
)

DICT_DEFAULT_VALS = {"bool": False, "str": "", "list": []}
USE_TITLE_POS = [
    "languages",
]


def find_pattern(regex, search_str, group_number=0):
    """
    Take a regex pattern and find potential matches within a search string
    """
    regex_search_str = None

    regex_search = re.search(regex, search_str)
    if regex_search:
        regex_search_str = regex_search.group(group_number)

    return regex_search_str


def get_pattern_val(
    regex,
    tag,
    regex_type,
    pattern_mappings=None,
    title_pos=None,
    use_title_pos=False,
):
    """Get values out from a regex pattern, optionally mapping back to something more readable for lists

    Args:
        regex: Regex pattern
        tag: Found tag
        regex_type: Regex pattern type. Can be str, bool, list
        pattern_mappings: Mapping from regex pattern to more readable values
        title_pos: Position of title for compilations. Defaults to None
        use_title_pos: Use title_pos? Defaults to False
    """

    pattern_string = find_pattern(regex, tag)

    if pattern_string is not None:
        pattern_string = pattern_string.strip("()")

        # Split out to the specific languages, but only if they're marked correctly
        if title_pos is not None and use_title_pos and "+" in pattern_string:
            pattern_string = pattern_string.split("+")[title_pos - 1]

        if regex_type == "bool":
            pattern_val = True
        elif regex_type == "str":
            pattern_val = pattern_string
        elif regex_type == "list":

            if pattern_mappings is not None:
                parsed_pattern_string = []
                # Match to pattern mappings
                for p in pattern_mappings:
                    if re.search(pattern_mappings[p], pattern_string):
                        parsed_pattern_string.append(p)
            else:
                # Split, and remove and trailing whitespace
                parsed_pattern_string = pattern_string.split(",")
                parsed_pattern_string = [s.strip() for s in parsed_pattern_string]
            pattern_val = parsed_pattern_string
        else:
            raise ValueError("regex_type should be one of 'bool', 'str', or 'list'")

    else:
        pattern_val = None

    return pattern_val


def apply_filters(
    file_dict,
):
    """Apply any filters we may have

    Args:
        file_dict (dict): Dictionary of file properties
    """

    # Flag supersets
    flag_as_superset = file_dict.get("flag_as_superset", None)
    if flag_as_superset is not None:
        file_dict["flag_as_superset"] = flag_as_superset

    return file_dict


def is_ra_subset(name):
    """Check if a name is a RetroAchievements subset

    Args:
        name (str): Name to check
    """

    match_pattern = "\\[Subset.*\\]"

    match = find_pattern(match_pattern, name)
    is_subset = False
    if match is not None:
        is_subset = True

    return is_subset


def check_match(i, j, checks_passed=None):
    """Check if two bools/strings/lists match

    For lists, we simply check if there's any subset that matches

    Args:
        i: Input 1
        j: Input 2
        checks_passed: If not None, will inherit this as initial start.
            Else, will default to True
    """

    if checks_passed is None:
        checks_passed = True
    if not isinstance(checks_passed, bool):
        raise ValueError("checks_passed should be a boolean value")

    # If we have a bool or string, then they should match
    if isinstance(i, bool) or isinstance(i, str):
        if not i == j:
            checks_passed = False

    # If a list, treat differently
    elif isinstance(i, list):
        s_i = set(i)
        s_j = set(j)
        s_k = s_i.intersection(s_j)

        # Only fail on a 0-length intersection if at least one of the inputs has
        # non-zero length
        if len(s_k) == 0 and len(s_i) > 0 and len(s_j) > 0:
            checks_passed = False

        # If we have the case where both entries have more than 1 item, then ensure
        # they *all* match in at least one of the lists
        if len(s_i) > 1 and len(s_j) > 1:
            min_n_match = min(len(s_i), len(s_j))
            if len(s_k) < min_n_match:
                checks_passed = False

    else:
        t = type(i)
        raise ValueError(f"Do not know how to check against type {t}")

    return checks_passed


def set_english_friendly(
    file_dict,
):
    """Set English as a language if English-friendly is flagged"""

    # Only change things if we're flagged
    is_english_friendly = file_dict.get("english_friendly", False)

    if not is_english_friendly:
        return file_dict

    if "English" not in file_dict["languages"]:
        file_dict["languages"].append("English")

    return file_dict


[docs] class ROMParser: def __init__( self, platform, game, dat=None, retool=None, ra_hashes=None, config_file=None, config=None, platform_config=None, default_config=None, regex_config=None, logger=None, log_line_sep="=", log_line_length=100, ): """ROM parser tool This works per-game, per-platform, so must be specified here Args: platform (str): Platform name game (str): Game name dat (dict): Parsed dat dictionary. Defaults to None, which will try to load the dat file if it exists retool (dict): Retool dictionary. Defaults to None, which will try to load the file if it exists ra_hashes (dict): RA hash dictionary. Defaults to None, which will try to load the file if it exists config_file (str, optional): path to config file. Defaults to None. config (dict, optional): configuration dictionary. Defaults to None. platform_config (dict, optional): platform configuration dictionary. Defaults to None. default_config (dict, optional): default configuration dictionary. Defaults to None. regex_config (dict, optional): regex configuration dictionary. Defaults to None. logger (logging.Logger, optional): logger instance. Defaults to None. log_line_length (int, optional): Line length of log. Defaults to 100 TODO: For the RetroAchievements, there are hacks and unlicensed stuff that seems to work differently """ if platform is None: raise ValueError("platform must be specified") self.platform = platform if config_file is None and config is None: raise ValueError("config_file or config must be specified") if config is None: config = load_yml(config_file) self.config = config self.game = game if logger is None: log_dir = self.config.get("dirs", {}).get( "log_dir", os.path.join(os.getcwd(), "logs") ) logger_add_dir = str(os.path.join(platform, game)) log_level = self.config.get("logger", {}).get("level", "info") logger = setup_logger( log_level=log_level, script_name=f"ROMParser", log_dir=log_dir, additional_dir=logger_add_dir, ) self.logger = logger mod_dir = os.path.dirname(romsearch.__file__) if default_config is None: default_file = os.path.join(mod_dir, "configs", "defaults.yml") default_config = load_yml(default_file) self.default_config = default_config self.ra_file_exts = self.default_config.get("ra_file_exts", []) self.ra_labels = self.default_config.get("ra_labels", []) self.ra_patch_checks = self.default_config.get("ra_patch_checks", []) if regex_config is None: regex_file = os.path.join(mod_dir, "configs", "regex.yml") regex_config = load_yml(regex_file) self.regex_config = regex_config if platform_config is None: platform_config_file = os.path.join( mod_dir, "configs", "platforms", f"{platform}.yml" ) platform_config = load_yml(platform_config_file) self.platform_config = platform_config self.raw_dir = self.config.get("dirs", {}).get("raw_dir", None) if not self.raw_dir: raise ValueError("raw_dir must be specified in config.yml") self.use_dat = self.config.get("romparser", {}).get("use_dat", True) self.use_retool = self.config.get("romparser", {}).get("use_retool", True) self.use_ra_hashes = self.config.get("romparser", {}).get( "use_ra_hashes", False ) self.use_filename = self.config.get("romparser", {}).get("use_filename", True) self.dry_run = self.config.get("romparser", {}).get("dry_run", False) # If we're using the dat file, pull it out here self.dat = dat if self.use_dat and self.dat is None: dat_dir = self.config.get("dirs", {}).get("parsed_dat_dir", None) if dat_dir is None: raise ValueError("parsed_dat_dir must be specified in config.yml") dat_file = os.path.join(dat_dir, f"{platform} (dat parsed).json") if os.path.exists(dat_file): self.dat = load_json(dat_file) # If we're using the retool file, pull it out here self.retool = retool if self.use_retool and self.retool is None: dat_dir = self.config.get("dirs", {}).get("parsed_dat_dir", None) if dat_dir is None: raise ValueError("parsed_dat_dir must be specified in config.yml") retool_file = os.path.join(dat_dir, f"{platform} (retool).json") if os.path.exists(retool_file): retool = load_json(retool_file) self.retool = retool["variants"] # If we're using the RA hashes, pull it out here self.ra_hashes = ra_hashes self.ra_dict = None if self.use_ra_hashes and self.ra_hashes is None: ra_hash_dir = self.config.get("dirs", {}).get("ra_hash_dir", None) if ra_hash_dir is None: raise ValueError("ra_hash_dir must be specified in config.yml") ra_hash_file = os.path.join(ra_hash_dir, f"{platform}.json") if os.path.exists(ra_hash_file): self.ra_hashes = load_json(ra_hash_file) self.hash_method = self.platform_config.get("ra_hash_method", None) self.log_line_sep = log_line_sep self.log_line_length = log_line_length
[docs] def run( self, files, ): """Run the ROM parser""" game_dict = copy.deepcopy(files) self.logger.debug(f"{self.log_line_sep * self.log_line_length}") self.logger.debug( centred_string( f"Running ROMParser for {self.game}", total_length=self.log_line_length ) ) self.logger.debug(f"{self.log_line_sep * self.log_line_length}") for f in files: # # Get the potential title position out for compilations title_pos = files[f].get("title_pos", None) f_parsed = self.parse_file( f=f, file_dict=copy.deepcopy(files[f]), title_pos=title_pos, ) game_dict[f].update(f_parsed) return game_dict
[docs] def parse_file( self, f=None, file_dict=None, title_pos=None, ): """Parse useful info out of a specific file Args: f (str): Filename. Will only use this if something more suitable isn't found file_dict (dict): Dictionary of file properties title_pos (int, optional): Title position for compilations. Defaults to None. """ if file_dict is None: file_dict = {} if self.use_filename: file_dict = self.parse_filename( f=f, file_dict=file_dict, title_pos=title_pos, ) if self.use_retool: file_dict = self.parse_retool(file_dict=file_dict) if self.use_dat: file_dict = self.parse_dat( f=f, file_dict=file_dict, ) # Apply any filters that wouldn't have been applied here file_dict = apply_filters(file_dict) file_dict["has_cheevos"] = False file_dict["patch_file"] = "" if self.use_ra_hashes: file_dict = self.parse_ra_hashes( file_dict=file_dict, ) # Any last minute finalisations self.finalise_file_dict(file_dict) # File modification time full_file_path = os.path.join( self.raw_dir, self.platform, file_dict.get("original_name", f) ) file_time = get_file_time( full_file_path, datetime_format=self.default_config["datetime_format"], ) file_dict["file_mod_time"] = file_time # And note that this thing has been parsed file_dict["is_parsed"] = True # Log out these tags in a nice readable way self.logger.debug(centred_string(f"{f}:", total_length=self.log_line_length)) # Track the various tags we can have true_tags = [] false_tags = [] none_tags = [] str_tags = {} int_tags = {} list_tags = {} for key in file_dict: if isinstance(file_dict[key], bool): if file_dict[key]: true_tags.append(key) else: false_tags.append(key) elif isinstance(file_dict[key], str): str_tags[key] = file_dict[key] elif isinstance(file_dict[key], list): list_tags[key] = file_dict[key] elif isinstance(file_dict[key], int): int_tags[key] = file_dict[key] elif file_dict[key] is None: none_tags.append(key) else: raise ValueError( f"{file_dict[key]} is not something I know how to parse" ) # Log the string tags self.logger.debug( left_aligned_string(f"String tags:", total_length=self.log_line_length) ) for tag in str_tags: if str_tags[tag] == "": continue self.logger.debug( left_aligned_string( f"-> {tag}: {str_tags[tag]}", total_length=self.log_line_length ) ) # Log the list tags self.logger.debug( left_aligned_string(f"List tags:", total_length=self.log_line_length) ) for tag in list_tags: if not list_tags[tag]: continue self.logger.debug( left_aligned_string( f"-> {tag}: {', '.join(str(i) for i in list_tags[tag])}", total_length=self.log_line_length, ) ) # Log the list tags self.logger.debug( left_aligned_string(f"Number tags:", total_length=self.log_line_length) ) for tag in int_tags: self.logger.debug( left_aligned_string( f"-> {tag}: {int_tags[tag]}", total_length=self.log_line_length, ) ) # Log the True bool tags self.logger.debug( left_aligned_string(f"Tagged:", total_length=self.log_line_length) ) for tag in true_tags: self.logger.debug( left_aligned_string(f"-> {tag}", total_length=self.log_line_length) ) # Log the False bool tags self.logger.debug( left_aligned_string(f"Not tagged:", total_length=self.log_line_length) ) for tag in false_tags: self.logger.debug( left_aligned_string(f"-> {tag}", total_length=self.log_line_length) ) # Log any None tags self.logger.debug( left_aligned_string(f"None:", total_length=self.log_line_length) ) for tag in none_tags: self.logger.debug( left_aligned_string(f"-> {tag}", total_length=self.log_line_length) ) self.logger.debug(f"{'-' * self.log_line_length}") return file_dict
[docs] def parse_retool(self, file_dict=None): """Parse info out of the retool file""" if file_dict is None: file_dict = {} if self.retool is None: self.logger.warning(f"{self.log_line_sep * self.log_line_length}") self.logger.warning( centred_string( f"No retool file found for {self.platform}. Skipping", total_length=self.log_line_length, ) ) self.logger.warning(f"{self.log_line_sep * self.log_line_length}") return file_dict # Loop over the variants, see if we get a match found_cat = False for retool_dict in self.retool: if found_cat: continue # If we don't have titles within the dupe dict, skip if "titles" not in retool_dict: continue # Match properly given search terms full_name = copy.deepcopy(file_dict["full_name"]) short_name = copy.deepcopy(file_dict["short_name"]) region_free_name = copy.deepcopy(file_dict["region_free_name"]) for t in retool_dict["titles"]: search_term = t["searchTerm"] match_type = t.get("nameType", None) found_retool_variant = match_retool_search_terms( full_name=full_name, search_term=search_term, short_name=short_name, region_free_name=region_free_name, match_type=match_type, ) if found_retool_variant: retool_cats = retool_dict.get("categories", []) for retool_cat in retool_cats: file_cat = retool_cat.lower().replace(" ", "_") file_dict[file_cat] = True return file_dict
[docs] def parse_dat( self, f=None, file_dict=None, ): """Parse info out of the dat file Args: f (str): Fallback filename file_dict (dict): Dictionary of file info """ if file_dict is None: file_dict = {} f = copy.deepcopy(file_dict.get("original_name", f)) if self.dat is None: self.logger.warning(f"{self.log_line_sep * self.log_line_length}") self.logger.warning( centred_string( f"No dat file found for {self.platform}. Skipping", total_length=self.log_line_length, ) ) self.logger.warning(f"{self.log_line_sep * self.log_line_length}") return file_dict # Remember there aren't zips in the dat entries dat_entry = self.dat.get(f.rstrip(".zip"), None) if not dat_entry: self.logger.warning(f"{self.log_line_sep * self.log_line_length}") self.logger.warning( centred_string( f"No dat entry found for {f}. Skipping", total_length=self.log_line_length, ) ) self.logger.warning(f"{self.log_line_sep * self.log_line_length}") return file_dict dat_categories = self.default_config.get("dat_categories", []) for dat_cat in dat_categories: dat_val = dat_entry.get("category", "") cat_val = dat_val == dat_cat dat_cat_dict = dat_cat.lower().replace(" ", "_") if dat_cat_dict in file_dict: file_dict[dat_cat_dict] = file_dict[dat_cat_dict] | cat_val else: file_dict[dat_cat_dict] = cat_val # Get the checksums out checksums = self.default_config.get("dat_checksums", []) for checksum in checksums: # Because sometimes we have multiple files within the ROM, loop over and append them all rom_entries = dat_entry.get("rom", []) if isinstance(rom_entries, dict): rom_entries = [rom_entries] for rom_entry in rom_entries: if checksum in rom_entry: if checksum not in file_dict: file_dict[checksum] = [] file_dict[checksum].append(rom_entry[checksum]) return file_dict
[docs] def parse_ra_hashes( self, file_dict=None, ): """See if we can find ROMs that support RetroAchievements Note that this requires a bunch of parsing to have already occurred """ # If we don't have a dictionary already, then this won't work if file_dict is None: file_dict = {} return file_dict if self.hash_method is None: self.logger.warning( centred_string( f"RA hash method not defined for {self.platform}", total_length=self.log_line_length, ) ) return file_dict file_dict = self.match_hashes(file_dict) return file_dict
[docs] def match_hashes( self, file_dict, ): """Get whether ROM has cheevos by various potential hash methods Args: f (str): Filename file_dict (dict): Dictionary of ROM descriptions """ has_cheevos = False patch_file = "" if self.hash_method not in ["md5", "custom"]: self.logger.warning( centred_string( f"Cannot currently handle {self.hash_method} hash method", total_length=self.log_line_length, ) ) return has_cheevos, patch_file # Get the hash dict, if we don't already have it if self.ra_dict is None: self.ra_dict = self.get_ra_dict() # Get the potential RA match by name (this won't include potentially patched ROMs) has_cheevos, patch_file = self.get_ra_match( file_dict=file_dict, ) # If we've found something, stop here if has_cheevos: file_dict["has_cheevos"] = has_cheevos file_dict["patch_file"] = patch_file return file_dict # If we're on a custom hash, and we haven't found anything, now look # via parsing the names if self.hash_method == "custom": has_cheevos, patch_file = self.get_parsed_match( file_dict=file_dict, want_patched_files=False ) if has_cheevos: file_dict["has_cheevos"] = has_cheevos file_dict["patch_file"] = patch_file return file_dict # If we still haven't, now look through files to see if we just need # a patch (i.e. the hash will change, but we have the file) has_cheevos, patch_file = self.get_parsed_match( file_dict=file_dict, want_patched_files=True, ) file_dict["has_cheevos"] = has_cheevos file_dict["patch_file"] = patch_file return file_dict
[docs] def get_ra_dict( self, ): """Get a big dictionary of RA hashes with useful info""" # Pull out the particular key we need if self.hash_method == "md5": key = "MD5" elif self.hash_method == "custom": key = "Name" else: raise ValueError(f"Cannot currently handle {self.hash_method} hash method") # Because of inconsistencies between naming schemes, just pull a huge dictionary out here rather than try # to be clever ra_dict = {} for r in self.ra_hashes: for h in self.ra_hashes[r]["Hashes"]: # If the RA list is a subset, then skip if is_ra_subset(r): continue # Use the md5 as the unique key, and then name as the thing we'll match to. # Ensure we lowercase the hash, just to be sure md5 = copy.deepcopy(h["MD5"].lower()) id_name = copy.deepcopy(h[key]) # If for some weird reason there's no ID name, just skip if id_name is None: continue # Also just pull out the ROM name, since we need that later rom_name = copy.deepcopy(h["Name"]) rom_name = rom_name.strip() # Ensure we also lowercase the hash here, if we need to if key in ["MD5"]: id_name = id_name.lower() # If we're dealing with names, there might # be file extensions to strip if key in ["Name"]: for ext in self.ra_file_exts: if id_name.endswith(ext): id_name = id_name.rstrip(ext) # FIXME: Here as a catch-all, hopefully won't be a problem if md5 in ra_dict: raise ValueError(f"Hash {md5} multiply defined") ra_dict[md5] = { "name": id_name, "full_name": rom_name, "dir_name": rom_name.split(" (")[0], "patch_url": h["PatchUrl"], } return ra_dict
[docs] def get_ra_match( self, file_dict, ): """Match a file to RetroAchievements supported files Args: file_dict (dict): Dictionary of ROM descriptions """ has_cheevos = False patch_file = "" # Pull out the particular key we need if self.hash_method == "md5": match_list = file_dict.get("md5", []) elif self.hash_method == "custom": match_list = [file_dict["original_name"].rstrip(".zip")] else: self.logger.warning( centred_string( f"Cannot currently handle {self.hash_method} hash method", total_length=self.log_line_length, ) ) return has_cheevos, patch_file # If we've got nothing, don't waste time if len(match_list) == 0: return has_cheevos, patch_file if self.ra_dict is None: self.ra_dict = self.get_ra_dict() # Again, if there's nothing here just return if len(self.ra_dict) == 0: return has_cheevos, patch_file for m in match_list: for r in self.ra_dict: if m == self.ra_dict[r]["name"]: has_cheevos = True patch_file = copy.deepcopy(self.ra_dict[r]["patch_url"]) if patch_file is None: patch_file = "" return has_cheevos, patch_file
[docs] def get_parsed_match( self, file_dict, want_patched_files=True, ): """Match a file to RetroAchievements supported files that potentially need patches Args: file_dict (dict): Dictionary of ROM descriptions want_patched_files (bool): Whether we're looking for hashes with patches or not. Defaults to True """ has_cheevos = False patch_file = "" if self.ra_dict is None: self.ra_dict = self.get_ra_dict() # Again, if there's nothing here just return if len(self.ra_dict) == 0: return has_cheevos, patch_file multiple_patch_files_found = False for r in self.ra_dict: # If we want patch files, and we don't have them, skip if want_patched_files and self.ra_dict[r]["patch_url"] is None: continue # Conversely, if we don't want patch files, and we do have them, # skip if not want_patched_files and self.ra_dict[r]["patch_url"] is not None: continue if multiple_patch_files_found: continue # If we do have a patch file, make sure it's not a translation if want_patched_files and self.ra_dict[r]["patch_url"] is not None: if "Translation" in self.ra_dict[r]["patch_url"]: continue # Start by ensuring the names up to the first bracket at least match if file_dict["dir_name"] == self.ra_dict[r]["dir_name"]: # Make sure we're not parsing this every time r_is_parsed = self.ra_dict[r].get("is_parsed", False) if not r_is_parsed: r_parsed = self.parse_filename(f=self.ra_dict[r]["full_name"]) r_parsed["is_parsed"] = True self.ra_dict[r].update(r_parsed) r_parsed = self.ra_dict.get(r) # If we're a superset, then ensure the short names also match, since # we need to be more stringent is_superset = file_dict.get("is_superset", False) ra_dict_short_name = get_short_name( self.ra_dict[r]["full_name"], regex_config=self.regex_config, default_config=self.default_config, ) if is_superset and not file_dict["short_name"] == ra_dict_short_name: continue # Force some version info in here, if the RA name doesn't have it if r_parsed["version_no"] == "" and file_dict["version_no"] != "": f_sanitized = get_sanitized_version(file_dict["version_no"]) if Version(f_sanitized) == Version("1"): r_parsed["version_no"] = copy.deepcopy(file_dict["version_no"]) # Now, make sure all the useful checks pass ra_checks_passed = True for check in self.ra_patch_checks: # If we've already failed, then just skip if not ra_checks_passed: continue ra_checks_passed = check_match( file_dict[check], r_parsed[check], checks_passed=ra_checks_passed, ) # After this first pass, also see if any of the regex checks are grouped, # and double-check the sublevel below. This is because we could have e.g. # mismatched modern types (like a GameCube version vs a Wii U Virtual Console # version), which inevitably won't match hashes if ra_checks_passed: if check not in self.regex_config: for r_c in self.regex_config: if not ra_checks_passed: continue r_c_group = self.regex_config[r_c].get("group", None) if r_c_group == check: ra_checks_passed = check_match( file_dict[r_c], r_parsed[r_c], checks_passed=ra_checks_passed, ) if ra_checks_passed: # If we seem to have multiple patch files defined, # then raise a warning and assume there isn't a patch if patch_file != "": self.logger.warning( centred_string( f"Multiple potential patch files found for {file_dict['original_name']}", total_length=self.log_line_length, ) ) has_cheevos = False patch_file = None multiple_patch_files_found = True else: has_cheevos = True patch_file = copy.deepcopy(self.ra_dict[r]["patch_url"]) if patch_file is None: patch_file = "" if patch_file is None: patch_file = "" return has_cheevos, patch_file
[docs] def finalise_file_dict( self, file_dict, ): """Do any last minute finalisation to the file dict""" file_dict = self.set_game_category(file_dict) file_dict = self.set_implicit_languages(file_dict) file_dict = set_english_friendly(file_dict) return file_dict
[docs] def set_game_category( self, file_dict, ): """If a dat category hasn't been set, set it to game""" dat_categories = self.default_config.get("dat_categories", []) for d in dat_categories: d_sanitized = d.lower().replace(" ", "_") if d_sanitized not in file_dict: file_dict[d_sanitized] = False if all( [file_dict[d.lower().replace(" ", "_")] is False for d in dat_categories] ): file_dict["games"] = True return file_dict
[docs] def set_implicit_languages( self, file_dict, ): """Set implicit language from region, if we don't already have languages""" implied_languages = self.default_config.get("implied_languages", {}) # Only set if languages is an empty list if not file_dict["languages"]: for r in file_dict["regions"]: if r in implied_languages: file_dict["languages"].append(implied_languages[r]) return file_dict
[docs] def parse_filename( self, f=None, file_dict=None, title_pos=None, ): """Parse info out of filename Args: f (str): filename. Defaults to None, which will pull the original name out of the dict title_pos (int): Title position for compilations. Defaults to None file_dict (dict): Existing file dictionary. Defaults to None, which will create an empty one """ if file_dict is None: file_dict = {} if "full_name" not in file_dict and f is None: raise ValueError( "Either f needs to be defined, or full_name needs to be in the file dictionary" ) if f is None: # Pull the filename out, which is the full name f = copy.deepcopy(file_dict["full_name"]) # Split file into tags tags = [f"({x}" for x in f.rstrip(".zip").split(" (")][1:] for regex_key in self.regex_config: # Are we potentially using the title position? use_title_pos = False if regex_key in USE_TITLE_POS: use_title_pos = True # Is this something problematic we should be skipping? ignore_names = self.regex_config[regex_key].get("ignore_names", []) if len(ignore_names) != 0: found_ignore_name = False for ignore_name in ignore_names: if found_ignore_name: continue if re.match(ignore_name, f) is not None: found_ignore_name = True if found_ignore_name: continue regex_type = self.regex_config[regex_key].get("type", "bool") search_tags = self.regex_config[regex_key].get("search_tags", True) group = self.regex_config[regex_key].get("group", None) regex_flags = self.regex_config[regex_key].get("flags", "I") transform_pattern = self.regex_config[regex_key].get( "transform_pattern", None ) transform_repl = self.regex_config[regex_key].get("transform_repl", None) dict_default_val = DICT_DEFAULT_VALS.get(regex_type, None) if dict_default_val is None: raise ValueError( f"regex_type should be one of {list(DICT_DEFAULT_VALS.keys())}" ) if regex_key not in file_dict: file_dict[regex_key] = copy.deepcopy(dict_default_val) if regex_flags == "NOFLAG": regex_flags = re.NOFLAG elif regex_flags == "I": regex_flags = re.I else: raise ValueError("regex_flags should be one of 'NOFLAG', 'I'") pattern = self.regex_config[regex_key]["pattern"] pattern_mappings = None if regex_type == "list": if isinstance(self.default_config[regex_key], dict): str_to_join = [ self.default_config[regex_key][key] for key in self.default_config[regex_key] ] pattern_mappings = self.default_config[regex_key] else: str_to_join = copy.deepcopy(self.default_config[regex_key]) pattern = pattern.replace(f"[{regex_key}]", "|".join(str_to_join)) regex = re.compile(pattern, flags=regex_flags) if search_tags: found_tag = False for tag in tags: if found_tag: continue pattern_string = get_pattern_val( regex, tag, regex_type, pattern_mappings=pattern_mappings, title_pos=title_pos, use_title_pos=use_title_pos, ) if pattern_string is not None: if transform_pattern is not None: pattern_string = re.sub( transform_pattern, transform_repl, pattern_string ) file_dict[regex_key] = pattern_string found_tag = True else: pattern_string = get_pattern_val( regex, f, regex_type, pattern_mappings=pattern_mappings, title_pos=title_pos, use_title_pos=use_title_pos, ) if pattern_string is not None: file_dict[regex_key] = pattern_string # Update groups, if needed if group is not None: # We can have multiple groups per-tag, so take that into account if isinstance(group, str): group = [group] for g in group: if g not in file_dict: file_dict[g] = dict_default_val if regex_type == "bool": file_dict[g] = file_dict[g] | file_dict[regex_key] elif regex_type == "str": if file_dict[g] and file_dict[regex_key]: raise ValueError( "Can't combine multiple groups with type str" ) else: file_dict[g] += file_dict[regex_key] elif regex_type == "list": file_dict[g].extend(file_dict[regex_key]) else: raise ValueError( f"regex_type should be one of {list(DICT_DEFAULT_VALS.keys())}" ) return file_dict