# cbz-volume-combiner/cbz_volume_combiner/parsing.py

import os
import re

# Typographic characters mapped to the plain ASCII character the filename
# patterns expect. Keys are written as escapes so the table cannot be
# corrupted if this file is re-saved in a non-UTF-8 encoding.
_FILENAME_CHAR_MAP = str.maketrans({
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark / typographic apostrophe
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u2013': '-',   # en dash
    '\u2014': '-',   # em dash
    '\u00a0': ' ',   # no-break space
    # A literal question mark is treated as a character that was lost to a
    # bad encoding and is blanked out, which is what this function has
    # always done for '?'.
    '?': ' ',
})


def normalize_filename(filename):
    """Return *filename* with typographic punctuation replaced by ASCII.

    Curly quotes become straight quotes, en/em dashes become ``-``, and
    no-break spaces and literal ``?`` become a regular space. Every other
    character is returned unchanged.

    Args:
        filename: The file name (or any string) to normalize.

    Returns:
        The normalized string; an empty string is returned as-is.
    """
    # No replacement character is itself a key of the table, so a single
    # translate pass gives the same result as chained str.replace calls.
    return filename.translate(_FILENAME_CHAR_MAP)

# Filename layouts recognised by parse_manga_filename, tried in this order.

# "Manga Name v01 c001[.5] [Title] [[Group]].cbz"
# The trailing "\s*(.*)" may be empty, so names without a title or group
# (e.g. "Manga Name v04 c021.1.cbz") are matched by this pattern as well.
_SPACED_RE = re.compile(r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\s*(.*)\.cbz$')

# "Vol. 01 Ch. 001[.5] [- Title].cbz" (case-insensitive)
_VOL_CH_RE = re.compile(
    r'Vol\.\s*(\d+)\s+Ch\.\s*(\d+(?:\.\d+)?)\s*(?:-\s*(.*))?\.cbz$',
    re.IGNORECASE,
)

# "Manga_Name_v01c001[a][.5][_(Publisher)][[tag]].cbz"
_UNDERSCORE_RE = re.compile(
    r'(.+?)_v(\d+)c(\d+[a-z]?(?:\.\d+)?)(?:_\((.+?)\))?(?:\[(.+?)\])?\.cbz$'
)

# "Manga Name Vol 01, 001[.5]cTitle.cbz" (case-insensitive)
_COMMA_VOL_RE = re.compile(
    r'^(.*?)\s+Vol\s+(\d+),\s+(\d+(?:\.\d+)?)[cC](.*?)\.cbz$',
    re.IGNORECASE,
)

# A single "[...]" tag at the very end of the text, with no nested brackets.
_TRAILING_GROUP_RE = re.compile(r'\[([^\[\]]*)\]$')

# Leading "NNN" or "NNN.N" of a chapter string such as "019.1" or "50b".
_LEADING_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def _chapter_sort_value(chapter_str):
    """Return the leading number of *chapter_str* as a float, or 0 if none."""
    leading = _LEADING_NUMBER_RE.match(chapter_str)
    return float(leading.group(0)) if leading else 0


def _split_title_and_group(rest):
    """Split "Title [Group]" into ``(title, group)``; group is '' if absent."""
    trailing = _TRAILING_GROUP_RE.search(rest)
    if not trailing:
        return rest, ''
    # Slice at the bracket itself so a tag with no preceding space (or no
    # title at all) does not lose or leak a character.
    return rest[:trailing.start()].strip(), trailing.group(1)


def _manga_info(filename, manga_name, volume, chapter_str, title='', group=''):
    """Build the result dict shared by every recognised filename layout."""
    return {
        'manga_name': manga_name,
        'volume': int(volume),
        'chapter': _chapter_sort_value(chapter_str),
        'chapter_str': chapter_str,
        'title': title,
        'group': group,
        'filename': filename,
    }


def parse_manga_filename(filename):
    """Extract volume, chapter, title and group information from a manga filename.

    The base name of *filename* is normalized with ``normalize_filename`` and
    then matched against these layouts, in order:

    1. ``Manga Name v01 c001 Title [Group].cbz`` (title and group optional)
    2. ``Vol. 01 Ch. 001 - Title.cbz`` (manga name taken from the parent
       directory name of *filename*)
    3. ``Manga_Name_v01c001_(Publisher)[tag].cbz``
    4. ``Manga Name Vol 01, 001cTitle.cbz``

    Args:
        filename: Path or bare file name of a ``.cbz`` file.

    Returns:
        A dict with the keys ``manga_name``, ``volume`` (int), ``chapter``
        (float used for sorting), ``chapter_str`` (chapter text as written),
        ``title``, ``group`` and ``filename`` (the argument, unchanged), or
        ``None`` when no layout matches. When nothing matches and *filename*
        exists on disk, a warning is printed.
    """
    base_filename = os.path.basename(filename)
    normalized_filename = normalize_filename(base_filename)

    spaced = _SPACED_RE.match(normalized_filename)
    if spaced:
        manga_name, volume, chapter_str, rest = spaced.groups()
        title, group = _split_title_and_group(rest.strip())
        return _manga_info(
            filename, manga_name.strip(), volume, chapter_str, title, group
        )

    vol_ch = _VOL_CH_RE.match(normalized_filename)
    if vol_ch:
        # The title group is optional; default='' turns a missing title into ''.
        volume, chapter_str, title = vol_ch.groups(default='')
        # This layout carries no series name, so use the containing directory.
        manga_name = os.path.basename(os.path.dirname(filename))
        return _manga_info(filename, manga_name, volume, chapter_str, title)

    underscored = _UNDERSCORE_RE.match(normalized_filename)
    if underscored:
        raw_name, volume, chapter_str, publisher, tag = underscored.groups(default='')
        manga_name = raw_name.replace('_', ' ').strip()

        # NOTE(review): the underscore pattern requires a chapter directly
        # after the volume ("_vNNcNN"), so this branch only fires for names
        # that also contain a separate "_vNN_-_Title_(" segment. Confirm
        # whether volume-only TPB names were meant to be handled here.
        title = ''
        if '_-_' in normalized_filename and '(Dark_Horse_TPB)' in normalized_filename:
            title_match = re.search(r'_v\d+_-_(.+?)_\(', normalized_filename)
            if title_match:
                title = title_match.group(1).replace('_', ' ')

        # "Publisher [tag]" when both are present, otherwise whichever exists.
        group = f"{publisher} [{tag}]" if publisher and tag else publisher or tag
        return _manga_info(filename, manga_name, volume, chapter_str, title, group)

    comma_vol = _COMMA_VOL_RE.match(normalized_filename)
    if comma_vol:
        manga_name, volume, chapter_str, title = comma_vol.groups()
        return _manga_info(
            filename, manga_name.strip(), volume, chapter_str, title.strip()
        )

    if os.path.exists(filename):
        # For debugging: report real files whose names could not be parsed.
        print(f"WARNING: Could not parse filename: {base_filename}")

    return None