# cbz-volume-combiner/cbz_volume_combiner/file_utils.py

import os
import re
import unicodedata
from .parsing import parse_manga_filename

def is_emoji(char):
    """Report whether a single character counts as an emoji/problem symbol.

    A character qualifies when its Unicode general category is ``So``
    (Symbol, other) or ``Sm`` (Symbol, math), or when its code point lies
    above U+1F000.

    Args:
        char: A one-character string.

    Returns:
        True if the character is treated as problematic, False otherwise.
    """
    # Symbol categories first, then the high code-point (emoji) range.
    if unicodedata.category(char) in ('So', 'Sm'):
        return True
    return ord(char) > 0x1F000

def has_problematic_characters(filename):
    """Look for an emoji/symbol or a '?' in the final path component.

    Only the basename of ``filename`` is inspected; directory parts are
    ignored.

    Args:
        filename: A file name or path.

    Returns:
        ``(True, char)`` where ``char`` is the first offending character,
        or ``(False, None)`` when the basename is clean.
    """
    name_only = os.path.basename(filename)
    # Stop at the first offender, scanning left to right.
    offender = next(
        (ch for ch in name_only if is_emoji(ch) or ch == '?'),
        None,
    )
    if offender is None:
        return False, None
    return True, offender

def find_cbz_files(folder_path, recursive=False, extra_verbose=False):
    """Find all CBZ files in the given folder.

    A name is accepted when it ends in ``.cbz`` (case-insensitive) and is
    not a directory. Both search modes apply the same rule, so a directory
    that happens to be called ``something.cbz`` is never reported.

    Args:
        folder_path: Folder to search.
        recursive: When True, descend into sub-folders via ``os.walk``;
            otherwise only the immediate entries of ``folder_path`` are
            considered.
        extra_verbose: When True, print progress messages.

    Returns:
        A list of paths, each joined onto the folder it was found in.

    Raises:
        OSError: In non-recursive mode, if ``folder_path`` cannot be listed
            (``os.listdir`` raises). ``os.walk`` ignores listing errors, so
            recursive mode yields an empty list instead.
    """
    cbz_files = []

    if recursive:
        if extra_verbose:
            print(f"Recursively searching for CBZ files in {folder_path}")

        for root, _, files in os.walk(folder_path):
            cbz_files.extend(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith('.cbz')
            )
    else:
        if extra_verbose:
            print(f"Searching for CBZ files in {folder_path} (non-recursive)")

        for name in os.listdir(folder_path):
            if not name.lower().endswith('.cbz'):
                continue
            full_path = os.path.join(folder_path, name)
            # os.listdir also returns sub-directories; skip them so this
            # mode agrees with the recursive one, which only sees files.
            if os.path.isdir(full_path):
                continue
            cbz_files.append(full_path)

    if extra_verbose:
        print(f"Found {len(cbz_files)} CBZ files")

    return cbz_files

def fix_missing_files(chapter_infos, folder_path, extra_verbose=False):
    """Re-locate chapters whose recorded file no longer exists.

    Chapters whose ``'filename'`` exists are passed through untouched. For
    the rest, the directory of the recorded path is searched for a file
    with the same volume and chapter via ``find_file_by_volume_chapter``.

    Args:
        chapter_infos: Iterable of mappings carrying at least the keys
            ``'filename'``, ``'volume'`` and ``'chapter_str'``.
        folder_path: Accepted for interface compatibility; the body does
            not currently read it.
        extra_verbose: When True, print what is being searched and found.

    Returns:
        ``(fixed_chapters, unresolved_chapters)``. Entries that needed a
        substitute are shallow copies with ``'filename'`` replaced; the
        unresolved list holds the original entries that could not be found.
    """
    resolved = []
    still_missing = []

    for info in chapter_infos:
        recorded_path = info['filename']

        # Nothing to repair when the recorded path is present.
        if os.path.exists(recorded_path):
            resolved.append(info)
            continue

        # Otherwise search next to the recorded path by volume/chapter.
        parent_dir = os.path.dirname(recorded_path)
        volume = info['volume']
        chapter = info['chapter_str']

        if extra_verbose:
            print(f"Looking for alternative for {os.path.basename(info['filename'])}")

        candidate = find_file_by_volume_chapter(parent_dir, volume, chapter, extra_verbose)

        if not (candidate and os.path.exists(candidate)):
            if extra_verbose:
                print(f"No alternative found for volume {volume}, chapter {chapter}")
            still_missing.append(info)
            continue

        if extra_verbose:
            print(f"Found alternative file: {os.path.basename(candidate)}")

        # Copy so the caller's entry keeps its original filename.
        patched = dict(info)
        patched['filename'] = candidate
        resolved.append(patched)

    return resolved, still_missing

def _first_matching_name(names, predicate):
    """Return the first entry of *names* accepted by *predicate*, or None."""
    return next((name for name in names if predicate(name)), None)


def find_file_by_volume_chapter(directory, volume_num, chapter_num, extra_verbose=False):
    """Find a CBZ file in a directory by its volume and chapter number only.

    Candidates are the ``.cbz`` entries of ``directory`` (case-insensitive
    extension), examined in sorted order so the result does not depend on
    the order in which the OS lists the directory. Matching passes run from
    most to least precise, and the first hit of the first successful pass
    wins:

    1. Strict: ``v<volume>`` and ``c<chapter>`` (any number of leading
       zeros, any letter case) that are not followed by further digits, so
       volume 2 does not match ``v20`` and chapter ``10`` matches neither
       ``c100`` nor ``c10.5``.
    2. Legacy substring test on the raw name (``v02``/``v2`` and
       ``c<chapter>``/zero-padded ``c<chapter>``).
    3. Legacy regex on the lower-cased name requiring a trailing non-digit.
    4. Legacy comparison on the name reduced to lower-case alphanumerics.

    Passes 2-4 are kept so that any name found previously is still found;
    they are loose and can accept e.g. ``v20`` for volume 2 when no precise
    candidate exists.

    Args:
        directory: Directory to search (not recursive).
        volume_num: Volume number; must be an integer, since it is
            formatted with ``:02d``.
        chapter_num: Chapter number as a string (e.g. ``"7"`` or ``"10.5"``).
        extra_verbose: When True, print diagnostics.

    Returns:
        The matching file's path joined onto ``directory``, or None when the
        directory does not exist, nothing matches, or the search fails
        (unlistable directory, arguments of the wrong type).
    """
    if not os.path.exists(directory):
        if extra_verbose:
            print(f"Directory does not exist: {directory}")
        return None

    if extra_verbose:
        print(f"Looking for volume {volume_num}, chapter {chapter_num} in {directory}")

    try:
        # Sorted so that the chosen file is deterministic.
        files = sorted(f for f in os.listdir(directory) if f.lower().endswith('.cbz'))

        # All patterns are loop-invariant; build them once.
        v_patterns = [f"v{volume_num:02d}", f"v{volume_num}"]
        c_patterns = [f"c{chapter_num}"]
        if chapter_num.isdigit():
            c_patterns.append(f"c{chapter_num.zfill(3)}")

        # The look-aheads reject a longer number ("v20" for 2, "c100" for 10)
        # and, for chapters, a decimal continuation ("c10.5" for 10).
        strict_volume = re.compile(
            fr'v0*{re.escape(str(volume_num))}(?!\d)', re.IGNORECASE
        )
        strict_chapter = re.compile(
            fr'c0*{re.escape(chapter_num)}(?!\.?\d)', re.IGNORECASE
        )

        def strict(name):
            return bool(strict_volume.search(name) and strict_chapter.search(name))

        def legacy_substring(name):
            return (any(v in name for v in v_patterns)
                    and any(c in name for c in c_patterns))

        def legacy_regex(name):
            lowered = name.lower()
            return bool(
                re.search(fr'v0*{volume_num}[^0-9]', lowered)
                and re.search(fr'c0*{chapter_num}[^0-9]', lowered)
            )

        def legacy_simplified(name):
            clean_name = ''.join(ch.lower() for ch in name if ch.isalnum())
            return f"v{volume_num}" in clean_name and f"c{chapter_num}" in clean_name

        passes = (
            ("exact", strict),
            ("exact", legacy_substring),
            ("regex", legacy_regex),
            ("simplified", legacy_simplified),
        )
        for label, predicate in passes:
            found = _first_matching_name(files, predicate)
            if found is not None:
                if extra_verbose:
                    print(f"Found {label} match: {found}")
                return os.path.join(directory, found)

        if extra_verbose:
            print(f"No match found for volume {volume_num}, chapter {chapter_num}")
            print("Available files:")
            for file in files[:10]:
                print(f"  - {file}")
            if len(files) > 10:
                print(f"  ... and {len(files) - 10} more")

        return None
    except (OSError, TypeError, ValueError, AttributeError, re.error) as e:
        # Best-effort lookup: an unlistable directory or arguments of the
        # wrong type simply mean "not found" for the caller.
        if extra_verbose:
            print(f"Error searching for file: {e}")
        return None

def create_clean_filename_mapping(cbz_files, extra_verbose=False):
    """Create a mapping of problematic filenames to clean alternatives.

    For every path whose basename contains an emoji/symbol or a ``?``, a
    replacement name is built in the same directory with each offending
    character turned into ``_``. If that name is already taken, either by
    a file on disk or by a target handed out earlier in this same call,
    ``_1``, ``_2``, ... is appended before the extension until it is free.

    Nothing is renamed here; the function only computes the mapping.

    Args:
        cbz_files: Iterable of file paths to inspect.
        extra_verbose: When True, print each mapping as it is created.

    Returns:
        A dict mapping each problematic path to its clean path. Paths
        without problematic characters are not included.
    """
    filename_mapping = {}
    # Targets promised so far. They do not exist on disk yet, so
    # os.path.exists alone cannot see that two sources (e.g. names that
    # differ only in which emoji they contain) would collide.
    reserved_targets = set()

    for file_path in cbz_files:
        # A path listed twice keeps the target it was given the first time.
        if file_path in filename_mapping:
            continue

        has_problem, _ = has_problematic_characters(file_path)
        if not has_problem:
            continue

        directory, filename = os.path.split(file_path)

        # Replace every problematic character with an underscore.
        clean_filename = ''.join(
            '_' if is_emoji(c) or c == '?' else c for c in filename
        )

        # Pick the first candidate that is free both on disk and among
        # the targets already assigned in this call.
        base, ext = os.path.splitext(clean_filename)
        counter = 1
        target = os.path.join(directory, clean_filename)
        while os.path.exists(target) or target in reserved_targets:
            clean_filename = f"{base}_{counter}{ext}"
            target = os.path.join(directory, clean_filename)
            counter += 1

        reserved_targets.add(target)
        filename_mapping[file_path] = target

        if extra_verbose:
            print(f"Mapped: {filename} -> {clean_filename}")

    return filename_mapping