v0.2.2 resolve emoji issue in file names

2025-03-22 00:37:10 +00:00
parent 9fa11aea72
commit fab2207eb8
5 changed files with 214 additions and 91 deletions
--- a/cbz_volume_combiner/init.py
+++ b/cbz_volume_combiner/init.py
@@ -4,4 +4,4 @@ from .file_utils import find_cbz_files, fix_missing_files
 from .core import organize_by_volume
 from .volume import create_volume_cbz
-__version__ = "0.2.0"
+__version__ = "0.2.2"
--- a/cbz_volume_combiner/core.py
+++ b/cbz_volume_combiner/core.py
@@ -2,12 +2,23 @@
 import os
 from collections import defaultdict
 from .parsing import parse_manga_filename
 from .file_utils import has_problematic_characters, find_file_by_volume_chapter
 def organize_by_volume(cbz_files, extra_verbose=False):
    """Group CBZ files by manga name and volume."""
    volumes = defaultdict(lambda: defaultdict(list))
    unparsed_files = []
    # First, identify any files with problematic characters
    problematic_files = []
    for cbz_file in cbz_files:
        has_problem, _ = has_problematic_characters(cbz_file)
        if has_problem:
            problematic_files.append(cbz_file)
    if problematic_files and extra_verbose:
        print(f"\nWARNING: Found {len(problematic_files)} problematic filenames that might need special handling.")
    for cbz_file in cbz_files:
        info = parse_manga_filename(cbz_file)
        if info:
@@ -15,6 +26,51 @@ def organize_by_volume(cbz_files, extra_verbose=False):
            volumes[manga_key][info['volume']].append(info)
        else:
            unparsed_files.append(cbz_file)
            # For unparsed files that have problematic characters,
            # try to get volume and chapter from filename pattern directly
            has_problem, _ = has_problematic_characters(cbz_file)
            if has_problem:
                if extra_verbose:
                    print(f"Attempting alternative parsing for problematic file: {os.path.basename(cbz_file)}")
                # Extract basic info using more lenient pattern
                base_filename = os.path.basename(cbz_file)
                # Look for v## and c### patterns
                vol_match = re.search(r'v(\d+)', base_filename)
                chap_match = re.search(r'c(\d+(?:\.\d+)?)', base_filename)
                if vol_match and chap_match:
                    # Extract manga name (everything before v##)
                    vol_pos = base_filename.find(f"v{vol_match.group(1)}")
                    manga_name = base_filename[:vol_pos].strip()
                    # Create a basic info dict
                    vol_num = int(vol_match.group(1))
                    chap_str = chap_match.group(1)
                    try:
                        chap_num = float(chap_str)
                    except ValueError:
                        chap_num = 0
                    if extra_verbose:
                        print(f"  Extracted: manga={manga_name}, vol={vol_num}, chap={chap_str}")
                    info = {
                        'manga_name': manga_name,
                        'volume': vol_num,
                        'chapter': chap_num,
                        'chapter_str': chap_str,
                        'title': '',
                        'group': '',
                        'filename': cbz_file
                    }
                    manga_key = manga_name.lower()
                    volumes[manga_key][vol_num].append(info)
                    # Remove from unparsed files since we handled it
                    unparsed_files.remove(cbz_file)
    # Sort chapters within each volume
    for manga in volumes:
--- a/cbz_volume_combiner/file_utils.py
+++ b/cbz_volume_combiner/file_utils.py
@@ -1,8 +1,23 @@
 # File: /home/code/projects/manga-organizer-1/cbz-volume-combiner/cbz_volume_combiner/file_utils.py
 import os
 import re
 import unicodedata
 from .parsing import parse_manga_filename
 def is_emoji(char):
    """Check if a character is an emoji or other problematic symbol.

    A character is reported as problematic when either of these holds:

    * its Unicode general category is ``So`` (other symbol) or ``Sm``
      (math symbol), or
    * its code point is U+1F000 or higher and it is not a letter.

    Args:
        char: A single-character string.

    Returns:
        bool: True if the character should be treated as problematic.

    Raises:
        TypeError: If ``char`` is not a string of length one (raised by
            ``unicodedata.category``).
    """
    category = unicodedata.category(char)
    if category in ('So', 'Sm'):  # Symbol categories
        return True
    # The range above U+1F000 holds the emoji blocks, but also the
    # supplementary CJK ideograph planes (U+20000 and up, category "Lo").
    # Those are ordinary letters in manga titles and must not be flagged,
    # so letters (categories L*) are excluded from the range check.
    return ord(char) >= 0x1F000 and not category.startswith('L')
 def has_problematic_characters(filename):
    """Report whether a file's base name contains an emoji-like symbol or '?'.

    Only the final path component is inspected; directory names are ignored.

    Returns:
        tuple: ``(True, offending_char)`` for the first offending character
        found, otherwise ``(False, None)``.
    """
    name_only = os.path.basename(filename)
    # Pick the first character that is either flagged by is_emoji() or is a
    # literal question mark; None means the name is clean.
    offender = next(
        (ch for ch in name_only if is_emoji(ch) or ch == '?'),
        None,
    )
    if offender is None:
        return False, None
    return True, offender
 def find_cbz_files(folder_path, recursive=False, extra_verbose=False):
    """Find all CBZ files in the given folder."""
    cbz_files = []
@@ -39,87 +54,118 @@ def fix_missing_files(chapter_infos, folder_path, extra_verbose=False):
            fixed_chapters.append(chapter_info)
            continue
-        # If we're here, the file doesn't exist - get the problematic filename
+        # If file doesn't exist, look for it by volume and chapter
        base_filename = os.path.basename(chapter_info['filename'])
        directory = os.path.dirname(chapter_info['filename'])
        vol_num = chapter_info['volume']
        chap_num = chapter_info['chapter_str']
        if extra_verbose:
-            print(f"Trying to find replacement for: {base_filename}")
+            print(f"Looking for alternative for {os.path.basename(chapter_info['filename'])}")
-        # First approach: Direct check with normalized path
+        # Find file by volume and chapter numbers directly
-        # This is for cases where Python's path handling might be different from the filesystem
+        actual_file = find_file_by_volume_chapter(directory, vol_num, chap_num, extra_verbose)
        found_replacement = False
-        # Look for similar files in the directory
+        if actual_file and os.path.exists(actual_file):
        try:
            # Get all CBZ files in the directory
            cbz_files_in_dir = [f for f in os.listdir(directory) if f.lower().endswith('.cbz')]
            # First try: Look for exact matches with volume and chapter numbers
            vol_num = chapter_info['volume']
            chap_num = chapter_info['chapter_str']
            # Improved pattern matching for volume and chapter numbers
            vol_pattern = f"v{vol_num:02d}"  # e.g., "v07"
            chap_pattern = f"c{chap_num}"    # e.g., "c037"
            # Try to find a direct match first
            for file in cbz_files_in_dir:
                # Check volume and chapter patterns
                if vol_pattern in file and chap_pattern in file:
                    potential_path = os.path.join(directory, file)
                    if os.path.exists(potential_path):
                        if extra_verbose:
                            print(f"  Found direct match: {file}")
                        new_chapter_info = dict(chapter_info)
                        new_chapter_info['filename'] = potential_path
                        fixed_chapters.append(new_chapter_info)
                        found_replacement = True
                        break
            # If no direct match, try more flexible matching
            if not found_replacement:
                # Fuzzy match approach
                chapter_pattern = rf"v0*{vol_num}\s+c0*{chap_num.lstrip('0')}"
                for file in cbz_files_in_dir:
                    # Remove special characters for comparison
                    clean_file = file
                    for char in "'?,":
                        clean_file = clean_file.replace(char, '')
                    # Strip special characters from the pattern too
                    clean_pattern = chapter_pattern
                    for char in "'?,":
                        clean_pattern = clean_pattern.replace(char, '')
                    # Try matching with cleaned strings
                    if re.search(chapter_pattern, file, re.IGNORECASE) or re.search(clean_pattern, clean_file, re.IGNORECASE):
                        potential_path = os.path.join(directory, file)
                        if os.path.exists(potential_path):
                            if extra_verbose:
                                print(f"  Found fuzzy match: {file}")
                            new_chapter_info = dict(chapter_info)
                            new_chapter_info['filename'] = potential_path
                            fixed_chapters.append(new_chapter_info)
                            found_replacement = True
                            break
            # Last resort: List all files and let the user see what's available
            if not found_replacement and extra_verbose:
                print("  No match found. Available files in directory:")
                for idx, file in enumerate(sorted(cbz_files_in_dir)):
                    if idx < 20:  # Limit to first 20 files to avoid spam
                        print(f"    - {file}")
                    else:
                        print(f"    ... and {len(cbz_files_in_dir) - 20} more files")
                        break
        except Exception as e:
            if extra_verbose:
-                print(f"  Error while searching for replacement: {str(e)}")
+                print(f"Found alternative file: {os.path.basename(actual_file)}")
-        if not found_replacement:
+            new_chapter_info = dict(chapter_info)
            new_chapter_info['filename'] = actual_file
            fixed_chapters.append(new_chapter_info)
        else:
            if extra_verbose:
                print(f"No alternative found for volume {vol_num}, chapter {chap_num}")
            unresolved_chapters.append(chapter_info)
-    return fixed_chapters, unresolved_chapters
+    return fixed_chapters, unresolved_chapters
 def find_file_by_volume_chapter(directory, volume_num, chapter_num, extra_verbose=False):
    """Find a file in a directory by its volume and chapter number only.

    The rest of the file name (title, group, emoji, odd punctuation) is
    ignored; only the ``v<volume>`` and ``c<chapter>`` markers are compared.
    Matching is case-insensitive, tolerates any amount of zero padding, and
    respects number boundaries, so volume 1 never matches ``v10`` and
    chapter 10 never matches ``c100`` or ``c10.5``.

    Two tiers are tried in order:

    1. "exact": the ``v``/``c`` marker must not be preceded by a letter or
       digit (i.e. it starts a token).
    2. "simplified": the marker may be glued to preceding text
       (``Titlev01c001.cbz``).

    Candidates are examined in sorted order so the result is deterministic
    when several files match.

    Args:
        directory: Directory to search. An empty string means the current
            directory (this is what ``os.path.dirname`` returns for a bare
            file name).
        volume_num: Volume number, as an int or a numeric string.
        chapter_num: Chapter number as a string such as ``"037"`` or
            ``"10.5"`` (other types are converted with ``str``).
        extra_verbose: When true, print progress and diagnostics.

    Returns:
        The path of the first matching ``.cbz`` file (``directory`` joined
        with the file name), or None if nothing matches, the directory is
        missing or unreadable, or the numbers are unusable.
    """
    search_dir = directory or os.curdir
    if not os.path.isdir(search_dir):
        if extra_verbose:
            print(f"Directory does not exist: {directory}")
        return None

    if extra_verbose:
        print(f"Looking for volume {volume_num}, chapter {chapter_num} in {directory}")

    try:
        volume = int(volume_num)
    except (TypeError, ValueError):
        if extra_verbose:
            print(f"Error searching for file: invalid volume number {volume_num!r}")
        return None

    chapter = str(chapter_num).strip()
    if not chapter:
        if extra_verbose:
            print("Error searching for file: empty chapter number")
        return None
    # Drop zero padding so "037" and "37" are equivalent; the pattern below
    # re-allows any number of leading zeros.  Keep one zero for "0"/"0.5".
    chapter = chapter.lstrip('0')
    if not chapter or chapter.startswith('.'):
        chapter = '0' + chapter

    # A number ends where the digits end; for chapters a following ".<digit>"
    # would make it a different (decimal) chapter, so that is rejected too.
    vol_core = rf"0*{volume}(?!\d)"
    chap_core = rf"0*{re.escape(chapter)}(?!\d|\.\d)"
    tiers = (
        (
            "exact",
            re.compile(rf"(?<![A-Za-z0-9])v{vol_core}", re.IGNORECASE),
            re.compile(rf"(?<![A-Za-z0-9])c{chap_core}", re.IGNORECASE),
        ),
        (
            "simplified",
            re.compile(rf"v{vol_core}", re.IGNORECASE),
            re.compile(rf"c{chap_core}", re.IGNORECASE),
        ),
    )

    try:
        files = sorted(
            f for f in os.listdir(search_dir) if f.lower().endswith('.cbz')
        )
    except OSError as e:
        if extra_verbose:
            print(f"Error searching for file: {e}")
        return None

    for label, vol_re, chap_re in tiers:
        for file in files:
            if vol_re.search(file) and chap_re.search(file):
                if extra_verbose:
                    print(f"Found {label} match: {file}")
                return os.path.join(directory, file)

    if extra_verbose:
        print(f"No match found for volume {volume_num}, chapter {chapter_num}")
        print("Available files:")
        for file in files[:10]:
            print(f"  - {file}")
        if len(files) > 10:
            print(f"  ... and {len(files) - 10} more")
    return None
 def create_clean_filename_mapping(cbz_files, extra_verbose=False):
    """Create a mapping of problematic filenames to clean alternatives.

    For every path whose base name contains an emoji-like symbol or ``?``
    (as judged by ``has_problematic_characters``), a replacement path in the
    same directory is proposed with each offending character turned into
    ``_``.  Nothing is renamed on disk; only the mapping is built.

    A proposed name is made unique by appending ``_1``, ``_2``, ... before
    the extension until it neither exists on disk nor has already been
    assigned to another file during this call.

    Args:
        cbz_files: Iterable of file paths to inspect.
        extra_verbose: When true, print each mapping that is created.

    Returns:
        dict: ``{original_path: clean_path}`` for the problematic files
        only; paths without problematic characters are omitted.
    """
    filename_mapping = {}
    # Targets handed out so far.  Two different problematic names can
    # sanitise to the same string (e.g. two files differing only in which
    # emoji they contain); checking the disk alone would map both to one path.
    claimed = set()
    for file_path in cbz_files:
        if file_path in filename_mapping:
            # Same input listed twice: keep the target already chosen.
            continue
        has_problem, _ = has_problematic_characters(file_path)
        if not has_problem:
            continue

        directory, filename = os.path.split(file_path)

        # Replace every problematic character with an underscore.
        clean_filename = ''.join(
            '_' if is_emoji(c) or c == '?' else c for c in filename
        )

        # Make sure we don't create duplicates, on disk or within the mapping.
        base, ext = os.path.splitext(clean_filename)
        candidate = os.path.join(directory, clean_filename)
        counter = 1
        while os.path.normcase(candidate) in claimed or os.path.exists(candidate):
            clean_filename = f"{base}_{counter}{ext}"
            candidate = os.path.join(directory, clean_filename)
            counter += 1

        claimed.add(os.path.normcase(candidate))
        filename_mapping[file_path] = candidate
        if extra_verbose:
            print(f"Mapped: {filename} -> {clean_filename}")
    return filename_mapping
--- a/cbz_volume_combiner/volume.py
+++ b/cbz_volume_combiner/volume.py
@@ -4,7 +4,7 @@ import zipfile
 import tempfile
 import shutil
 from tqdm import tqdm
-from .file_utils import fix_missing_files
+from .file_utils import fix_missing_files, find_file_by_volume_chapter, has_problematic_characters
 def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, force=False, verbose=False, extra_verbose=False):
    """Combine multiple chapter CBZ files into a single volume CBZ."""
@@ -88,11 +88,36 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
                if verbose:
                    print(f"Extracting chapter {chapter_info['chapter_str']}")
                # IMPORTANT: Handle file matching right before extraction
                file_exists = os.path.exists(chapter_info['filename'])
                if not file_exists:
                    # Try to find the file by volume and chapter
                    directory = os.path.dirname(chapter_info['filename'])
                    vol_num = chapter_info['volume']
                    chap_num = chapter_info['chapter_str']
                    if extra_verbose:
                        print(f"File not found: {chapter_info['filename']}")
                        print(f"Looking for alternative file with volume {vol_num}, chapter {chap_num}")
                    # Find by volume and chapter numbers only
                    actual_file = find_file_by_volume_chapter(directory, vol_num, chap_num, extra_verbose)
                    if actual_file and os.path.exists(actual_file):
                        chapter_info['filename'] = actual_file
                        file_exists = True
                        if extra_verbose:
                            print(f"Found alternative file: {actual_file}")
                # Skip if file still doesn't exist
                if not file_exists:
                    if extra_verbose:
                        print(f"Skipping chapter {chapter_info['chapter_str']} - file not found")
                    continue
                if extra_verbose:
                    print(f"File: {chapter_info['filename']}")
                    if not os.path.exists(chapter_info['filename']):
                        print(f"  ERROR: File does not exist!")
                        continue
                try:
                    # Extract the chapter
@@ -100,16 +125,13 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
                        file_list = sorted(zf.namelist())
                        if extra_verbose:
-                            print(f"  Contains {len(file_list)} files:")
+                            print(f"  Contains {len(file_list)} files")
                        for i, file_name in enumerate(file_list):
                            if file_name.endswith('/'):  # Skip directories
                                continue
-                            if extra_verbose and i < 10:  # Show first 10 files only
+                            # Extract with a standardized naming pattern
                                print(f"    - {file_name}")
                            # Extract with a standardized naming pattern: chapterXXX_pageYYY.ext
                            base, ext = os.path.splitext(os.path.basename(file_name))
                            new_name = f"chapter{chapter_info['chapter_str'].zfill(3)}_{i+1:03d}{ext}"
@@ -120,11 +142,10 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
                            except Exception as file_error:
                                if extra_verbose:
                                    print(f"    ERROR extracting {file_name}: {str(file_error)}")
                        if extra_verbose and len(file_list) > 10:
                            print(f"    ... and {len(file_list) - 10} more files")
                except Exception as e:
-                    return False, f"Error extracting chapter {chapter_info['chapter_str']}: {str(e)}"
+                    if extra_verbose:
                        print(f"Error extracting chapter: {e}")
                    continue  # Skip this chapter but continue with others
            # Create the volume CBZ
            if verbose:
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
    name="cbz-volume-combiner",
-    version="0.2.0",
+    version="0.2.2",
    packages=find_packages(),
    scripts=['bin/cbz-volume-combiner'],
    install_requires=[