v0.2.2 resolve emoji issue in file names

2025-03-22 00:37:10 +00:00
parent 9fa11aea72
commit fab2207eb8
5 changed files with 214 additions and 91 deletions
--- a/cbz_volume_combiner/file_utils.py
+++ b/cbz_volume_combiner/file_utils.py
@@ -1,8 +1,23 @@
-# File: /home/code/projects/manga-organizer-1/cbz-volume-combiner/cbz_volume_combiner/file_utils.py
 import os
 import re
+import unicodedata
 from .parsing import parse_manga_filename

+def is_emoji(char):
+    """Check if a character is an emoji or other problematic symbol.
+
+    Args:
+        char: A single-character string.
+
+    Returns:
+        True when ``char`` is a non-ASCII character in Unicode category
+        ``So`` (other symbol) or ``Sm`` (math symbol), or a non-letter code
+        point at or above U+1F000; False otherwise.
+    """
+    code_point = ord(char)
+
+    # ASCII is never reported: '+', '<', '=', '>', '|' and '~' are all in
+    # category 'Sm' but are ordinary characters, not emoji.
+    if code_point < 0x80:
+        return False
+
+    category = unicodedata.category(char)
+    if category in ('So', 'Sm'):
+        return True
+
+    # U+1F000 and above contains the emoji/pictograph blocks, but also the
+    # supplementary ideographs (category 'Lo'), which are text rather than
+    # symbols, so letters are excluded from the range check.
+    return code_point >= 0x1F000 and not category.startswith('L')
+
+def has_problematic_characters(filename):
+    """Report whether the base name of ``filename`` holds an emoji or a '?'.
+
+    Only the final path component is inspected; directory parts are ignored.
+
+    Returns:
+        ``(True, char)`` with the first offending character found, or
+        ``(False, None)`` when the base name contains none.
+    """
+    name_only = os.path.basename(filename)
+    culprit = next(
+        (symbol for symbol in name_only if is_emoji(symbol) or symbol == '?'),
+        None,
+    )
+    if culprit is None:
+        return False, None
+    return True, culprit
+
 def find_cbz_files(folder_path, recursive=False, extra_verbose=False):
    """Find all CBZ files in the given folder."""
    cbz_files = []
@@ -39,87 +54,118 @@ def fix_missing_files(chapter_infos, folder_path, extra_verbose=False):
            fixed_chapters.append(chapter_info)
            continue
            
-        # If we're here, the file doesn't exist - get the problematic filename
-        base_filename = os.path.basename(chapter_info['filename'])
+        # If file doesn't exist, look for it by volume and chapter
        directory = os.path.dirname(chapter_info['filename'])
+        vol_num = chapter_info['volume']
+        chap_num = chapter_info['chapter_str']
        
        if extra_verbose:
-            print(f"Trying to find replacement for: {base_filename}")
+            print(f"Looking for alternative for {os.path.basename(chapter_info['filename'])}")
        
-        # First approach: Direct check with normalized path
-        # This is for cases where Python's path handling might be different from the filesystem
-        found_replacement = False
+        # Find file by volume and chapter numbers directly
+        actual_file = find_file_by_volume_chapter(directory, vol_num, chap_num, extra_verbose)
        
-        # Look for similar files in the directory
-        try:
-            # Get all CBZ files in the directory
-            cbz_files_in_dir = [f for f in os.listdir(directory) if f.lower().endswith('.cbz')]
-            
-            # First try: Look for exact matches with volume and chapter numbers
-            vol_num = chapter_info['volume']
-            chap_num = chapter_info['chapter_str']
-            
-            # Improved pattern matching for volume and chapter numbers
-            vol_pattern = f"v{vol_num:02d}"  # e.g., "v07"
-            chap_pattern = f"c{chap_num}"    # e.g., "c037"
-            
-            # Try to find a direct match first
-            for file in cbz_files_in_dir:
-                # Check volume and chapter patterns
-                if vol_pattern in file and chap_pattern in file:
-                    potential_path = os.path.join(directory, file)
-                    if os.path.exists(potential_path):
-                        if extra_verbose:
-                            print(f"  Found direct match: {file}")
-                        new_chapter_info = dict(chapter_info)
-                        new_chapter_info['filename'] = potential_path
-                        fixed_chapters.append(new_chapter_info)
-                        found_replacement = True
-                        break
-            
-            # If no direct match, try more flexible matching
-            if not found_replacement:
-                # Fuzzy match approach
-                chapter_pattern = rf"v0*{vol_num}\s+c0*{chap_num.lstrip('0')}"
-                
-                for file in cbz_files_in_dir:
-                    # Remove special characters for comparison
-                    clean_file = file
-                    for char in "'?,":
-                        clean_file = clean_file.replace(char, '')
-                    
-                    # Strip special characters from the pattern too
-                    clean_pattern = chapter_pattern
-                    for char in "'?,":
-                        clean_pattern = clean_pattern.replace(char, '')
-                    
-                    # Try matching with cleaned strings
-                    if re.search(chapter_pattern, file, re.IGNORECASE) or re.search(clean_pattern, clean_file, re.IGNORECASE):
-                        potential_path = os.path.join(directory, file)
-                        if os.path.exists(potential_path):
-                            if extra_verbose:
-                                print(f"  Found fuzzy match: {file}")
-                            new_chapter_info = dict(chapter_info)
-                            new_chapter_info['filename'] = potential_path
-                            fixed_chapters.append(new_chapter_info)
-                            found_replacement = True
-                            break
-            
-            # Last resort: List all files and let the user see what's available
-            if not found_replacement and extra_verbose:
-                print("  No match found. Available files in directory:")
-                for idx, file in enumerate(sorted(cbz_files_in_dir)):
-                    if idx < 20:  # Limit to first 20 files to avoid spam
-                        print(f"    - {file}")
-                    else:
-                        print(f"    ... and {len(cbz_files_in_dir) - 20} more files")
-                        break
-                        
-        except Exception as e:
+        if actual_file and os.path.exists(actual_file):
            if extra_verbose:
-                print(f"  Error while searching for replacement: {str(e)}")
+                print(f"Found alternative file: {os.path.basename(actual_file)}")
                
-        if not found_replacement:
+            new_chapter_info = dict(chapter_info)
+            new_chapter_info['filename'] = actual_file
+            fixed_chapters.append(new_chapter_info)
+        else:
+            if extra_verbose:
+                print(f"No alternative found for volume {vol_num}, chapter {chap_num}")
            unresolved_chapters.append(chapter_info)
            
-    return fixed_chapters, unresolved_chapters
+    return fixed_chapters, unresolved_chapters
+
+def _number_token_pattern(letter, number, gap=''):
+    """Compile a case-insensitive pattern for ``letter`` + zero-padded ``number``."""
+    # (?<![a-z])       : the marker letter must not be the tail of a word.
+    # (?![0-9]|\.[0-9]): the number must end here, so "7" does not match
+    #                    "70" and "37" does not match "37.5".
+    return re.compile(
+        rf'(?<![a-z]){letter}{gap}0*{re.escape(number)}(?![0-9]|\.[0-9])',
+        re.IGNORECASE,
+    )
+
+
+def find_file_by_volume_chapter(directory, volume_num, chapter_num, extra_verbose=False):
+    """Find a CBZ file in a directory by its volume and chapter number only.
+
+    File names are matched on a ``v<volume>`` token and a ``c<chapter>``
+    token, ignoring the rest of the name. Leading zeros are ignored on both
+    sides, and a number only matches when it is complete (volume 7 does not
+    match ``v70``; chapter ``037`` does not match ``c037.5``).
+
+    Args:
+        directory: Directory to search. An empty string means the current
+            directory.
+        volume_num: Volume number, as an int or a string of digits.
+        chapter_num: Chapter number as a string, e.g. ``"037"`` or ``"037.5"``.
+        extra_verbose: When True, print progress and diagnostics.
+
+    Returns:
+        ``os.path.join(directory, name)`` for the first match in sorted
+        order, or None when the directory is missing or unreadable, the
+        numbers are unusable, or nothing matches.
+    """
+    # dirname() of a bare file name is '', which os.listdir() rejects.
+    search_dir = directory or os.curdir
+    if not os.path.isdir(search_dir):
+        if extra_verbose:
+            print(f"Directory does not exist: {directory}")
+        return None
+
+    if extra_verbose:
+        print(f"Looking for volume {volume_num}, chapter {chapter_num} in {directory}")
+
+    # Normalise both numbers to their unpadded form; the patterns re-allow
+    # any amount of zero padding via "0*".
+    volume_text = str(volume_num).strip().lstrip('0') or '0'
+    chapter_text = '' if chapter_num is None else str(chapter_num).strip()
+    chapter_core = chapter_text.lstrip('0')
+    if not chapter_core or chapter_core.startswith('.'):
+        chapter_core = '0' + chapter_core  # "000" -> "0", "0.5" -> "0.5"
+
+    if not volume_text.isdigit() or not chapter_text:
+        if extra_verbose:
+            print(f"Error searching for file: invalid volume/chapter "
+                  f"({volume_num!r}, {chapter_num!r})")
+        return None
+
+    try:
+        # Sorted so the result does not depend on os.listdir() ordering.
+        files = sorted(f for f in os.listdir(search_dir) if f.lower().endswith('.cbz'))
+    except OSError as e:
+        if extra_verbose:
+            print(f"Error searching for file: {e}")
+        return None
+
+    # Strictest tier first; the second tier tolerates punctuation or spaces
+    # between the marker letter and its number ("v-07", "c. 037").
+    tiers = (
+        ("exact", ''),
+        ("simplified", r'[\W_]*'),
+    )
+    for label, gap in tiers:
+        volume_re = _number_token_pattern('v', volume_text, gap)
+        chapter_re = _number_token_pattern('c', chapter_core, gap)
+        for file in files:
+            if volume_re.search(file) and chapter_re.search(file):
+                if extra_verbose:
+                    print(f"Found {label} match: {file}")
+                return os.path.join(directory, file)
+
+    if extra_verbose:
+        print(f"No match found for volume {volume_num}, chapter {chapter_num}")
+        print("Available files:")
+        for file in files[:10]:
+            print(f"  - {file}")
+        if len(files) > 10:
+            print(f"  ... and {len(files) - 10} more")
+
+    return None
+
+def create_clean_filename_mapping(cbz_files, extra_verbose=False):
+    """Create a mapping of problematic filenames to clean alternatives.
+
+    A file is problematic when ``has_problematic_characters`` says so. Its
+    clean name replaces every emoji/symbol character and every '?' with '_'.
+    No files are renamed here; only the mapping is built.
+
+    Args:
+        cbz_files: Iterable of file paths to inspect.
+        extra_verbose: When True, print each mapping that is created.
+
+    Returns:
+        Dict mapping each problematic path to a clean path in the same
+        directory. Targets are unique: none exists on disk and no two
+        sources share one.
+    """
+    filename_mapping = {}
+    # Targets already handed out in this call. Checking the disk alone is
+    # not enough: "a<emoji1>.cbz" and "a<emoji2>.cbz" both sanitise to
+    # "a_.cbz", and neither target exists yet.
+    assigned_targets = set()
+
+    for file_path in cbz_files:
+        has_problem, _ = has_problematic_characters(file_path)
+        if not has_problem:
+            continue
+
+        directory = os.path.dirname(file_path)
+        filename = os.path.basename(file_path)
+
+        # Replace each problematic character with an underscore.
+        clean_filename = ''.join(
+            '_' if is_emoji(c) or c == '?' else c for c in filename
+        )
+
+        # Append _1, _2, ... until the target is free both on disk and
+        # within this mapping.
+        base, ext = os.path.splitext(clean_filename)
+        counter = 1
+        target = os.path.join(directory, clean_filename)
+        while os.path.exists(target) or target in assigned_targets:
+            clean_filename = f"{base}_{counter}{ext}"
+            target = os.path.join(directory, clean_filename)
+            counter += 1
+
+        assigned_targets.add(target)
+        filename_mapping[file_path] = target
+
+        if extra_verbose:
+            print(f"Mapped: {filename} -> {clean_filename}")
+
+    return filename_mapping