# File: /home/code/projects/manga-organizer-1/cbz-volume-combiner/cbz_volume_combiner/parsing.py
import os
import re

# Typographic characters mapped to plain ASCII equivalents.
# The keys are written as \u escapes on purpose: literal "smart" characters in
# this table are easily destroyed by a lossy re-encoding of the source file,
# which collapses every key to '?' and silently leaves a one-entry dict.
_CHAR_REPLACEMENTS = str.maketrans({
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark / typographic apostrophe
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u2013': '-',   # en dash
    '\u2014': '-',   # em dash
    '\u00a0': ' ',   # no-break space
    '?': ' ',        # literal question mark; kept as a space for compatibility
})

# manga_name v## c### [optional title] [optional group].cbz
# The title/group part is optional so "Name v01 c001.cbz" is accepted too.
_FILENAME_RE = re.compile(r'(.*?)\s+v(\d+)\s+c(\d+[.\d-]*)(?:\s+(.*))?\.cbz$')

# A trailing "[group]" tag. Brackets are excluded from the capture so that only
# the LAST bracketed tag is taken when several are present.
_GROUP_RE = re.compile(r'\[([^\[\]]*)\]$')


def normalize_filename(filename):
    """Normalize a filename to handle encoding issues and special characters.

    Curly quotes, en/em dashes and no-break spaces are replaced with their
    ASCII counterparts, and a literal ``?`` is replaced with a space.

    Args:
        filename: The file name (or any string) to normalize.

    Returns:
        The string with every character of the replacement table substituted.
    """
    return filename.translate(_CHAR_REPLACEMENTS)


def _parse_chapter(chapter_str):
    """Convert a chapter token such as "005.5" or "005-006" to a float."""
    try:
        return float(chapter_str)
    except ValueError:
        pass
    try:
        # Take the first number for ranges like "005-006".
        return float(chapter_str.split('-')[0])
    except ValueError:
        # Fallback for unparseable chapter numbers (e.g. "1.2.3").
        return 0


def parse_manga_filename(filename):
    """Extract volume, chapter and title information from a manga filename.

    The expected layout of the base name is
    ``manga_name v## c### [optional title] [optional group].cbz``, where the
    group is a trailing tag in square brackets.

    Args:
        filename: Path or bare file name of the archive. Only the base name
            is parsed; the value is returned unchanged under ``'filename'``.

    Returns:
        A dict with the keys ``manga_name``, ``volume`` (int), ``chapter``
        (float, or 0 when the chapter token cannot be converted),
        ``chapter_str`` (the raw chapter token), ``title``, ``group`` and
        ``filename``; or ``None`` when the name does not match the layout.
        When it does not match and the path exists on disk, a warning is
        printed.
    """
    base_filename = os.path.basename(filename)

    # Normalize first so typographic characters do not get in the way.
    normalized_filename = normalize_filename(base_filename)

    match = _FILENAME_RE.match(normalized_filename)
    if match is None:
        if os.path.exists(filename):
            # For debugging: print the filename that couldn't be parsed
            print(f"WARNING: Could not parse filename: {base_filename}")
        return None

    chapter_str = match.group(3)
    # group(4) is None when nothing follows the chapter number.
    rest = match.group(4) or ""

    group_match = _GROUP_RE.search(rest)
    if group_match:
        group = group_match.group(1)
        # Slice at the tag's real start; arithmetic on rfind() breaks when
        # the tag is at index 0 or is not preceded by a space.
        title = rest[:group_match.start()].strip()
    else:
        group = ""
        title = rest.strip()

    return {
        'manga_name': match.group(1).strip(),
        'volume': int(match.group(2)),
        'chapter': _parse_chapter(chapter_str),
        'chapter_str': chapter_str,
        'title': title,
        'group': group,
        'filename': filename,
    }