import os
import re

# Typographic characters that commonly show up in downloaded filenames, mapped
# to their plain-ASCII equivalents.  Each key must be distinct: a dict literal
# silently keeps only the last value for a repeated key.
_FILENAME_TRANSLATION = str.maketrans({
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark / typographic apostrophe
    '\u201c': '"',  # left double quotation mark
    '\u201d': '"',  # right double quotation mark
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
    '\u00a0': ' ',  # no-break space
    # A literal question mark has always ended up as a space here (it was the
    # only mapping that survived the old duplicate-key table); keep that.
    '?': ' ',
})

# "Manga Name v## c###[.#] [optional title] [optional group].cbz"
_STANDARD_RE = re.compile(
    r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\s*(.*)\.cbz$'
)
# Trailing "[group]" at the end of the title/group remainder.
_TRAILING_GROUP_RE = re.compile(r'\[(.*?)\]$')
# "Vol. XX Ch. YYY - Title.cbz" (manga name comes from the parent directory).
_VOL_CH_RE = re.compile(
    r'Vol\.\s*(\d+)\s+Ch\.\s*(\d+(?:\.\d+)?)\s*(?:-\s*(.*))?\.cbz$',
    re.IGNORECASE,
)
# "underscore_format_vXXcYY_(ScanGroup)[tag].cbz"
_UNDERSCORE_RE = re.compile(
    r'(.+?)_v(\d+)c(\d+[a-z]?(?:\.\d+)?)(?:_\((.+?)\))?(?:\[(.+?)\])?\.cbz$'
)
# Volume title in names like "..._v04_-_Nothing_But_the_Truth_(...)".
_TPB_TITLE_RE = re.compile(r'_v\d+_-_(.+?)_\(')
# "Manga Name Vol XX, YYYcTitle.cbz"
_COMMA_VOL_RE = re.compile(
    r'^(.*?)\s+Vol\s+(\d+),\s+(\d+(?:\.\d+)?)[cC](.*?)\.cbz$',
    re.IGNORECASE,
)
# Leading numeric part of a chapter string ("50b" -> "50", "019.1" -> "019.1").
_LEADING_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def normalize_filename(filename):
    """Return *filename* with typographic punctuation replaced by ASCII.

    Curly single/double quotes become straight quotes, en/em dashes become
    ``-``, and no-break spaces become regular spaces.  A literal ``?`` is
    replaced by a space.  All other characters are left untouched.
    """
    return filename.translate(_FILENAME_TRANSLATION)


def _chapter_number(chapter_str):
    """Return the leading number of *chapter_str* as a float, or 0 if none."""
    number = _LEADING_NUMBER_RE.match(chapter_str)
    if number is None:
        return 0
    try:
        return float(number.group())
    except ValueError:
        return 0


def _split_title_and_group(rest):
    """Split the text after the chapter number into ``(title, group)``.

    The group is the content of a bracketed suffix at the very end of *rest*;
    the title is whatever precedes the last ``[``.  Without such a suffix the
    whole of *rest* is the title and the group is empty.
    """
    group_match = _TRAILING_GROUP_RE.search(rest)
    if group_match is None:
        return rest.strip(), ''
    # Slice right at the bracket and strip, instead of assuming exactly one
    # separator character before it (which mangled "[Group]" and "Title[Group]").
    title = rest[:rest.rfind('[')].strip()
    return title, group_match.group(1)


def _build_result(filename, manga_name, volume, chapter_str, title='', group=''):
    """Assemble the dict returned by parse_manga_filename for one match."""
    return {
        'manga_name': manga_name,
        'volume': volume,
        'chapter': _chapter_number(chapter_str),
        'chapter_str': chapter_str,
        'title': title,
        'group': group,
        'filename': filename,
    }


def parse_manga_filename(filename):
    """Extract volume, chapter and title information from a manga filename.

    Only the basename of *filename* is matched, after normalize_filename().
    The following layouts are tried in order:

    1. ``Manga Name v## c###[.#] [title] [[group]].cbz``
    2. ``Vol. XX Ch. YYY[ - Title].cbz`` (manga name is the parent directory)
    3. ``Manga_Name_vXXcYY[_(Publisher)][[tag]].cbz``
    4. ``Manga Name Vol XX, YYYcTitle.cbz``

    Returns:
        A dict with the keys ``manga_name``, ``volume`` (int), ``chapter``
        (float, or 0 when no number can be read), ``chapter_str`` (chapter as
        written in the name), ``title``, ``group`` and ``filename`` (the
        argument exactly as passed in), or ``None`` when no layout matches.
        When nothing matches and the path exists on disk, a warning is
        printed to stdout.
    """
    base_filename = os.path.basename(filename)
    normalized_filename = normalize_filename(base_filename)

    # 1. Space-separated "v## c###" layout.  An empty remainder (no title and
    #    no group) is handled here too, so no separate pattern is needed.
    standard = _STANDARD_RE.match(normalized_filename)
    if standard:
        title, group = _split_title_and_group(standard.group(4).strip())
        return _build_result(
            filename,
            manga_name=standard.group(1).strip(),
            volume=int(standard.group(2)),
            chapter_str=standard.group(3),
            title=title,
            group=group,
        )

    # 2. "Vol. XX Ch. YYY - Title": the file name carries no series name, so
    #    it is taken from the containing directory.
    vol_ch = _VOL_CH_RE.match(normalized_filename)
    if vol_ch:
        return _build_result(
            filename,
            manga_name=os.path.basename(os.path.dirname(filename)),
            volume=int(vol_ch.group(1)),
            chapter_str=vol_ch.group(2),
            title=vol_ch.group(3) or '',
        )

    # 3. Underscore-separated layout with optional "(Publisher)" and "[tag]".
    underscore = _UNDERSCORE_RE.match(normalized_filename)
    if underscore:
        publisher = underscore.group(4) or ''
        tag = underscore.group(5) or ''

        # Volume-title special case for Dark Horse TPB releases.
        # NOTE(review): the documented example
        # "Shadow_Star_v04_-_Nothing_But_the_Truth_(Dark_Horse_TPB)[m-s].cbz"
        # has no "c##" part and so does not reach this branch - confirm
        # whether the underscore pattern should accept chapter-less names.
        title = ''
        if '_-_' in normalized_filename and '(Dark_Horse_TPB)' in normalized_filename:
            tpb_title = _TPB_TITLE_RE.search(normalized_filename)
            if tpb_title:
                title = tpb_title.group(1).replace('_', ' ')

        if publisher and tag:
            group = f"{publisher} [{tag}]"
        else:
            group = publisher or tag

        return _build_result(
            filename,
            manga_name=underscore.group(1).replace('_', ' ').strip(),
            volume=int(underscore.group(2)),
            # May carry a letter suffix such as "50b"; only the numeric
            # prefix is used for the float 'chapter' value.
            chapter_str=underscore.group(3),
            title=title,
            group=group,
        )

    # 4. "Manga Name Vol XX, YYYcTitle".
    comma_vol = _COMMA_VOL_RE.match(normalized_filename)
    if comma_vol:
        return _build_result(
            filename,
            manga_name=comma_vol.group(1).strip(),
            volume=int(comma_vol.group(2)),
            chapter_str=comma_vol.group(3),
            title=comma_vol.group(4).strip(),
        )

    if os.path.exists(filename):
        # For debugging: report real files whose names could not be parsed.
        print(f"WARNING: Could not parse filename: {base_filename}")
    return None