From 4a00923d078509d85fae956a90f1663f11b68520 Mon Sep 17 00:00:00 2001 From: Ben Date: Tue, 8 Apr 2025 19:36:39 +0000 Subject: [PATCH] Add support for de-duplicating chapters and selecting the best version --- cbz_volume_combiner/parsing.py | 126 +++++++++++++++++++++++++++++++-- cbz_volume_combiner/volume.py | 78 +++++++++++++++++++- 2 files changed, 199 insertions(+), 5 deletions(-) diff --git a/cbz_volume_combiner/parsing.py b/cbz_volume_combiner/parsing.py index 6ac37d2..75d47f0 100644 --- a/cbz_volume_combiner/parsing.py +++ b/cbz_volume_combiner/parsing.py @@ -1,4 +1,3 @@ -# File: /home/code/projects/manga-organizer-1/cbz-volume-combiner/cbz_volume_combiner/parsing.py import os import re @@ -26,15 +25,15 @@ def parse_manga_filename(filename): # Try to normalize the filename to handle encoding issues normalized_filename = normalize_filename(base_filename) - # Use a more flexible pattern to handle apostrophes and other special characters - pattern = r'(.*?)\s+v(\d+)\s+c(\d+[.\d-]*)\s+(.*)\.cbz$' + # Updated pattern to handle decimal chapter numbers like c019.1 + pattern = r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\s*(.*)\.cbz$' match = re.match(pattern, normalized_filename) if match: manga_name = match.group(1).strip() volume = int(match.group(2)) chapter_str = match.group(3) - rest = match.group(4) + rest = match.group(4).strip() # Try to extract title and group if available group_match = re.search(r'\[(.*?)\]$', rest) @@ -64,6 +63,125 @@ def parse_manga_filename(filename): 'filename': filename } + # Try an alternative pattern for filenames like "Manga Name v04 c021.1.cbz" (no title/group) + alt_pattern = r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\.cbz$' + alt_match = re.match(alt_pattern, normalized_filename) + + if alt_match: + manga_name = alt_match.group(1).strip() + volume = int(alt_match.group(2)) + chapter_str = alt_match.group(3) + + try: + chapter = float(chapter_str) + except ValueError: + chapter = 0 + + return { + 'manga_name': manga_name, + 'volume': volume, + 
'chapter': chapter, + 'chapter_str': chapter_str, + 'title': '', + 'group': '', + 'filename': filename + } + + # Add a new pattern for "Vol. XX Ch. YYY - Title.cbz" format + vol_ch_pattern = r'Vol\.\s*(\d+)\s+Ch\.\s*(\d+(?:\.\d+)?)\s*(?:-\s*(.*))?\.cbz$' + vol_ch_match = re.match(vol_ch_pattern, normalized_filename, re.IGNORECASE) + + if vol_ch_match: + # For this format, we need to extract the manga name from the directory + manga_name = os.path.basename(os.path.dirname(filename)) + volume = int(vol_ch_match.group(1)) + chapter_str = vol_ch_match.group(2) + title = vol_ch_match.group(3) if vol_ch_match.group(3) else "" + + try: + chapter = float(chapter_str) + except ValueError: + chapter = 0 + + return { + 'manga_name': manga_name, + 'volume': volume, + 'chapter': chapter, + 'chapter_str': chapter_str, + 'title': title, + 'group': '', + 'filename': filename + } + + # New pattern for underscore_format_vXXcYY_(ScanGroup)[tag].cbz + underscore_pattern = r'(.+?)_v(\d+)c(\d+[a-z]?(?:\.\d+)?)(?:_\((.+?)\))?(?:\[(.+?)\])?\.cbz$' + underscore_match = re.match(underscore_pattern, normalized_filename) + + if underscore_match: + # Extract parts from the filename + manga_name = underscore_match.group(1).replace('_', ' ').strip() + volume = int(underscore_match.group(2)) + chapter_str = underscore_match.group(3) + publisher = underscore_match.group(4) if underscore_match.group(4) else "" + tag = underscore_match.group(5) if underscore_match.group(5) else "" + + # Handle special case for titles with volume TPB format + if '_-_' in normalized_filename and '(Dark_Horse_TPB)' in normalized_filename: + # This is likely a tankōbon/volume title format like "Shadow_Star_v04_-_Nothing_But_the_Truth_(Dark_Horse_TPB)[m-s].cbz" + title_match = re.search(r'_v\d+_-_(.+?)_\(', normalized_filename) + title = title_match.group(1).replace('_', ' ') if title_match else "" + else: + title = "" + + # Create group/publisher string + group = f"{publisher} [{tag}]" if publisher and tag else 
publisher or tag + + # Handle chapter numbers with letter suffixes like "50b" + try: + # Extract just the numeric part for sorting + numeric_part = re.match(r'(\d+(?:\.\d+)?)', chapter_str) + if (numeric_part): + chapter = float(numeric_part.group(1)) + else: + chapter = 0 + except ValueError: + chapter = 0 + + return { + 'manga_name': manga_name, + 'volume': volume, + 'chapter': chapter, + 'chapter_str': chapter_str, + 'title': title, + 'group': group, + 'filename': filename + } + + # Add a new pattern for "Manga Name Vol XX, YYYcTitle.cbz" format + comma_vol_pattern = r'^(.*?)\s+Vol\s+(\d+),\s+(\d+(?:\.\d+)?)[cC](.*?)\.cbz$' + comma_vol_match = re.match(comma_vol_pattern, normalized_filename, re.IGNORECASE) + + if comma_vol_match: + manga_name = comma_vol_match.group(1).strip() + volume = int(comma_vol_match.group(2)) + chapter_str = comma_vol_match.group(3) + title = comma_vol_match.group(4).strip() + + try: + chapter = float(chapter_str) + except ValueError: + chapter = 0 + + return { + 'manga_name': manga_name, + 'volume': volume, + 'chapter': chapter, + 'chapter_str': chapter_str, + 'title': title, + 'group': '', + 'filename': filename + } + if os.path.exists(filename): # For debugging: print the filename that couldn't be parsed print(f"WARNING: Could not parse filename: {base_filename}") diff --git a/cbz_volume_combiner/volume.py b/cbz_volume_combiner/volume.py index fab05c9..a4fa725 100644 --- a/cbz_volume_combiner/volume.py +++ b/cbz_volume_combiner/volume.py @@ -29,6 +29,55 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo return True, "Skipped (already exists)" try: + # De-duplicate chapters (keep only one copy of each chapter number) + if len(chapter_infos) > 1: + # Group chapters by their chapter number + chapter_groups = {} + for chapter in chapter_infos: + chapter_num = str(chapter['chapter']) # Convert to string for exact matching + if chapter_num not in chapter_groups: + chapter_groups[chapter_num] = [] + 
def select_best_chapter(chapter_group, extra_verbose=False):
    """
    Select the best chapter from a group of duplicate chapters.

    Strategy:
    1. Among the files whose size can be read from disk, pick the largest
       one (bigger files are often better quality). On a tie the entry that
       appears first in ``chapter_group`` wins.
    2. If no file can be read, return the first entry unchanged.

    Args:
        chapter_group: Non-empty sequence of chapter dicts; each dict must
            have a 'filename' key holding the path of the chapter file.
        extra_verbose: When true, print which file was selected (or a
            warning when none of the files is available).

    Returns:
        The selected chapter dict (the same object that was passed in).

    Raises:
        IndexError: If ``chapter_group`` is empty.

    Side effects:
        Every chapter whose size could be read gets a 'filesize' key
        holding the size in bytes.
    """
    readable_chapters = []
    for chapter in chapter_group:
        # Stat each file exactly once. A separate exists() check followed by
        # getsize() can raise if the file disappears or becomes unreadable
        # in between; such a file is simply treated as missing.
        try:
            size = os.path.getsize(chapter['filename'])
        except (OSError, ValueError):
            # ValueError covers malformed paths (e.g. embedded NUL bytes),
            # which os.path.exists() also reports as "does not exist".
            continue
        chapter['filesize'] = size
        readable_chapters.append(chapter)

    if not readable_chapters:
        if extra_verbose:
            print("Warning: None of the duplicate chapters exist on disk")
        return chapter_group[0]  # Return the first one and hope for the best

    # max() returns the first maximal element, which matches the result of a
    # stable descending sort without ordering the whole list.
    best = max(readable_chapters, key=lambda ch: ch['filesize'])

    if extra_verbose:
        print(f"Selected largest file: {os.path.basename(best['filename'])} ({best['filesize'] / 1024:.1f} KB)")

    return best