From 4a00923d078509d85fae956a90f1663f11b68520 Mon Sep 17 00:00:00 2001
From: Ben <code@ben.io>
Date: Tue, 8 Apr 2025 19:36:39 +0000
Subject: [PATCH] Add support for de-duplicating chapters and selecting the
 best version

---
 cbz_volume_combiner/parsing.py | 126 +++++++++++++++++++++++++++++++--
 cbz_volume_combiner/volume.py  |  78 +++++++++++++++++++-
 2 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/cbz_volume_combiner/parsing.py b/cbz_volume_combiner/parsing.py
index 6ac37d2..75d47f0 100644
--- a/cbz_volume_combiner/parsing.py
+++ b/cbz_volume_combiner/parsing.py
@@ -1,4 +1,3 @@
-# File: /home/code/projects/manga-organizer-1/cbz-volume-combiner/cbz_volume_combiner/parsing.py
 import os
 import re
 
@@ -26,15 +25,15 @@ def parse_manga_filename(filename):
     # Try to normalize the filename to handle encoding issues
     normalized_filename = normalize_filename(base_filename)
     
-    # Use a more flexible pattern to handle apostrophes and other special characters
-    pattern = r'(.*?)\s+v(\d+)\s+c(\d+[.\d-]*)\s+(.*)\.cbz$'
+    # Updated pattern to handle decimal chapter numbers like c019.1
+    pattern = r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\s*(.*)\.cbz$'
     match = re.match(pattern, normalized_filename)
     
     if match:
         manga_name = match.group(1).strip()
         volume = int(match.group(2))
         chapter_str = match.group(3)
-        rest = match.group(4)
+        rest = match.group(4).strip()
         
         # Try to extract title and group if available
         group_match = re.search(r'\[(.*?)\]$', rest)
@@ -64,6 +63,125 @@ def parse_manga_filename(filename):
             'filename': filename
         }
     
+    # Try an alternative pattern for filenames like "Manga Name v04 c021.1.cbz" (no title/group)
+    alt_pattern = r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\.cbz$'
+    alt_match = re.match(alt_pattern, normalized_filename)
+    
+    if alt_match:
+        manga_name = alt_match.group(1).strip()
+        volume = int(alt_match.group(2))
+        chapter_str = alt_match.group(3)
+        
+        try:
+            chapter = float(chapter_str)
+        except ValueError:
+            chapter = 0
+        
+        return {
+            'manga_name': manga_name,
+            'volume': volume,
+            'chapter': chapter,
+            'chapter_str': chapter_str,
+            'title': '',
+            'group': '',
+            'filename': filename
+        }
+    
+    # Add a new pattern for "Vol. XX Ch. YYY - Title.cbz" format
+    vol_ch_pattern = r'Vol\.\s*(\d+)\s+Ch\.\s*(\d+(?:\.\d+)?)\s*(?:-\s*(.*))?\.cbz$'
+    vol_ch_match = re.match(vol_ch_pattern, normalized_filename, re.IGNORECASE)
+    
+    if vol_ch_match:
+        # For this format, we need to extract the manga name from the directory
+        manga_name = os.path.basename(os.path.dirname(filename))
+        volume = int(vol_ch_match.group(1))
+        chapter_str = vol_ch_match.group(2)
+        title = vol_ch_match.group(3) if vol_ch_match.group(3) else ""
+        
+        try:
+            chapter = float(chapter_str)
+        except ValueError:
+            chapter = 0
+        
+        return {
+            'manga_name': manga_name,
+            'volume': volume,
+            'chapter': chapter,
+            'chapter_str': chapter_str,
+            'title': title,
+            'group': '',
+            'filename': filename
+        }
+    
+    # New pattern for underscore_format_vXXcYY_(ScanGroup)[tag].cbz
+    underscore_pattern = r'(.+?)_v(\d+)c(\d+[a-z]?(?:\.\d+)?)(?:_\((.+?)\))?(?:\[(.+?)\])?\.cbz$'
+    underscore_match = re.match(underscore_pattern, normalized_filename)
+    
+    if underscore_match:
+        # Extract parts from the filename
+        manga_name = underscore_match.group(1).replace('_', ' ').strip()
+        volume = int(underscore_match.group(2))
+        chapter_str = underscore_match.group(3)
+        publisher = underscore_match.group(4) if underscore_match.group(4) else ""
+        tag = underscore_match.group(5) if underscore_match.group(5) else ""
+        
+        # Handle special case for titles with volume TPB format
+        if '_-_' in normalized_filename and '(Dark_Horse_TPB)' in normalized_filename:
+            # This is likely a tankōbon/volume title format like "Shadow_Star_v04_-_Nothing_But_the_Truth_(Dark_Horse_TPB)[m-s].cbz"
+            title_match = re.search(r'_v\d+_-_(.+?)_\(', normalized_filename)
+            title = title_match.group(1).replace('_', ' ') if title_match else ""
+        else:
+            title = ""
+        
+        # Create group/publisher string
+        group = f"{publisher} [{tag}]" if publisher and tag else publisher or tag
+        
+        # Handle chapter numbers with letter suffixes like "50b"
+        try:
+            # Extract just the numeric part for sorting
+            numeric_part = re.match(r'(\d+(?:\.\d+)?)', chapter_str)
+            if (numeric_part):
+                chapter = float(numeric_part.group(1))
+            else:
+                chapter = 0
+        except ValueError:
+            chapter = 0
+        
+        return {
+            'manga_name': manga_name,
+            'volume': volume,
+            'chapter': chapter,
+            'chapter_str': chapter_str,
+            'title': title,
+            'group': group,
+            'filename': filename
+        }
+    
+    # Add a new pattern for "Manga Name Vol XX,  YYYcTitle.cbz" format
+    comma_vol_pattern = r'^(.*?)\s+Vol\s+(\d+),\s+(\d+(?:\.\d+)?)[cC](.*?)\.cbz$'
+    comma_vol_match = re.match(comma_vol_pattern, normalized_filename, re.IGNORECASE)
+
+    if comma_vol_match:
+        manga_name = comma_vol_match.group(1).strip()
+        volume = int(comma_vol_match.group(2))
+        chapter_str = comma_vol_match.group(3)
+        title = comma_vol_match.group(4).strip()
+
+        try:
+            chapter = float(chapter_str)
+        except ValueError:
+            chapter = 0
+
+        return {
+            'manga_name': manga_name,
+            'volume': volume,
+            'chapter': chapter,
+            'chapter_str': chapter_str,
+            'title': title,
+            'group': '',
+            'filename': filename
+        }
+
     if os.path.exists(filename):
         # For debugging: print the filename that couldn't be parsed
         print(f"WARNING: Could not parse filename: {base_filename}")
diff --git a/cbz_volume_combiner/volume.py b/cbz_volume_combiner/volume.py
index fab05c9..a4fa725 100644
--- a/cbz_volume_combiner/volume.py
+++ b/cbz_volume_combiner/volume.py
@@ -29,6 +29,55 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
         return True, "Skipped (already exists)"
     
     try:
+        # De-duplicate chapters (keep only one copy of each chapter number)
+        if len(chapter_infos) > 1:
+            # Group chapters by their chapter number
+            chapter_groups = {}
+            for chapter in chapter_infos:
+                chapter_num = str(chapter['chapter'])  # Convert to string for exact matching
+                if chapter_num not in chapter_groups:
+                    chapter_groups[chapter_num] = []
+                chapter_groups[chapter_num].append(chapter)
+            
+            # Check for duplicates
+            duplicates_found = False
+            for chapter_num, group in chapter_groups.items():
+                if len(group) > 1:
+                    duplicates_found = True
+                    break
+            
+            if duplicates_found:
+                # Create a new de-duplicated chapter list
+                unique_chapters = []
+                
+                if verbose:
+                    print("Found duplicate chapters, selecting one copy of each:")
+                
+                for chapter_num, group in chapter_groups.items():
+                    if len(group) > 1:
+                        # We have duplicates for this chapter
+                        if verbose:
+                            print(f"  Chapter {chapter_num} has {len(group)} copies:")
+                            for i, ch in enumerate(group):
+                                print(f"    {i+1}. {os.path.basename(ch['filename'])}")
+                        
+                        # Select the best version - the largest copy that exists on disk
+                        selected_chapter = select_best_chapter(group, extra_verbose)
+                        
+                        if verbose:
+                            print(f"  Selected: {os.path.basename(selected_chapter['filename'])}")
+                        
+                        unique_chapters.append(selected_chapter)
+                    else:
+                        # Only one copy, just add it
+                        unique_chapters.append(group[0])
+                
+                # Replace the original chapter list with the de-duplicated one
+                chapter_infos = unique_chapters
+                
+                if verbose:
+                    print(f"De-duplicated chapter list now has {len(chapter_infos)} chapters")
+        
         # Verify all chapter files exist before starting and attempt to fix missing files
         missing_files = []
         for chapter_info in chapter_infos:
@@ -177,4 +226,31 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
         
         return True, "Created"
     except Exception as e:
-        return False, str(e)
\ No newline at end of file
+        return False, str(e)
+
+def select_best_chapter(chapter_group, extra_verbose=False):
+    """
+    Pick which copy of a duplicated chapter to keep.
+
+    Copies whose file is missing on disk are ignored; among the rest the
+    largest file wins (bigger scans are often better quality), with ties
+    going to the earliest entry. If no copy exists on disk, the first
+    entry is returned as a fallback. The chapter dicts are not modified.
+    """
+    sized = []  # (chapter, size in bytes) for every copy found on disk
+    for chapter in chapter_group:
+        try:
+            sized.append((chapter, os.path.getsize(chapter['filename'])))
+        except OSError:
+            continue  # file is missing (or vanished): not a candidate
+
+    if not sized:
+        if extra_verbose:
+            print("Warning: None of the duplicate chapters exist on disk")
+        return chapter_group[0]  # Return the first one and hope for the best
+
+    # max() keeps the first of equally sized copies, like a stable sort would
+    best, best_size = max(sized, key=lambda pair: pair[1])
+    if extra_verbose:
+        print(f"Selected largest file: {os.path.basename(best['filename'])} ({best_size / 1024:.1f} KB)")
+    return best
\ No newline at end of file