Add support for de-duplicating chapters and selecting the best version
This commit is contained in:
@@ -1,4 +1,3 @@
|
|||||||
# File: /home/code/projects/manga-organizer-1/cbz-volume-combiner/cbz_volume_combiner/parsing.py
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -26,15 +25,15 @@ def parse_manga_filename(filename):
|
|||||||
# Try to normalize the filename to handle encoding issues
|
# Try to normalize the filename to handle encoding issues
|
||||||
normalized_filename = normalize_filename(base_filename)
|
normalized_filename = normalize_filename(base_filename)
|
||||||
|
|
||||||
# Use a more flexible pattern to handle apostrophes and other special characters
|
# Updated pattern to handle decimal chapter numbers like c019.1
|
||||||
pattern = r'(.*?)\s+v(\d+)\s+c(\d+[.\d-]*)\s+(.*)\.cbz$'
|
pattern = r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\s*(.*)\.cbz$'
|
||||||
match = re.match(pattern, normalized_filename)
|
match = re.match(pattern, normalized_filename)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
manga_name = match.group(1).strip()
|
manga_name = match.group(1).strip()
|
||||||
volume = int(match.group(2))
|
volume = int(match.group(2))
|
||||||
chapter_str = match.group(3)
|
chapter_str = match.group(3)
|
||||||
rest = match.group(4)
|
rest = match.group(4).strip()
|
||||||
|
|
||||||
# Try to extract title and group if available
|
# Try to extract title and group if available
|
||||||
group_match = re.search(r'\[(.*?)\]$', rest)
|
group_match = re.search(r'\[(.*?)\]$', rest)
|
||||||
@@ -64,6 +63,125 @@ def parse_manga_filename(filename):
|
|||||||
'filename': filename
|
'filename': filename
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Try an alternative pattern for filenames like "Manga Name v04 c021.1.cbz" (no title/group)
|
||||||
|
alt_pattern = r'(.*?)\s+v(\d+)\s+c(\d+(?:\.\d+)?)\.cbz$'
|
||||||
|
alt_match = re.match(alt_pattern, normalized_filename)
|
||||||
|
|
||||||
|
if alt_match:
|
||||||
|
manga_name = alt_match.group(1).strip()
|
||||||
|
volume = int(alt_match.group(2))
|
||||||
|
chapter_str = alt_match.group(3)
|
||||||
|
|
||||||
|
try:
|
||||||
|
chapter = float(chapter_str)
|
||||||
|
except ValueError:
|
||||||
|
chapter = 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
'manga_name': manga_name,
|
||||||
|
'volume': volume,
|
||||||
|
'chapter': chapter,
|
||||||
|
'chapter_str': chapter_str,
|
||||||
|
'title': '',
|
||||||
|
'group': '',
|
||||||
|
'filename': filename
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add a new pattern for "Vol. XX Ch. YYY - Title.cbz" format
|
||||||
|
vol_ch_pattern = r'Vol\.\s*(\d+)\s+Ch\.\s*(\d+(?:\.\d+)?)\s*(?:-\s*(.*))?\.cbz$'
|
||||||
|
vol_ch_match = re.match(vol_ch_pattern, normalized_filename, re.IGNORECASE)
|
||||||
|
|
||||||
|
if vol_ch_match:
|
||||||
|
# For this format, we need to extract the manga name from the directory
|
||||||
|
manga_name = os.path.basename(os.path.dirname(filename))
|
||||||
|
volume = int(vol_ch_match.group(1))
|
||||||
|
chapter_str = vol_ch_match.group(2)
|
||||||
|
title = vol_ch_match.group(3) if vol_ch_match.group(3) else ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
chapter = float(chapter_str)
|
||||||
|
except ValueError:
|
||||||
|
chapter = 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
'manga_name': manga_name,
|
||||||
|
'volume': volume,
|
||||||
|
'chapter': chapter,
|
||||||
|
'chapter_str': chapter_str,
|
||||||
|
'title': title,
|
||||||
|
'group': '',
|
||||||
|
'filename': filename
|
||||||
|
}
|
||||||
|
|
||||||
|
# New pattern for underscore_format_vXXcYY_(ScanGroup)[tag].cbz
|
||||||
|
underscore_pattern = r'(.+?)_v(\d+)c(\d+[a-z]?(?:\.\d+)?)(?:_\((.+?)\))?(?:\[(.+?)\])?\.cbz$'
|
||||||
|
underscore_match = re.match(underscore_pattern, normalized_filename)
|
||||||
|
|
||||||
|
if underscore_match:
|
||||||
|
# Extract parts from the filename
|
||||||
|
manga_name = underscore_match.group(1).replace('_', ' ').strip()
|
||||||
|
volume = int(underscore_match.group(2))
|
||||||
|
chapter_str = underscore_match.group(3)
|
||||||
|
publisher = underscore_match.group(4) if underscore_match.group(4) else ""
|
||||||
|
tag = underscore_match.group(5) if underscore_match.group(5) else ""
|
||||||
|
|
||||||
|
# Handle special case for titles with volume TPB format
|
||||||
|
if '_-_' in normalized_filename and '(Dark_Horse_TPB)' in normalized_filename:
|
||||||
|
# This is likely a tankōbon/volume title format like "Shadow_Star_v04_-_Nothing_But_the_Truth_(Dark_Horse_TPB)[m-s].cbz"
|
||||||
|
title_match = re.search(r'_v\d+_-_(.+?)_\(', normalized_filename)
|
||||||
|
title = title_match.group(1).replace('_', ' ') if title_match else ""
|
||||||
|
else:
|
||||||
|
title = ""
|
||||||
|
|
||||||
|
# Create group/publisher string
|
||||||
|
group = f"{publisher} [{tag}]" if publisher and tag else publisher or tag
|
||||||
|
|
||||||
|
# Handle chapter numbers with letter suffixes like "50b"
|
||||||
|
try:
|
||||||
|
# Extract just the numeric part for sorting
|
||||||
|
numeric_part = re.match(r'(\d+(?:\.\d+)?)', chapter_str)
|
||||||
|
if (numeric_part):
|
||||||
|
chapter = float(numeric_part.group(1))
|
||||||
|
else:
|
||||||
|
chapter = 0
|
||||||
|
except ValueError:
|
||||||
|
chapter = 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
'manga_name': manga_name,
|
||||||
|
'volume': volume,
|
||||||
|
'chapter': chapter,
|
||||||
|
'chapter_str': chapter_str,
|
||||||
|
'title': title,
|
||||||
|
'group': group,
|
||||||
|
'filename': filename
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add a new pattern for "Manga Name Vol XX, YYYcTitle.cbz" format
|
||||||
|
comma_vol_pattern = r'^(.*?)\s+Vol\s+(\d+),\s+(\d+(?:\.\d+)?)[cC](.*?)\.cbz$'
|
||||||
|
comma_vol_match = re.match(comma_vol_pattern, normalized_filename, re.IGNORECASE)
|
||||||
|
|
||||||
|
if comma_vol_match:
|
||||||
|
manga_name = comma_vol_match.group(1).strip()
|
||||||
|
volume = int(comma_vol_match.group(2))
|
||||||
|
chapter_str = comma_vol_match.group(3)
|
||||||
|
title = comma_vol_match.group(4).strip()
|
||||||
|
|
||||||
|
try:
|
||||||
|
chapter = float(chapter_str)
|
||||||
|
except ValueError:
|
||||||
|
chapter = 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
'manga_name': manga_name,
|
||||||
|
'volume': volume,
|
||||||
|
'chapter': chapter,
|
||||||
|
'chapter_str': chapter_str,
|
||||||
|
'title': title,
|
||||||
|
'group': '',
|
||||||
|
'filename': filename
|
||||||
|
}
|
||||||
|
|
||||||
if os.path.exists(filename):
|
if os.path.exists(filename):
|
||||||
# For debugging: print the filename that couldn't be parsed
|
# For debugging: print the filename that couldn't be parsed
|
||||||
print(f"WARNING: Could not parse filename: {base_filename}")
|
print(f"WARNING: Could not parse filename: {base_filename}")
|
||||||
|
|||||||
@@ -29,6 +29,55 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
|
|||||||
return True, "Skipped (already exists)"
|
return True, "Skipped (already exists)"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# De-duplicate chapters (keep only one copy of each chapter number)
|
||||||
|
if len(chapter_infos) > 1:
|
||||||
|
# Group chapters by their chapter number
|
||||||
|
chapter_groups = {}
|
||||||
|
for chapter in chapter_infos:
|
||||||
|
chapter_num = str(chapter['chapter']) # Convert to string for exact matching
|
||||||
|
if chapter_num not in chapter_groups:
|
||||||
|
chapter_groups[chapter_num] = []
|
||||||
|
chapter_groups[chapter_num].append(chapter)
|
||||||
|
|
||||||
|
# Check for duplicates
|
||||||
|
duplicates_found = False
|
||||||
|
for chapter_num, group in chapter_groups.items():
|
||||||
|
if len(group) > 1:
|
||||||
|
duplicates_found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if duplicates_found:
|
||||||
|
# Create a new de-duplicated chapter list
|
||||||
|
unique_chapters = []
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Found duplicate chapters, selecting one copy of each:")
|
||||||
|
|
||||||
|
for chapter_num, group in chapter_groups.items():
|
||||||
|
if len(group) > 1:
|
||||||
|
# We have duplicates for this chapter
|
||||||
|
if verbose:
|
||||||
|
print(f" Chapter {chapter_num} has {len(group)} copies:")
|
||||||
|
for i, ch in enumerate(group):
|
||||||
|
print(f" {i+1}. {os.path.basename(ch['filename'])}")
|
||||||
|
|
||||||
|
# Select the best version - prefer a copy that exists on disk, then the largest file
|
||||||
|
selected_chapter = select_best_chapter(group, extra_verbose)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(f" Selected: {os.path.basename(selected_chapter['filename'])}")
|
||||||
|
|
||||||
|
unique_chapters.append(selected_chapter)
|
||||||
|
else:
|
||||||
|
# Only one copy, just add it
|
||||||
|
unique_chapters.append(group[0])
|
||||||
|
|
||||||
|
# Replace the original chapter list with the de-duplicated one
|
||||||
|
chapter_infos = unique_chapters
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(f"De-duplicated chapter list now has {len(chapter_infos)} chapters")
|
||||||
|
|
||||||
# Verify all chapter files exist before starting and attempt to fix missing files
|
# Verify all chapter files exist before starting and attempt to fix missing files
|
||||||
missing_files = []
|
missing_files = []
|
||||||
for chapter_info in chapter_infos:
|
for chapter_info in chapter_infos:
|
||||||
@@ -178,3 +227,30 @@ def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, fo
|
|||||||
return True, "Created"
|
return True, "Created"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return False, str(e)
|
return False, str(e)
|
||||||
|
|
||||||
|
def select_best_chapter(chapter_group, extra_verbose=False):
    """
    Select the best chapter from a group of duplicate chapters.

    Strategy:
    1. Among the copies that exist on disk and whose size can be read,
       pick the largest one (likely better quality). Ties go to the
       copy that appears first in ``chapter_group``.
    2. If no copy exists on disk (or none can be sized), return the
       first entry of ``chapter_group`` unchanged.

    Args:
        chapter_group: Non-empty list of chapter info dicts; each must
            have a 'filename' key holding the path of the chapter file.
        extra_verbose: When True, print diagnostics about the selection.

    Returns:
        The selected chapter info dict (the same object that was passed
        in, not a copy).

    Side effects:
        Every candidate whose size was read gets a 'filesize' key
        (size in bytes) stored on its dict.
    """
    sized_chapters = []
    for chapter in chapter_group:
        path = chapter['filename']
        if not os.path.exists(path):
            continue
        try:
            chapter['filesize'] = os.path.getsize(path)
        except OSError:
            # The file disappeared or became unreadable between the
            # existence check and the size lookup; treat it as missing
            # instead of aborting the whole selection.
            if extra_verbose:
                print(f"Warning: Could not read size of {os.path.basename(path)}, skipping")
            continue
        sized_chapters.append(chapter)

    if not sized_chapters:
        if extra_verbose:
            print("Warning: None of the duplicate chapters exist on disk")
        return chapter_group[0]  # Return the first one and hope for the best

    # max() returns the first maximal element, which is the same entry a
    # stable descending sort would place at index 0.
    best = max(sized_chapters, key=lambda ch: ch['filesize'])

    if extra_verbose:
        print(f"Selected largest file: {os.path.basename(best['filename'])} ({best['filesize'] / 1024:.1f} KB)")

    return best
|
||||||
Reference in New Issue
Block a user