# cbz-volume-combiner/cbz_volume_combiner/combine.py

import os
import re
import zipfile
import tempfile
import shutil
import argparse
from tqdm import tqdm
from collections import defaultdict
import unicodedata

def normalize_filename(filename):
    """Normalize a filename to handle encoding issues and special characters."""
    # Replace common problematic characters
    replacements = {
        '?': "'",  # Replace question marks that might be incorrectly encoded apostrophes
        '?': "'",  # Another possible encoding of apostrophe
        '?': '"',  # Possible encoding of double quote
        '?': '-',  # Possible encoding of dash
        '?': ' '   # Possible encoding of space
    }

    for bad_char, good_char in replacements.items():
        filename = filename.replace(bad_char, good_char)

    return filename

def parse_manga_filename(filename):
    """Extract volume, chapter and title information from a manga filename."""
    # Pattern to match: manga_name v## c### [optional title] [optional group]
    base_filename = os.path.basename(filename)

    # Try to normalize the filename to handle encoding issues
    normalized_filename = normalize_filename(base_filename)

    # Use a more flexible pattern to handle apostrophes and other special characters
    pattern = r'(.*?)\s+v(\d+)\s+c(\d+[.\d-]*)\s+(.*)\.cbz$'
    match = re.match(pattern, normalized_filename)

    if match:
        manga_name = match.group(1).strip()
        volume = int(match.group(2))
        chapter_str = match.group(3)
        rest = match.group(4)

        # Try to extract title and group if available
        group_match = re.search(r'\[(.*?)\]$', rest)
        if group_match:
            group = group_match.group(1)
            title = rest[:rest.rfind('[')-1].strip()
        else:
            group = ""
            title = rest.strip()

        # Handle chapter numbers like "005.5" or "005-006"
        try:
            chapter = float(chapter_str)
        except ValueError:
            try:
                chapter = float(chapter_str.split('-')[0])  # Take first number for ranges
            except ValueError:
                chapter = 0  # Fallback for unparseable chapter numbers

        return {
            'manga_name': manga_name,
            'volume': volume,
            'chapter': chapter,
            'chapter_str': chapter_str,
            'title': title,
            'group': group,
            'filename': filename
        }

    if os.path.exists(filename):
        # For debugging: print the filename that couldn't be parsed
        print(f"WARNING: Could not parse filename: {base_filename}")

    return None

def find_cbz_files(folder_path, recursive=False, extra_verbose=False):
    """Find all CBZ files in the given folder."""
    cbz_files = []

    if recursive:
        if extra_verbose:
            print(f"Recursively searching for CBZ files in {folder_path}")

        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.lower().endswith('.cbz'):
                    cbz_files.append(os.path.join(root, file))
    else:
        if extra_verbose:
            print(f"Searching for CBZ files in {folder_path} (non-recursive)")

        for file in os.listdir(folder_path):
            if file.lower().endswith('.cbz'):
                cbz_files.append(os.path.join(folder_path, file))

    if extra_verbose:
        print(f"Found {len(cbz_files)} CBZ files")

    return cbz_files

def fix_missing_files(chapter_infos, folder_path, extra_verbose=False):
    """Attempt to find missing files by searching for similar filenames."""
    fixed_chapters = []
    unresolved_chapters = []

    for chapter_info in chapter_infos:
        if os.path.exists(chapter_info['filename']):
            fixed_chapters.append(chapter_info)
            continue

        # Get the problematic filename
        base_filename = os.path.basename(chapter_info['filename'])
        if extra_verbose:
            print(f"Trying to find replacement for: {base_filename}")

        # Look for similar files in the directory
        found_replacement = False
        directory = os.path.dirname(chapter_info['filename'])

        try:
            for file in os.listdir(directory):
                if not file.lower().endswith('.cbz'):
                    continue

                # Check if volume and chapter match
                v_match = re.search(fr"v{chapter_info['volume']:02d}", file)
                c_match = re.search(fr"c{chapter_info['chapter_str']}", file.replace('?', "'"))

                if v_match and c_match:
                    if extra_verbose:
                        print(f"  Found potential replacement: {file}")

                    # Create a new chapter info with the correct filename
                    new_chapter_info = dict(chapter_info)
                    new_chapter_info['filename'] = os.path.join(directory, file)

                    if os.path.exists(new_chapter_info['filename']):
                        if extra_verbose:
                            print(f"  Replacement file exists, using it instead")
                        fixed_chapters.append(new_chapter_info)
                        found_replacement = True
                        break
        except Exception as e:
            if extra_verbose:
                print(f"  Error while searching for replacement: {str(e)}")

        if not found_replacement:
            unresolved_chapters.append(chapter_info)

    return fixed_chapters, unresolved_chapters

def organize_by_volume(cbz_files, extra_verbose=False):
    """Group CBZ files by manga name and volume."""
    volumes = defaultdict(lambda: defaultdict(list))
    unparsed_files = []

    for cbz_file in cbz_files:
        info = parse_manga_filename(cbz_file)
        if info:
            manga_key = info['manga_name'].lower()
            volumes[manga_key][info['volume']].append(info)
        else:
            unparsed_files.append(cbz_file)

    # Sort chapters within each volume
    for manga in volumes:
        for volume in volumes[manga]:
            volumes[manga][volume].sort(key=lambda x: x['chapter'])

    if extra_verbose and unparsed_files:
        print(f"\nWARNING: Could not parse {len(unparsed_files)} files:")
        for file in unparsed_files[:10]:  # Show first 10 only to avoid spam
            print(f"  - {os.path.basename(file)}")
        if len(unparsed_files) > 10:
            print(f"  ... and {len(unparsed_files) - 10} more")

    return volumes

def create_volume_cbz(manga_name, volume_num, chapter_infos, output_dir=None, force=False, verbose=False, extra_verbose=False):
    """Combine multiple chapter CBZ files into a single volume CBZ."""
    # Determine output path
    volume_filename = f"{manga_name} - Volume {volume_num:02d}.cbz"

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, volume_filename)
    else:
        # Use the directory of the first chapter
        output_path = os.path.join(os.path.dirname(chapter_infos[0]['filename']), volume_filename)

    if verbose:
        print(f"Creating volume: {volume_filename}")
        print(f"Output path: {output_path}")

    # Check if volume already exists
    if os.path.exists(output_path) and not force:
        if verbose:
            print(f"Skipping {volume_filename} (already exists)")
        return True, "Skipped (already exists)"

    try:
        # Verify all chapter files exist before starting
        missing_files = []
        for chapter_info in chapter_infos:
            if not os.path.exists(chapter_info['filename']):
                missing_files.append(chapter_info['filename'])

        if missing_files:
            if extra_verbose:
                print(f"Found {len(missing_files)} missing files, attempting to fix:")
                for missing in missing_files:
                    print(f"  - {os.path.basename(missing)}")

            # Try to fix missing files by finding alternatives
            chapter_dir = os.path.dirname(chapter_infos[0]['filename'])
            fixed_chapters, unresolved_chapters = fix_missing_files(chapter_infos, chapter_dir, extra_verbose)

            if unresolved_chapters:
                if extra_verbose:
                    print(f"Could not resolve {len(unresolved_chapters)} missing files:")
                    for chapter in unresolved_chapters:
                        print(f"  - {os.path.basename(chapter['filename'])}")
                return False, f"Missing {len(unresolved_chapters)} chapter file(s) after resolution attempts"

            # Update chapter_infos with the fixed list
            chapter_infos = fixed_chapters

            if extra_verbose:
                print("All missing files resolved, proceeding with conversion")

        # Create a temporary directory for extraction
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract all chapters in order
            for chapter_info in tqdm(chapter_infos, desc=f"Extracting chapters for Volume {volume_num}", disable=not verbose):
                chapter_dir = os.path.join(temp_dir, f"chapter_{chapter_info['chapter_str'].zfill(3)}")
                os.makedirs(chapter_dir, exist_ok=True)

                if verbose:
                    print(f"Extracting chapter {chapter_info['chapter_str']}")

                if extra_verbose:
                    print(f"File: {chapter_info['filename']}")
                    if not os.path.exists(chapter_info['filename']):
                        print(f"  ERROR: File does not exist!")
                        continue

                try:
                    # Extract the chapter
                    with zipfile.ZipFile(chapter_info['filename'], 'r') as zf:
                        file_list = sorted(zf.namelist())

                        if extra_verbose:
                            print(f"  Contains {len(file_list)} files:")

                        for i, file_name in enumerate(file_list):
                            if file_name.endswith('/'):  # Skip directories
                                continue

                            if extra_verbose and i < 10:  # Show first 10 files only
                                print(f"    - {file_name}")

                            # Extract with a standardized naming pattern: chapterXXX_pageYYY.ext
                            base, ext = os.path.splitext(os.path.basename(file_name))
                            new_name = f"chapter{chapter_info['chapter_str'].zfill(3)}_{i+1:03d}{ext}"

                            # Extract file to temp directory
                            try:
                                with zf.open(file_name) as source, open(os.path.join(chapter_dir, new_name), 'wb') as target:
                                    shutil.copyfileobj(source, target)
                            except Exception as file_error:
                                if extra_verbose:
                                    print(f"    ERROR extracting {file_name}: {str(file_error)}")

                        if extra_verbose and len(file_list) > 10:
                            print(f"    ... and {len(file_list) - 10} more files")
                except Exception as e:
                    return False, f"Error extracting chapter {chapter_info['chapter_str']}: {str(e)}"

            # Create the volume CBZ
            if verbose:
                print(f"Creating volume CBZ: {volume_filename}")

            try:
                with zipfile.ZipFile(output_path, 'w') as volume_zip:
                    # Add all files from all chapters in order
                    chapter_dirs = sorted(os.listdir(temp_dir))

                    for chapter_dir in chapter_dirs:
                        chapter_path = os.path.join(temp_dir, chapter_dir)
                        if os.path.isdir(chapter_path):
                            chapter_files = sorted(os.listdir(chapter_path))

                            if extra_verbose:
                                print(f"Adding directory: {chapter_dir} ({len(chapter_files)} files)")

                            for file in chapter_files:
                                file_path = os.path.join(chapter_path, file)
                                arc_name = os.path.join(chapter_dir, file)

                                if extra_verbose and chapter_files.index(file) < 5:
                                    print(f"  - Adding {arc_name}")

                                volume_zip.write(file_path, arc_name)
            except Exception as e:
                return False, f"Error creating volume ZIP: {str(e)}"

        return True, "Created"
    except Exception as e:
        return False, str(e)

def main():
    parser = argparse.ArgumentParser(description='Combine individual CBZ chapters into volume CBZ files')
    parser.add_argument('folder', help='Folder containing CBZ chapter files')
    parser.add_argument('-r', '--recursive', action='store_true', help='Search for CBZ files recursively')
    parser.add_argument('-o', '--output', help='Output folder for volume CBZ files (defaults to same location as chapters)')
    parser.add_argument('-f', '--force', action='store_true', help='Force creation even if volume CBZ already exists')
    parser.add_argument('-v', '--verbose', action='store_true', help='Show detailed progress')
    parser.add_argument('-vv', '--extra-verbose', action='store_true', help='Show extremely detailed debugging information')
    parser.add_argument('-m', '--min-chapters', type=int, default=2,
                       help='Minimum number of chapters required to create a volume (default: 2)')

    args = parser.parse_args()

    # If extra-verbose is enabled, automatically enable verbose too
    if args.extra_verbose:
        args.verbose = True

    if not os.path.isdir(args.folder):
        print(f"Error: '{args.folder}' is not a valid directory")
        return 1

    cbz_files = find_cbz_files(args.folder, args.recursive, args.extra_verbose)

    if not cbz_files:
        print(f"No CBZ files found in '{args.folder}'")
        return 0

    print(f"Found {len(cbz_files)} CBZ file(s)")

    # Organize files by manga and volume
    volumes = organize_by_volume(cbz_files, args.extra_verbose)

    total_manga = len(volumes)
    total_volumes = sum(len(volumes[manga]) for manga in volumes)

    print(f"Found {total_manga} manga series with {total_volumes} volume(s) to process")

    # Print detailed manga and volume information in extra verbose mode
    if args.extra_verbose:
        print("\nDetailed manga and volume breakdown:")
        for manga_name in volumes:
            try:
                first_volume = min(volumes[manga_name].keys())
                manga_display_name = volumes[manga_name][first_volume][0]['manga_name']
                print(f"\n{manga_display_name}:")
                for volume_num in sorted(volumes[manga_name].keys()):
                    chapters = volumes[manga_name][volume_num]
                    print(f"  Volume {volume_num}: {len(chapters)} chapters")
                    if args.extra_verbose:
                        for chapter in chapters:
                            print(f"    - Chapter {chapter['chapter_str']}: {os.path.basename(chapter['filename'])}")
                            print(f"      Exists: {os.path.exists(chapter['filename'])}")
            except (ValueError, IndexError):
                pass

    success_count = 0
    skip_count = 0
    fail_count = 0
    ignored_count = 0

    # Process each manga and volume
    for manga_name in volumes:
        # Get a proper display name from the first volume's first chapter
        try:
            first_volume = min(volumes[manga_name].keys())
            manga_display_name = volumes[manga_name][first_volume][0]['manga_name']
        except (ValueError, IndexError, KeyError):
            # Fallback if we can't get proper name
            manga_display_name = manga_name

        if args.verbose:
            print(f"\nProcessing manga: {manga_display_name}")

        for volume_num in sorted(volumes[manga_name].keys()):
            chapters = volumes[manga_name][volume_num]

            # Skip volumes with too few chapters
            if len(chapters) < args.min_chapters:
                if args.verbose:
                    print(f"Skipping Volume {volume_num} - only has {len(chapters)} chapter(s) (minimum is {args.min_chapters})")
                ignored_count += 1
                continue

            # List all chapters for debugging
            if args.extra_verbose:
                print(f"\nChapters for {manga_display_name} Volume {volume_num}:")
                for chapter in chapters:
                    print(f"  - Chapter {chapter['chapter_str']}: {os.path.basename(chapter['filename'])}")
                    print(f"    Exists: {os.path.exists(chapter['filename'])}")

            success, message = create_volume_cbz(
                manga_display_name,
                volume_num,
                chapters,
                args.output,
                args.force,
                args.verbose,
                args.extra_verbose
            )

            if success:
                if message == "Skipped (already exists)":
                    skip_count += 1
                else:
                    success_count += 1
            else:
                fail_count += 1
                print(f"Error creating Volume {volume_num} for {manga_display_name}: {message}")

    print(f"\nVolume creation complete:")
    print(f"  - {success_count} volumes created successfully")
    print(f"  - {skip_count} volumes skipped (already exist)")
    print(f"  - {ignored_count} volumes ignored (too few chapters)")
    print(f"  - {fail_count} volumes failed")

    return 0

if __name__ == '__main__':
    import sys
    sys.exit(main())