# schwab-mcp-custom/schwab_scraper/features/equity/parser.py

import re
from io import BytesIO
from typing import Dict
import pdfplumber


def clean_value(label: str, value: str) -> str:
    """Normalize the raw text extracted next to a report label.

    Args:
        label: Label the value was found beside (e.g. ``"Fair Value"``).
        value: Raw text of the words found to the right of the label.

    Returns:
        The cleaned value. Labels without a dedicated rule, and values that
        do not fit their label's rule, are returned unchanged.
    """
    if label == "Morningstar Rating":
        # The number of "Q" characters is reported as the star count
        # (presumably each star glyph extracts as "Q" -- confirm against a
        # sample report).
        return f"{value.count('Q')} stars"

    if label == "Economic Moat":
        # First recognised rating wins; unrecognised text is passed through.
        for rating in ("Wide", "Narrow", "None"):
            if rating in value:
                return rating
        return value

    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
        # search() rather than match(): the price may be preceded by a
        # currency symbol or other text, which match() would reject.
        price = re.search(r"[\d,]+\.\d{2}", value)
        return price.group(0) if price else value

    if label == "Assessment":
        # Keep only the first token; guard against an empty/blank value,
        # where split() yields no tokens.
        tokens = value.split()
        return tokens[0] if tokens else value

    # Both spellings of the label are accepted.
    if label in ("52-Week Range", "52-Week-Range"):
        # Replace em dashes with plain ASCII hyphens.
        return value.replace('\u2014', '-')

    return value


# Zero-based index of the page the data points are read from (page 3).
_DATA_PAGE_INDEX = 2

# Labels searched for on the data page; each one found becomes a result key.
_LABELS = frozenset({
    "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
    "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
    "Economic Moat", "Morningstar Rating",
})

# Ratings recognised in the text next to "Economic Moat", in match order.
_MOAT_RATINGS = ("Wide", "Narrow", "None")

# A word counts as being on the label's row when its 'top' differs from the
# label's by less than this amount.
_ROW_TOLERANCE = 2

# A value word must start to the right of the label's end, and by less than
# this amount.
_MAX_VALUE_GAP = 100


def _words_right_of(words: list, label_start: int, label_stop: int) -> list:
    """Return the texts of the words on the label's row just right of it.

    The label occupies ``words[label_start:label_stop]``; only words after
    it in the list are considered.
    """
    row_top = words[label_start]['top']
    label_end_x = words[label_stop - 1]['x1']
    return [
        w['text'] for w in words[label_stop:]
        if abs(w['top'] - row_top) < _ROW_TOLERANCE
        and w['x0'] > label_end_x
        and w['x0'] - label_end_x < _MAX_VALUE_GAP
    ]


def parse(pdf_content: bytes) -> Dict[str, str]:
    """Parse a Morningstar PDF report and extract its key data points.

    Words are extracted from the third page; every run of one to three
    consecutive words that spells a known label is paired with the words
    found on the same row immediately to its right.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        A dict keyed by the label names found in the report. Labels that are
        absent, or have no text to their right, are omitted. "Economic Moat"
        is stored only when its text contains "Wide", "Narrow" or "None";
        every other value is passed through ``clean_value``. If a label
        occurs more than once, the last occurrence wins.

    Raises:
        IndexError: If the PDF has fewer than three pages.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        page_count = len(pdf.pages)
        if page_count <= _DATA_PAGE_INDEX:
            # Same exception type as a bare out-of-range index, but with a
            # message that says what was expected.
            raise IndexError(
                f"expected at least {_DATA_PAGE_INDEX + 1} pages in the "
                f"report, found {page_count}"
            )
        page = pdf.pages[_DATA_PAGE_INDEX]
        words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)

    data: Dict[str, str] = {}
    total = len(words)
    for start in range(total):
        # Try candidate labels of one, two and three words starting here.
        for stop in range(start + 1, min(start + 4, total)):
            label = " ".join(w['text'] for w in words[start:stop])
            if label not in _LABELS:
                continue

            value_words = _words_right_of(words, start, stop)
            if not value_words:
                # No value beside this candidate; a longer candidate
                # starting at the same word may still match.
                continue

            value = " ".join(value_words)
            if label == "Economic Moat":
                # Unrecognised moat text is dropped rather than stored raw.
                rating = next((r for r in _MOAT_RATINGS if r in value), None)
                if rating is not None:
                    data[label] = rating
            else:
                data[label] = clean_value(label, value)
            break  # Label handled; move on to the next starting word.
    return data