import re
from io import BytesIO
from typing import Any, Dict, List, Optional

try:
    import pdfplumber
except ImportError:  # pragma: no cover - only hit when the dependency is absent
    # Only parse() needs pdfplumber. Keeping the module importable without it
    # lets the pure string helpers be used on their own; parse() reports the
    # missing dependency when it is actually called.
    pdfplumber = None  # type: ignore[assignment]


# Labels whose value should be reduced to a bare price such as "1,234.56".
_PRICE_LABELS = frozenset({"Fair Value", "1-Star Price", "5-Star Price"})

# Both spellings are accepted; parse() itself only emits "52-Week Range".
_RANGE_LABELS = frozenset({"52-Week Range", "52-Week-Range"})

# Checked in this order; the first rating contained in the value wins.
_MOAT_RATINGS = ("Wide", "Narrow", "None")

# Digits with optional thousands separators followed by two decimals.
_PRICE_RE = re.compile(r"[\d,]+\.\d{2}")

# Every label parse() looks for on the report page.
_LABELS = frozenset({
    "Fair Value",
    "1-Star Price",
    "5-Star Price",
    "Assessment",
    "Dividend Yield",
    "Capital Allocation",
    "52-Week Range",
    "Investment Style",
    "Economic Moat",
    "Morningstar Rating",
})

_REPORT_PAGE_INDEX = 2   # zero-based, i.e. the third page of the PDF
_MAX_LABEL_WORDS = 3     # longest run of consecutive words tried as a label
_ROW_TOLERANCE = 2       # max difference in 'top' for words on the same row
_MAX_VALUE_GAP = 100     # max distance from the label's right edge to a value


def _moat_rating(value: str) -> Optional[str]:
    """Return the first of Wide/Narrow/None contained in *value*, else None."""
    return next((rating for rating in _MOAT_RATINGS if rating in value), None)


def clean_value(label: str, value: str) -> str:
    """Normalise the raw text extracted for *label*.

    Args:
        label: Name of the field the text was found next to.
        value: Raw text extracted to the right of the label.

    Returns:
        The cleaned text. If no rule applies to *label*, or the rule finds
        nothing to extract, *value* is returned unchanged.
    """
    if label == "Morningstar Rating":
        # The rating is the number of 'Q' characters in the extracted text
        # (presumably how the star glyphs come out of the PDF font).
        return f"{value.count('Q')} stars"

    if label == "Economic Moat":
        return _moat_rating(value) or value

    if label in _PRICE_LABELS:
        # search() rather than match(): the number may be preceded by a
        # currency symbol or other text.
        found = _PRICE_RE.search(value)
        return found.group(0) if found else value

    if label == "Assessment":
        # Keep only the first token; an empty value has no tokens to take.
        tokens = value.split()
        return tokens[0] if tokens else value

    if label in _RANGE_LABELS:
        # Replace the em dash between the two bounds with a plain hyphen.
        return value.replace("\u2014", "-")

    return value


def _words_right_of(
    words: List[Dict[str, Any]], label_start: int, label_end: int
) -> List[str]:
    """Return the texts of words on the label's row, just right of the label.

    The label occupies ``words[label_start:label_end]``. Only words after the
    label in extraction order are considered. A word qualifies when its 'top'
    is within ``_ROW_TOLERANCE`` of the label's first word and its left edge
    lies strictly right of the label's right edge by less than
    ``_MAX_VALUE_GAP``.
    """
    row_top = words[label_start]["top"]
    label_right = words[label_end - 1]["x1"]
    return [
        w["text"]
        for w in words[label_end:]
        if abs(w["top"] - row_top) < _ROW_TOLERANCE
        and 0 < w["x0"] - label_right < _MAX_VALUE_GAP
    ]


def _extract_fields(words: List[Dict[str, Any]]) -> Dict[str, str]:
    """Find known labels in *words* and map each to its cleaned value.

    *words* is a list of word dicts with 'text', 'x0', 'x1' and 'top' keys.
    A label that occurs more than once keeps the value found last. A label
    with no value beside it is omitted.
    """
    data: Dict[str, str] = {}
    for start in range(len(words)):
        # A label ending on the very last word has nothing after it, so the
        # end index never reaches len(words).
        stop = min(start + _MAX_LABEL_WORDS + 1, len(words))
        for end in range(start + 1, stop):
            label = " ".join(w["text"] for w in words[start:end])
            if label not in _LABELS:
                continue

            value_words = _words_right_of(words, start, end)
            if value_words:
                value = " ".join(value_words)
                if label == "Economic Moat":
                    # Unlike clean_value(), an unrecognised moat text is
                    # dropped rather than stored verbatim.
                    rating = _moat_rating(value)
                    if rating is not None:
                        data[label] = rating
                else:
                    data[label] = clean_value(label, value)
            break  # label found at this position; move on to the next word
    return data


def parse(pdf_content: bytes) -> Dict[str, str]:
    """Parse a Morningstar PDF report and extract key data points.

    Only the third page of the document is inspected.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        A dict keyed by the label names found on the page, each mapped to its
        cleaned value. Labels with no value beside them are omitted. The dict
        is empty when the PDF has fewer than three pages.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError("pdfplumber is required to parse PDF reports")

    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        if len(pdf.pages) <= _REPORT_PAGE_INDEX:
            # Not enough pages to contain the data page; nothing to extract.
            return {}
        page = pdf.pages[_REPORT_PAGE_INDEX]
        words = page.extract_words(
            x_tolerance=1, y_tolerance=1, keep_blank_chars=False
        )

    return _extract_fields(words)