All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
81 lines
3.4 KiB
Python
81 lines
3.4 KiB
Python
import re
|
|
from io import BytesIO
|
|
from typing import Dict
|
|
import pdfplumber
|
|
|
|
|
|
def clean_value(label: str, value: str) -> str:
    """Normalize the raw text extracted for *label*.

    Args:
        label: Report label the text was found next to, e.g. "Fair Value".
        value: Raw text collected for that label.

    Returns:
        The cleaned value. Text for a label without a dedicated rule, or
        text that does not fit its label's rule, is returned unchanged.
    """
    if label == "Morningstar Rating":
        # Every "Q" in the text counts as one star (presumably how the star
        # glyph comes out of text extraction -- confirm against a report).
        return f"{value.count('Q')} stars"

    if label == "Economic Moat":
        # First keyword found wins, checked in this fixed order.
        for moat in ("Wide", "Narrow", "None"):
            if moat in value:
                return moat

    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
        # Keep only a leading number with two decimals, e.g. "1,234.50".
        match = re.match(r"[\d,]+\.\d{2}", value)
        if match:
            return match.group(0)

    if label == "Assessment":
        # Only the first whitespace-separated token is kept. An empty or
        # blank value has no token, so it falls through and is returned
        # as-is instead of raising IndexError.
        tokens = value.split()
        if tokens:
            return tokens[0]

    if label in ("52-Week-Range", "52-Week Range"):
        # Replace the em dash separating the bounds with a plain hyphen.
        return value.replace("\u2014", "-")

    return value
|
|
|
|
|
|
# Zero-based index of the only page that is scanned (the third page).
_REPORT_PAGE_INDEX = 2
# Longest label phrase, in words, that is tried at each position.
_MAX_LABEL_WORDS = 3
# A value word's "top" must differ from the label's by less than this.
_SAME_LINE_TOLERANCE = 2
# A value word must start less than this far right of the label's end.
_MAX_VALUE_GAP = 100

_LABELS = frozenset({
    "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
    "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
    "Economic Moat", "Morningstar Rating",
})
_MOAT_KINDS = frozenset({"Wide", "Narrow", "None"})


def _words_right_of(words: list, start: int, line_top, label_end_x) -> list:
    """Return the texts of words[start:] lying on the label's line, right of it."""
    return [
        w['text'] for w in words[start:]
        if abs(w['top'] - line_top) < _SAME_LINE_TOLERANCE
        and w['x0'] > label_end_x
        and w['x0'] - label_end_x < _MAX_VALUE_GAP
    ]


def parse(pdf_content: bytes) -> Dict[str, str]:
    """
    Parses a Morningstar PDF report to extract key data points.

    Only the third page is read. Each known label found there is paired with
    the words on the same line immediately to its right, and that text is
    normalized with ``clean_value``.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        A dict keyed by the label names present in the report. A label with
        no text to its right is omitted, as is "Economic Moat" when its text
        names none of Wide/Narrow/None. If a label occurs more than once,
        the last occurrence with a value wins.

    Raises:
        IndexError: If the PDF has fewer than three pages.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        pages = pdf.pages
        if len(pages) <= _REPORT_PAGE_INDEX:
            raise IndexError(
                f"report has {len(pages)} page(s); "
                f"page {_REPORT_PAGE_INDEX + 1} is required"
            )
        words = pages[_REPORT_PAGE_INDEX].extract_words(
            x_tolerance=1, y_tolerance=1, keep_blank_chars=False
        )

    data: Dict[str, str] = {}
    for i, first_word in enumerate(words):
        # Try the 1- to 3-word phrases starting here. The range stops before
        # the final word: a label needs at least one later word as its value.
        for j in range(i + 1, min(i + 1 + _MAX_LABEL_WORDS, len(words))):
            label = " ".join(w['text'] for w in words[i:j])
            if label not in _LABELS:
                continue

            value_words = _words_right_of(
                words, j, first_word['top'], words[j - 1]['x1']
            )
            if value_words:
                cleaned = clean_value(label, " ".join(value_words))
                # clean_value hands back the raw text when no moat keyword is
                # present; such a moat value is dropped rather than stored.
                if label != "Economic Moat" or cleaned in _MOAT_KINDS:
                    data[label] = cleaned
            # No label is a prefix of another, so longer phrases starting at
            # this word cannot match: move on to the next word.
            break

    return data
|