Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s

This commit is contained in:
2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions

View File

@@ -0,0 +1,80 @@
import re
from io import BytesIO
from typing import Dict
import pdfplumber
def clean_value(label: str, value: str) -> str:
    """Normalize the raw text extracted next to a report label.

    Args:
        label: The label the value was found beside (e.g. ``"Fair Value"``).
        value: The raw text taken from the report for that label.

    Returns:
        The cleaned value. Labels without a specific rule, and values that
        do not match their label's expected pattern, are returned unchanged.
    """
    if label == "Morningstar Rating":
        # The rating is derived by counting 'Q' characters in the extracted
        # text (presumably how the star glyphs come out of the PDF font).
        return f"{value.count('Q')} stars"

    if label == "Economic Moat":
        # Checked in this order; the first keyword contained in the text wins.
        for moat in ("Wide", "Narrow", "None"):
            if moat in value:
                return moat
        return value

    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
        # Keep only a leading price such as "1,234.56"; drop trailing text.
        match = re.match(r"[\d,]+\.\d{2}", value)
        return match.group(0) if match else value

    if label == "Assessment":
        # Only the first token is kept. An empty or whitespace-only value
        # yields "" instead of raising IndexError.
        tokens = value.split()
        return tokens[0] if tokens else ""

    if label in ("52-Week Range", "52-Week-Range"):
        # Replace the em dash separating the range with a plain hyphen.
        return value.replace('\u2014', '-')

    return value
def _words_right_of(words: list, start: int, row_top, label_end_x) -> list:
    """Collect the texts of words sitting just to the right of a label.

    Only ``words[start:]`` is examined. A word qualifies when its ``top`` is
    within 2 units of ``row_top`` (same text row) and its ``x0`` is strictly
    right of ``label_end_x`` but less than 100 units away.
    """
    return [
        w['text'] for w in words[start:]
        if abs(w['top'] - row_top) < 2
        and w['x0'] > label_end_x
        and w['x0'] - label_end_x < 100
    ]


def parse(pdf_content: bytes) -> Dict[str, str]:
    """
    Parses a Morningstar PDF report to extract key data points.

    Only the third page of the document is inspected. Each known label found
    on that page is paired with the words printed to its right on the same
    row.

    Args:
        pdf_content: Raw bytes of the PDF report.

    Returns:
        A dict keyed by the label names present in the report. Labels with
        no value to their right are omitted. The dict is empty when the PDF
        has fewer than three pages.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        # Guard against short documents instead of raising IndexError.
        if len(pdf.pages) < 3:
            return {}
        page = pdf.pages[2]  # Page 3
        words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)

    labels = frozenset((
        "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
        "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
        "Economic Moat", "Morningstar Rating",
    ))
    data: Dict[str, str] = {}

    for i, word in enumerate(words):
        # Try label candidates of growing length starting at this word.
        for j in range(i + 1, min(i + 4, len(words))):
            potential_label = " ".join(w['text'] for w in words[i:j])
            if potential_label not in labels:
                continue

            value_words = _words_right_of(words, j, word['top'], words[j - 1]['x1'])
            if value_words:
                value = " ".join(value_words)
                if potential_label == "Economic Moat":
                    # Stored only when a known moat keyword is present;
                    # otherwise the label is left out of the result.
                    for moat in ("Wide", "Narrow", "None"):
                        if moat in value:
                            data[potential_label] = moat
                            break
                else:
                    data[potential_label] = clean_value(potential_label, value)
            break  # Move to the next word once a label is found
    return data