Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s
This commit is contained in:
80
schwab_scraper/features/equity/parser.py
Normal file
80
schwab_scraper/features/equity/parser.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import Dict
|
||||
import pdfplumber
|
||||
|
||||
|
||||
def clean_value(label: str, value: str) -> str:
    """Normalize the raw text extracted for a report label.

    Args:
        label: The label the value was found next to (e.g. "Fair Value").
        value: The raw, space-joined text found to the right of the label.

    Returns:
        The cleaned value. Labels without a dedicated rule, and values that
        do not match their label's rule, are returned unchanged.
    """
    if label == "Morningstar Rating":
        # The rating is reported as the number of "Q" characters in the text
        # (presumably how the star glyphs come out of text extraction --
        # confirm against a sample report).
        return f"{value.count('Q')} stars"

    if label == "Economic Moat":
        # First keyword found wins, checked in this fixed order.
        for moat in ("Wide", "Narrow", "None"):
            if moat in value:
                return moat

    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
        # Keep only the leading price (digits/commas with two decimals) and
        # drop whatever trails it.
        match = re.match(r"[\d,]+\.\d{2}", value)
        if match:
            return match.group(0)

    if label == "Assessment":
        # Only the first token is kept. Guard against an empty or
        # whitespace-only value, which would otherwise raise IndexError.
        tokens = value.split()
        if tokens:
            return tokens[0]

    if label in ("52-Week-Range", "52-Week Range"):
        # Replace the em dash range separator with a plain hyphen.
        return value.replace('\u2014', '-')

    return value
|
||||
|
||||
|
||||
def parse(pdf_content: bytes) -> Dict[str, str]:
    """Parse a Morningstar PDF report and extract key data points.

    Only the third page of the document is inspected. Words on that page are
    scanned for known labels (one to three consecutive words), and the text
    to the right of each label on the same line is taken as its value.

    Args:
        pdf_content: The raw bytes of the PDF report.

    Returns:
        A dict keyed by the label names present in the report. Labels that
        are not found, or that have no value next to them, are omitted.

    Raises:
        IndexError: If the PDF has fewer than three pages.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        page_count = len(pdf.pages)
        if page_count < 3:
            # Same exception type as the bare index lookup would raise, but
            # with a message that explains what went wrong.
            raise IndexError(
                f"expected a report with at least 3 pages, got {page_count}"
            )
        page = pdf.pages[2]  # Page 3
        words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)

    labels = {
        "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
        "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
        "Economic Moat", "Morningstar Rating",
    }
    data: Dict[str, str] = {}

    for i, word in enumerate(words):
        # Combine up to three consecutive words to form potential labels.
        for j in range(i + 1, min(i + 4, len(words))):
            potential_label = " ".join(w['text'] for w in words[i:j])
            if potential_label not in labels:
                continue

            # The value is every later word on the same line (top within 2
            # units of the label's first word) that starts to the right of
            # the label and less than 100 units past its right edge.
            label_end_x = words[j - 1]['x1']
            value_words = [
                w['text'] for w in words[j:]
                if abs(w['top'] - word['top']) < 2
                and w['x0'] > label_end_x
                and w['x0'] - label_end_x < 100
            ]
            if value_words:
                value = " ".join(value_words)
                if potential_label == "Economic Moat":
                    # Only a recognised moat rating is recorded; any other
                    # text leaves the label unset.
                    moat = next(
                        (m for m in ("Wide", "Narrow", "None") if m in value),
                        None,
                    )
                    if moat is not None:
                        data[potential_label] = moat
                else:
                    data[potential_label] = clean_value(potential_label, value)
            break  # Move to the next word once a label is found

    return data
|
||||
Reference in New Issue
Block a user