Fix build: Bundle schwab_scraper source and use local dependencies

2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions
--- a/schwab_scraper/features/equity/parser.py
+++ b/schwab_scraper/features/equity/parser.py
@@ -0,0 +1,80 @@
+import re
+from io import BytesIO
+from typing import Dict
+import pdfplumber
+
+
+def clean_value(label: str, value: str) -> str:
+    """Normalize the raw text captured for ``label``.
+
+    Ratings become "<n> stars" (n = number of "Q" characters); moats collapse
+    to Wide, Narrow or None (checked in that order); prices keep their first
+    ``d.dd`` number; assessments keep their first word; em dashes in ranges
+    become hyphens. Other labels, or values lacking the pattern, pass through.
+    """
+    if label == "Morningstar Rating":
+        return f"{value.count('Q')} stars"
+    if label == "Economic Moat":
+        return next((m for m in ("Wide", "Narrow", "None") if m in value), value)
+    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
+        # search, not match: a currency prefix ("$12.34") must not defeat it.
+        found = re.search(r"[\d,]+\.\d{2}", value)
+        return found.group(0) if found else value
+    if label == "Assessment":
+        # Blank text has no first word; return it as-is instead of raising.
+        return (value.split() or [value])[0]
+    if label in ("52-Week-Range", "52-Week Range"):
+        return value.replace('\u2014', '-')
+    return value
+
+
+def parse(pdf_content: bytes) -> Dict[str, str]:
+    """Extract labelled data points from page 3 of a Morningstar PDF report.
+
+    Each known label (one to three consecutive words) is paired with the words
+    on the same text row that start less than 100 x-units to its right. When a
+    label occurs more than once, the most recently stored value is kept.
+
+    Args:
+        pdf_content: Raw bytes of the PDF document.
+
+    Returns:
+        Cleaned values keyed by label; labels without a value are omitted. An
+        empty dict is returned when the document has fewer than three pages.
+    """
+    # A set gives constant-time lookups for the many candidate phrases tested.
+    labels = {
+        "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
+        "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
+        "Economic Moat", "Morningstar Rating",
+    }
+    moat_levels = ("Wide", "Narrow", "None")
+    data: Dict[str, str] = {}
+
+    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
+        if len(pdf.pages) < 3:
+            # No page 3 to read; previously this surfaced as an IndexError.
+            return data
+        page = pdf.pages[2]  # Page 3
+        words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
+
+    for i, word in enumerate(words):
+        # Try the phrases of up to three words that start at this word.
+        for j in range(i + 1, min(i + 4, len(words))):
+            label = " ".join(w['text'] for w in words[i:j])
+            if label not in labels:
+                continue
+            # The value is whatever sits on the label's row just right of it.
+            label_end_x = words[j - 1]['x1']
+            value_words = [
+                w['text'] for w in words[j:]
+                if abs(w['top'] - word['top']) < 2 and 0 < w['x0'] - label_end_x < 100
+            ]
+            if not value_words:
+                continue
+            cleaned = clean_value(label, " ".join(value_words))
+            # An unrecognised moat description is dropped, not stored raw.
+            if label != "Economic Moat" or cleaned in moat_levels:
+                data[label] = cleaned
+            break  # Label handled; move on to the next starting word.
+    return data