Fix build: Bundle schwab_scraper source and use local dependencies
All checks were successful
Build and Push Docker Image / build (push) Successful in 34s

This commit is contained in:
2026-04-24 01:50:20 +00:00
parent 02ac293692
commit 650ea2d087
43 changed files with 10900 additions and 41 deletions

View File

@@ -28,5 +28,3 @@ jobs:
context: . context: .
push: true push: true
tags: gitea.ext.ben.io/${{ gitea.repository }}:latest tags: gitea.ext.ben.io/${{ gitea.repository }}:latest
build-args: |
GITEA_TOKEN=${{ secrets.CR_PAT }}

View File

@@ -4,29 +4,13 @@ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
WORKDIR /app WORKDIR /app
# Install git for dependency installation # Copy dependency files and install
RUN apt-get update && apt-get install -y --no-install-recommends \ COPY pyproject.toml uv.lock ./
git \ RUN uv sync --frozen --no-dev --no-install-project
&& rm -rf /var/lib/apt/lists/*
# Use Gitea PAT for private dependencies if provided # Copy project files
ARG GITEA_TOKEN COPY . .
RUN if [ -n "$GITEA_TOKEN" ]; then \ RUN uv sync --frozen --no-dev
git config --global url."https://${GITEA_TOKEN}@gitea.ext.ben.io/".insteadOf "https://gitea.ext.ben.io/"; \
fi
# Install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
--mount=type=bind,source=uv.lock,target=uv.lock \
uv sync --frozen --no-install-project --no-dev
# Copy the rest of the application
COPY . /app
# Install the project
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-dev
FROM python:3.12-slim-bookworm FROM python:3.12-slim-bookworm

View File

@@ -9,7 +9,13 @@ dependencies = [
"fastmcp>=0.4.1", "fastmcp>=0.4.1",
"starlette>=0.41.0", "starlette>=0.41.0",
"uvicorn>=0.32.0", "uvicorn>=0.32.0",
"schwab-scraper @ git+https://gitea.ext.ben.io/b3nw/schwab-scraper.git", "aiohttp>=3.9.0",
"fastapi>=0.136.1",
"greenlet>=3.2.3",
"pdfplumber>=0.11.4",
"playwright==1.54.0",
"pyee>=13.0.0",
"typing-extensions>=4.14.0",
] ]
[build-system] [build-system]
@@ -20,4 +26,4 @@ build-backend = "hatchling.build"
allow-direct-references = true allow-direct-references = true
[tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel]
packages = [] packages = ["schwab_scraper"]

View File

@@ -0,0 +1,37 @@
"""Public package exports sync wrappers and unified API references."""
from .api import (
get_morningstar_data,
get_transaction_history,
get_transaction_history_enhanced,
list_accounts,
get_account_overview,
get_positions,
get_portfolio_snapshot,
refresh_session,
check_session_health,
get_session_status,
get_session_info,
ensure_valid_session,
export_cookies,
set_cookies,
list_available_accounts,
)
# Public names of this package; every entry is imported from `.api` above.
__all__ = [
    "get_morningstar_data",
    "get_transaction_history",
    "get_transaction_history_enhanced",
    "list_accounts",
    "get_account_overview",
    "get_positions",
    "get_portfolio_snapshot",
    "refresh_session",
    "check_session_health",
    "get_session_status",
    "get_session_info",
    "ensure_valid_session",
    "export_cookies",
    "set_cookies",
    "list_available_accounts",
]

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env python3
"""Main entry point for the schwab-morningstar-scraper package when run with python3 -m."""
from .cli import main
# Only invoke the CLI when this module is executed as the program entry point.
if __name__ == "__main__":
    main()

102
schwab_scraper/api.py Normal file
View File

@@ -0,0 +1,102 @@
import asyncio
from . import unified_api
from .browser.session import get_session_info as _session_info
def get_morningstar_data(ticker: str, debug: bool = False):
    """Blocking wrapper around `unified_api.get_morningstar_data`.

    The coroutine is driven to completion with `asyncio.run`, so this must not
    be called from code that is already running inside an event loop.

    Args:
        ticker: Passed through unchanged as the first positional argument.
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.get_morningstar_data` call returns.
    """
    pending = unified_api.get_morningstar_data(ticker, debug=debug)
    return asyncio.run(pending)
def get_transaction_history(account=None, start_date=None, end_date=None, time_period=None, debug=False):
    """Blocking wrapper around `unified_api.get_transaction_history`.

    All arguments are forwarded by keyword, unchanged. The coroutine is run
    with `asyncio.run`, so do not call this from inside a running event loop.

    Returns:
        Whatever the awaited `unified_api.get_transaction_history` call returns.
    """
    forwarded = {
        "account": account,
        "start_date": start_date,
        "end_date": end_date,
        "time_period": time_period,
        "debug": debug,
    }
    return asyncio.run(unified_api.get_transaction_history(**forwarded))
def get_transaction_history_enhanced(account=None, start_date=None, end_date=None, time_period=None, debug=False):
    """Blocking wrapper around `unified_api.get_transaction_history_enhanced`.

    All arguments are forwarded by keyword, unchanged. The coroutine is run
    with `asyncio.run`, so do not call this from inside a running event loop.

    Returns:
        Whatever the awaited async call returns.
    """
    forwarded = {
        "account": account,
        "start_date": start_date,
        "end_date": end_date,
        "time_period": time_period,
        "debug": debug,
    }
    return asyncio.run(unified_api.get_transaction_history_enhanced(**forwarded))
def list_accounts(debug: bool = False):
    """Blocking wrapper around `unified_api.list_accounts`.

    Args:
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.list_accounts` call returns.
    """
    pending = unified_api.list_accounts(debug=debug)
    return asyncio.run(pending)
def get_account_overview(account=None, debug: bool = False):
    """Blocking wrapper around `unified_api.get_account_overview`.

    Args:
        account: Passed through as the `account` keyword argument.
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.get_account_overview` call returns.
    """
    pending = unified_api.get_account_overview(account=account, debug=debug)
    return asyncio.run(pending)
def get_positions(account=None, include_non_equity: bool = False, debug: bool = False):
    """Blocking wrapper around `unified_api.get_positions`.

    All arguments are forwarded by keyword, unchanged.

    Returns:
        Whatever the awaited `unified_api.get_positions` call returns.
    """
    forwarded = {
        "account": account,
        "include_non_equity": include_non_equity,
        "debug": debug,
    }
    return asyncio.run(unified_api.get_positions(**forwarded))
def get_portfolio_snapshot(account=None, aggregate_by_symbol: bool = True, include_non_equity: bool = False, debug: bool = False):
    """Blocking wrapper around `unified_api.get_portfolio_snapshot`.

    All arguments are forwarded by keyword, unchanged.

    Returns:
        Whatever the awaited `unified_api.get_portfolio_snapshot` call returns.
    """
    forwarded = {
        "account": account,
        "aggregate_by_symbol": aggregate_by_symbol,
        "include_non_equity": include_non_equity,
        "debug": debug,
    }
    return asyncio.run(unified_api.get_portfolio_snapshot(**forwarded))
def refresh_session(debug: bool = False):
    """Blocking wrapper around `unified_api.refresh_session`.

    Args:
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.refresh_session` call returns.
    """
    pending = unified_api.refresh_session(debug=debug)
    return asyncio.run(pending)
def check_session_health(debug: bool = False):
    """Return the `success` entry of the session-status envelope.

    Runs `unified_api.get_session_status` to completion and returns only the
    envelope's `"success"` value.

    NOTE(review): this reflects whether the status query succeeded; whether that
    also means the session itself is usable depends on how
    `unified_api.get_session_status` builds its envelope — confirm.
    """
    status_envelope = asyncio.run(unified_api.get_session_status(debug=debug))
    succeeded = status_envelope["success"]
    return succeeded
def get_session_status(debug: bool = False):
    """Blocking wrapper around `unified_api.get_session_status`.

    Args:
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.get_session_status` call returns.
    """
    pending = unified_api.get_session_status(debug=debug)
    return asyncio.run(pending)
def get_session_info(debug: bool = False):
    """Return the result of `browser.session.get_session_info`.

    Args:
        debug: Accepted for signature parity with the other wrappers; it is
            not used by this function.
    """
    info = _session_info()
    return info
def ensure_valid_session(debug: bool = False):
    """Run `unified_api.refresh_session` and return its envelope's `success` value.

    Args:
        debug: Passed through as the `debug` keyword argument.
    """
    refresh_envelope = asyncio.run(unified_api.refresh_session(debug=debug))
    succeeded = refresh_envelope["success"]
    return succeeded
def export_cookies(cookies_path: str, debug: bool = False):
    """Blocking wrapper around `unified_api.export_cookies`.

    Args:
        cookies_path: Passed through unchanged as the first positional argument.
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.export_cookies` call returns.
    """
    pending = unified_api.export_cookies(cookies_path, debug=debug)
    return asyncio.run(pending)
def set_cookies(cookies_path: str, debug: bool = False):
    """Blocking wrapper around `unified_api.set_cookies`.

    Args:
        cookies_path: Passed through unchanged as the first positional argument.
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.set_cookies` call returns.
    """
    pending = unified_api.set_cookies(cookies_path, debug=debug)
    return asyncio.run(pending)
def list_available_accounts(debug: bool = False):
    """Blocking wrapper around `unified_api.list_available_accounts`.

    Args:
        debug: Passed through as the `debug` keyword argument.

    Returns:
        Whatever the awaited `unified_api.list_available_accounts` call returns.
    """
    pending = unified_api.list_available_accounts(debug=debug)
    return asyncio.run(pending)

View File

@@ -0,0 +1,20 @@
from .client import connect, new_context, new_page
from .navigation import goto_with_auth_check
from .session import (
export_cookies,
get_session_status,
refresh_session,
set_cookies_from_file,
)
# Public names of this subpackage; every entry is imported from
# `.client`, `.navigation` or `.session` above.
__all__ = [
    "connect",
    "new_context",
    "new_page",
    "goto_with_auth_check",
    "get_session_status",
    "refresh_session",
    "set_cookies_from_file",
    "export_cookies",
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
from typing import Any
from playwright.async_api import async_playwright
async def connect(playwright_url: str):
    """Start Playwright and connect to a remote Chromium endpoint.

    Args:
        playwright_url: Endpoint passed to `chromium.connect`.

    Returns:
        A `(playwright, browser)` tuple. The caller owns both objects and is
        responsible for closing the browser and stopping Playwright.

    Raises:
        Whatever `chromium.connect` raises. In that case the Playwright driver
        started here is stopped before the exception propagates, so a failed
        connection does not leave the driver running.
    """
    p = await async_playwright().start()
    try:
        browser = await p.chromium.connect(playwright_url)
    except BaseException:
        # Nobody else holds a reference to `p` yet, so stop it here.
        await p.stop()
        raise
    return p, browser
async def new_context(browser, cookies: list[dict] | None = None, user_agent: str | None = None):
context = await browser.new_context(
user_agent=user_agent or 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
)
if cookies:
valid_same_site_values = ['Strict', 'Lax', 'None']
for cookie in cookies:
if cookie.get('sameSite') not in valid_same_site_values:
if cookie.get('sameSite') == 'no_restriction':
cookie['sameSite'] = 'None'
else:
cookie['sameSite'] = 'Lax'
await context.add_cookies(cookies) # type: ignore
return context
async def new_page(context):
    """Open and return a new page in the given browser context."""
    page = await context.new_page()
    return page

View File

@@ -0,0 +1,38 @@
async def ensure_authenticated_page(page, context, debug: bool = False) -> bool:
    """Re-authenticate the context if the page landed on a login/timeout URL.

    The page counts as unauthenticated when its lower-cased URL contains
    'login' or 'sessiontimeout=y'. In that case credentials are read from the
    configuration, `login_to_schwab` is called, and on success the context's
    cookies are replaced with the fresh ones. The page itself is not reloaded.

    Args:
        page: Page whose `url` is inspected.
        context: Browser context whose cookies are replaced after a login.
        debug: When true, progress messages are printed.

    Returns:
        True if no re-authentication was needed or it succeeded; False if
        credentials are unavailable or the login returned no cookies.
    """
    current_url = page.url.lower()
    needs_login = 'login' in current_url or 'sessiontimeout=y' in current_url
    if not needs_login:
        return True

    if debug:
        print("DEBUG: Detected session timeout, attempting re-authentication...")

    # Imported lazily; only the re-authentication path needs these.
    from ..core.config import load_config, get_schwab_credentials  # adjusted after refactor
    from .auth import login_to_schwab

    username, password = get_schwab_credentials(load_config())
    if not (username and password):
        if debug:
            print("DEBUG: No credentials available for re-authentication")
        return False

    fresh_cookies = await login_to_schwab(username, password)
    if not fresh_cookies:
        if debug:
            print("DEBUG: Re-authentication failed")
        return False

    await context.clear_cookies()
    await context.add_cookies(fresh_cookies)
    if debug:
        print("DEBUG: Re-authentication successful")
    return True
async def goto_with_auth_check(page, context, url: str, debug: bool = False, timeout: int = 60000):
    """Navigate to `url`, re-authenticating and retrying once if needed.

    Args:
        page: Page to navigate.
        context: Browser context handed to `ensure_authenticated_page`.
        url: Destination URL.
        debug: Forwarded to `ensure_authenticated_page`.
        timeout: Timeout passed to each `page.goto` call.

    Returns:
        False if `ensure_authenticated_page` reports failure, True otherwise.
    """

    async def _navigate():
        await page.goto(url, timeout=timeout)
        await page.wait_for_load_state('domcontentloaded')

    await _navigate()

    authenticated = await ensure_authenticated_page(page, context, debug=debug)
    if not authenticated:
        return False

    # Still on a login/timeout URL: load the target again (the context may now
    # hold fresh cookies).
    landed_url = page.url.lower()
    if 'login' in landed_url or 'sessiontimeout=y' in landed_url:
        await _navigate()
    return True

View File

@@ -0,0 +1,470 @@
"""
Session management module for maintaining Schwab authenticated sessions.
This module provides functionality to refresh session state through browser navigation
without requiring 2FA approval for active sessions.
"""
import json
import logging
import time
from typing import List, Dict, Any, Optional
from datetime import datetime
from playwright.async_api import async_playwright
from ..core.config import load_config, get_playwright_url, get_cookies_path
from .client import new_context, new_page
from ..core import ErrorType, Envelope, fail, ok
async def refresh_session_state(cookies: Optional[List[Dict[str, Any]]] = None) -> bool:
    """
    Refresh session state through browser navigation.

    Loads the cookies into a remote browser context, navigates to a Schwab
    research page, checks that the visit was not redirected to a login-style
    URL and that the critical session cookies are still present, and then
    writes the context's cookies back to the cookies file.

    Args:
        cookies: Optional list of cookies to use. If None, they are loaded from
            the file returned by `get_cookies_path()`.

    Returns:
        bool: True if session refresh was successful, False otherwise. Errors
        are logged and reported as False rather than raised.
    """
    logger = logging.getLogger(__name__)

    try:
        logger.info("Starting session refresh through navigation")

        # Load cookies if not provided
        if cookies is None:
            cookies_path = get_cookies_path()
            try:
                with open(cookies_path, 'r') as f:
                    cookies = json.load(f)
                logger.info(f"Loaded {len(cookies) if cookies else 0} cookies from {cookies_path}")
            except (FileNotFoundError, json.JSONDecodeError) as e:
                logger.error(f"Could not load cookies: {e}")
                return False

        if not cookies:
            logger.error("No cookies available for session refresh")
            return False

        config = load_config()
        playwright_url = get_playwright_url(config)

        async with async_playwright() as p:
            try:
                browser = await p.chromium.connect(playwright_url)
            except Exception as e:
                logger.error(f"Failed to connect to browser: {e}")
                return False

            # Bound up front so the cleanup below is safe even when creating
            # the context itself fails.
            context = None
            try:
                # Create context with existing cookies
                context = await new_context(browser, cookies=cookies)
                page = await new_page(context)

                # Navigate to refresh session state
                logger.info("Navigating to Schwab research page to refresh session")
                await page.goto("https://client.schwab.com/app/research/#/stocks/AAPL", timeout=30000)
                await page.wait_for_timeout(2000)  # Let page settle and cookies update

                # Check if navigation was successful (no redirect to login)
                current_url = page.url
                is_redirected = any(pattern in current_url for pattern in [
                    '/login', '/signin', '/auth', '/Access/'
                ])
                if is_redirected:
                    logger.warning("Session refresh failed: redirected to login page")
                    logger.debug(f"Current URL: {current_url}")
                    return False

                # Get updated cookies after navigation
                new_cookies = await context.cookies()
                logger.info(f"Retrieved {len(new_cookies)} cookies after navigation")

                # Check if we still have critical session cookies
                critical_session_cookies = ['LVAL', 'NS2', 'sstate']
                missing_critical_cookies = []
                for cookie_name in critical_session_cookies:
                    old_cookie = next((c for c in cookies if c['name'] == cookie_name), None)
                    new_cookie = next((c for c in new_cookies if c['name'] == cookie_name), None)
                    if not new_cookie:
                        missing_critical_cookies.append(cookie_name)
                    elif old_cookie and new_cookie.get('expires') != -1:
                        # Session cookies should have expires = -1
                        missing_critical_cookies.append(f"{cookie_name} (invalid session cookie)")

                if missing_critical_cookies:
                    logger.warning(f"Session refresh failed: missing critical session cookies: {missing_critical_cookies}")
                    return False

                # Compare cookie states to detect changes (expiration only)
                old_dict = {c['name']: c for c in cookies}
                new_dict = {c['name']: c for c in new_cookies}
                changes = []
                for name, old_cookie in old_dict.items():
                    if name not in new_dict:
                        continue
                    old_expires = old_cookie.get('expires', -1)
                    new_expires = new_dict[name].get('expires', -1)
                    if old_expires != new_expires:
                        changes.append({
                            'type': 'expiration_changed',
                            'name': name,
                            'old_expires': old_expires,
                            'new_expires': new_expires,
                        })

                if changes:
                    logger.info(f"Detected {len(changes)} cookie changes (session refreshed)")
                    for change in changes[:3]:  # Show first 3
                        logger.debug(f"  {change['name']}: expiration updated")
                else:
                    logger.info("No cookie changes detected (session maintained)")

                # Save updated cookies
                cookies_path = get_cookies_path()
                with open(cookies_path, 'w') as f:
                    json.dump(new_cookies, f, indent=2)
                logger.info(f"Saved {len(new_cookies)} updated cookies")
                return True

            except Exception as e:
                logger.error(f"Error during session refresh: {e}")
                return False

            finally:
                # Single cleanup path for every exit above. Closing the context
                # is best-effort; the browser connection is always closed.
                if context is not None:
                    try:
                        await context.close()
                    except Exception as close_error:
                        logger.debug(f"Ignoring error while closing context: {close_error}")
                await browser.close()

    except Exception as e:
        logger.error(f"Session refresh failed: {e}")
        return False
async def maintain_session_health() -> bool:
    """
    Check if the current session is healthy by attempting a simple navigation.

    The stored cookies are first screened for at least one unexpired cookie
    from a fixed list of names. If that passes, the cookies are loaded into a
    remote browser context, a Schwab research page is opened, and the final
    URL (and, when the URL is inconclusive, the presence of login-form
    elements) decides the outcome.

    Returns:
        bool: True if session is healthy, False if refresh is needed. Errors
        are logged and reported as False rather than raised.
    """
    logger = logging.getLogger(__name__)

    try:
        logger.info("Checking session health")

        # Load current cookies
        cookies_path = get_cookies_path()
        try:
            with open(cookies_path, 'r') as f:
                cookies = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            logger.error("No valid cookies found")
            return False

        if not cookies:
            logger.error("No cookies available")
            return False

        # First, check if we have valid session cookies (basic check).
        # Session cookies (expires=-1) are valid until browser closes;
        # other cookies must not be expired.
        current_time = int(time.time())
        session_cookie_names = ['auth', 'ASP.NET_SessionId', 'SessionInfo', '__RequestVerificationToken']
        has_valid_session_cookies = False
        for cookie in cookies:
            if cookie.get('name', '') not in session_cookie_names:
                continue
            expires = cookie.get('expires', -1)
            if expires == -1 or (expires and expires > current_time):
                has_valid_session_cookies = True
                break

        if not has_valid_session_cookies:
            logger.warning("Session health check: FAILED - no valid session cookies found")
            return False

        config = load_config()
        playwright_url = get_playwright_url(config)

        async with async_playwright() as p:
            browser = await p.chromium.connect(playwright_url)

            # Bound up front so the cleanup below is safe even when creating
            # the context itself fails.
            context = None
            try:
                context = await new_context(browser, cookies=cookies)
                page = await new_page(context)

                # Navigate to a simple page to test session
                await page.goto("https://client.schwab.com/app/research/#/stocks/AAPL", timeout=30000)

                # Check if we're still authenticated by URL pattern
                current_url = page.url
                logger.debug(f"Current URL after navigation: {current_url}")

                is_authenticated_by_url = any(pattern in current_url for pattern in [
                    '/app/', '/Apps/', '/accounts/', '/Areas/Accounts', '/summary'
                ])

                # Check for login redirect patterns
                is_redirected = any(pattern in current_url for pattern in [
                    '/login', '/signin', '/auth', '/Access/'
                ])

                logger.debug(f"Authenticated by URL pattern: {is_authenticated_by_url}")
                logger.debug(f"Redirected to login: {is_redirected}")

                # Primary check: If we're not redirected and have a good URL pattern, we're authenticated
                if is_authenticated_by_url and not is_redirected:
                    logger.info("Session health check: PASSED - authenticated URL detected")
                    return True
                if is_redirected:
                    logger.warning("Session health check: FAILED - redirect to login detected")
                    return False

                # Secondary check: look for login form elements on the page
                try:
                    login_indicators = [
                        'input[type="password"]',
                        'input[name*="login"]',
                        'input[name*="user"]',
                        'input[id*="login"]',
                        'input[id*="user"]',
                        'button:has-text("Log In")',
                        'button:has-text("Sign In")'
                    ]
                    login_found = False
                    for selector in login_indicators:
                        if await page.query_selector(selector):
                            login_found = True
                            break
                except Exception as e:
                    logger.debug(f"Login form check error: {e}")
                    # If we can't check, assume healthy if we have valid cookies and no redirect
                    logger.info("Session health check: PASSED - based on cookies and URL")
                    return True

                if login_found:
                    logger.warning("Session health check: FAILED - login form detected")
                    return False
                logger.info("Session health check: PASSED - no login form detected")
                return True

            except Exception as e:
                logger.error(f"Session health check error: {e}")
                return False

            finally:
                # Single cleanup path for every exit above. Closing the context
                # is best-effort; the browser connection is always closed.
                if context is not None:
                    try:
                        await context.close()
                    except Exception as close_error:
                        logger.debug(f"Ignoring error while closing context: {close_error}")
                await browser.close()

    except Exception as e:
        logger.error(f"Session health check failed: {e}")
        return False
def get_session_info() -> Dict[str, Any]:
    """
    Summarise the cookies stored in the cookies file.

    A cookie is counted as session-related when its lower-cased name contains
    'session', 'auth' or 'token'. A cookie with a positive `expires` value is
    listed as expiring soon when fewer than seven days remain.

    Returns:
        Dict with the keys 'total_cookies', 'session_cookies',
        'expiring_cookies', 'expiring_soon' and 'session_status'. When the
        cookies file is missing or not valid JSON, the counts are zero,
        'session_status' is 'missing_cookies' and an 'error' key is included.
    """
    cookies_path = get_cookies_path()

    try:
        with open(cookies_path, 'r') as f:
            cookies = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {
            'error': 'No valid cookies found',
            'total_cookies': 0,
            'session_cookies': 0,
            'expiring_cookies': 0,
            'expiring_soon': [],
            'session_status': 'missing_cookies'
        }

    session_keywords = ['session', 'auth', 'token']
    seconds_per_day = 24 * 3600
    now = datetime.now().timestamp()

    session_cookies = []
    expiring_cookies = []
    for cookie in cookies:
        name = cookie.get('name', '')
        expires = cookie.get('expires', -1)

        # Session-related cookies are recognised purely by name.
        if any(keyword in name.lower() for keyword in session_keywords):
            session_cookies.append({
                'name': name,
                'domain': cookie.get('domain', ''),
                'expires': expires,
                'is_session_cookie': expires == -1
            })

        # NOTE(review): applied to every cookie, not only session-related
        # ones — nesting inferred from a flattened source; confirm.
        if expires != -1 and expires > 0:
            days_until_expire = (expires - now) / seconds_per_day
            if days_until_expire < 7:  # Expiring within a week
                expiring_cookies.append({
                    'name': name,
                    'days_until_expire': days_until_expire
                })

    status = 'active' if session_cookies else 'no_session_cookies'
    return {
        'total_cookies': len(cookies),
        'session_cookies': len(session_cookies),
        'expiring_cookies': len(expiring_cookies),
        'expiring_soon': expiring_cookies,
        'session_status': status
    }
async def ensure_valid_session() -> bool:
    """
    Make sure a usable session exists, refreshing it when the health check fails.

    Returns:
        bool: False when the cookies file is missing, invalid or empty; True
        when `maintain_session_health` passes; otherwise the result of
        `refresh_session_state` called with the stored cookies.
    """
    logger = logging.getLogger(__name__)

    # Without stored cookies there is nothing to check or refresh.
    cookies_path = get_cookies_path()
    try:
        with open(cookies_path, 'r') as f:
            cookies = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        logger.error("No valid cookies found")
        return False

    if not cookies:
        logger.error("No cookies available")
        return False

    healthy = await maintain_session_health()
    if healthy:
        logger.info("Session is healthy")
        return True

    logger.info("Session needs refresh, attempting navigation refresh")
    refreshed = await refresh_session_state(cookies)
    return refreshed
async def get_session_status(debug: bool = False) -> Envelope[dict]:
    """Return the cookie summary, validated against Schwab, in an envelope.

    Starts from `get_session_info()`. When that reports 'active', the session
    is additionally tested with `maintain_session_health`; a failed test
    rewrites 'session_status' to 'invalid' and adds a 'validation_error' entry.

    Args:
        debug: Not used by this function.

    Returns:
        `ok(info)` on completion (even when the session is reported invalid),
        or a retryable `fail(..., ErrorType.UNKNOWN)` if an exception occurs.
    """
    logger = logging.getLogger(__name__)
    try:
        info = get_session_info()

        cookies_look_active = info.get('session_status') == 'active'
        if cookies_look_active:
            logger.debug("Session cookies found, validating with Schwab...")
            # Cookie presence alone proves nothing; exercise the session.
            if await maintain_session_health():
                logger.debug("Session validation succeeded")
            else:
                info['session_status'] = 'invalid'
                info['validation_error'] = 'Session cookies exist but Schwab authentication failed'
                logger.warning("Session validation failed: cookies present but not accepted by Schwab")

        logger.debug("Session status info: %s", info)
        return ok(info)
    except Exception as exc:
        logger.exception("Failed to gather session status")
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)
async def refresh_session(debug: bool = False) -> Envelope[None]:
    """Run `refresh_session_state` and report the outcome as an envelope.

    Args:
        debug: Not used by this function.

    Returns:
        `ok(None)` when the refresh succeeded; a retryable AUTHENTICATION
        failure when it returned False; a retryable UNKNOWN failure when it
        raised.
    """
    logger = logging.getLogger(__name__)
    try:
        refreshed = await refresh_session_state()
        if not refreshed:
            logger.warning("Session refresh failed")
            return fail("Session refresh failed", ErrorType.AUTHENTICATION, retryable=True)
        logger.info("Session refresh succeeded")
        return ok(None)
    except Exception as exc:
        logger.exception("Exception during session refresh")
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)
async def set_cookies_from_file(path: str, debug: bool = False) -> Envelope[None]:
    """Copy the JSON content of `path` into the configured cookies file.

    Args:
        path: JSON file to import from.
        debug: Not used by this function.

    Returns:
        `ok(None)` on success; a non-retryable VALIDATION failure when a file
        is not found or the source is not valid JSON; a retryable UNKNOWN
        failure for any other exception.
    """
    logger = logging.getLogger(__name__)
    try:
        with open(path, "r") as source:
            imported = json.load(source)

        destination_path = get_cookies_path()
        with open(destination_path, "w") as destination:
            json.dump(imported, destination, indent=2)

        logger.info("Imported %s cookies from %s", len(imported), path)
        return ok(None)
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        logger.error("Failed to load cookies from %s: %s", path, exc)
        return fail(str(exc), ErrorType.VALIDATION, retryable=False)
    except Exception as exc:
        logger.exception("Unexpected error importing cookies from %s", path)
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)
async def export_cookies(path: str, debug: bool = False) -> Envelope[None]:
    """Copy the JSON content of the configured cookies file to `path`.

    Args:
        path: Destination file to write.
        debug: Not used by this function.

    Returns:
        `ok(None)` on success; a non-retryable AUTHENTICATION failure when a
        file is not found or the stored cookies are not valid JSON; a
        retryable UNKNOWN failure for any other exception.
    """
    logger = logging.getLogger(__name__)
    source_path = get_cookies_path()
    try:
        with open(source_path, "r") as source:
            stored = json.load(source)

        with open(path, "w") as destination:
            json.dump(stored, destination, indent=2)

        logger.info("Exported %s cookies to %s", len(stored), path)
        return ok(None)
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        logger.error("Failed to read cookies for export: %s", exc)
        return fail(str(exc), ErrorType.AUTHENTICATION, retryable=False)
    except Exception as exc:
        logger.exception("Unexpected error exporting cookies to %s", path)
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)

190
schwab_scraper/cli.py Normal file
View File

@@ -0,0 +1,190 @@
import asyncio
import argparse
import json
import os
from dataclasses import asdict, is_dataclass
from typing import Any
from . import unified_api
from .browser.auth import login_to_schwab
from .core.config import load_config, get_schwab_credentials, set_config_path, set_cookies_path
def _to_serializable(obj: Any) -> Any:
if is_dataclass(obj):
return asdict(obj)
if isinstance(obj, list):
return [_to_serializable(item) for item in obj]
if isinstance(obj, dict):
return {key: _to_serializable(value) for key, value in obj.items()}
return obj
def _print_envelope(envelope):
    """Print an envelope as indented JSON.

    The envelope is copied, its `data` entry is passed through
    `_to_serializable`, and values JSON cannot encode are rendered with `str`.
    """
    printable = dict(envelope)
    printable["data"] = _to_serializable(printable.get("data"))
    rendered = json.dumps(printable, indent=2, default=str)
    print(rendered)
async def test_scraper(ticker: str, debug: bool):
    """Fetch Morningstar data for `ticker` and print the resulting envelope."""
    print(f"Running scraper test for ticker: {ticker}")
    envelope = await unified_api.get_morningstar_data(ticker, debug=debug)
    _print_envelope(envelope)
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser that declares every CLI option."""
    parser = argparse.ArgumentParser(description="Schwab Morningstar Scraper CLI")
    add = parser.add_argument

    add("ticker", nargs='?', help="Stock ticker to scrape")
    add("--debug", action="store_true", help="Enable debug output")
    add("--login", action="store_true", help="Login only (don't scrape)")
    add("--test", action="store_true", help="Test mode")
    add("--phase1", action="store_true", help="Extract Phase 1 enhanced equity data (quote, dividends, earnings, valuation ratios)")

    # Configuration file paths
    add("--config-path", metavar="PATH", help="Custom path for config.json file")
    add("--cookies-path", metavar="PATH", help="Custom path for cookies.json file")

    # Session commands
    add("--session-status", action="store_true", help="Display current session status")
    add("--export-cookies", metavar="PATH", help="Export cookies to file")
    add("--set-cookies", metavar="PATH", help="Load cookies from file")

    # Transactions + accounts
    add("--transactions", action="store_true", help="Export and parse transaction history")
    add("--list-accounts", action="store_true", help="List available accounts")
    add("--account", help="Account identifier (ending digits like 604 or name like Joint)")
    add("--start-date", help="Start date for custom range (YYYY-MM-DD)")
    add("--end-date", help="End date for custom range (YYYY-MM-DD)")
    add("--time-period", help="Preset period (e.g., 'Current Month', 'Last 6 Months')")

    # Accounts & positions
    add("--account-overview", nargs='?', const="", help="Show balances for account or aggregate if omitted")
    add("--positions", nargs='?', const="", help="Show positions for account or aggregate if omitted")
    add("--portfolio-snapshot", nargs='?', const="", help="Show portfolio snapshot for account or aggregate if omitted")
    add("--include-non-equity", action="store_true", help="Include non-equity positions")
    add("--no-aggregate", action="store_true", help="Disable symbol aggregation in portfolio snapshot")
    return parser


async def async_main():
    """Parse the command line and run the first matching command.

    Path overrides (`--config-path`, `--cookies-path`) are applied first. The
    commands are then checked in a fixed order — login, session status, set
    cookies, export cookies, list accounts, account overview, positions,
    portfolio snapshot, transactions, ticker — and only the first one that
    matches is executed. With no command given, the help text is printed.
    """
    parser = _build_parser()
    args = parser.parse_args()
    debug = args.debug

    # Apply custom path overrides if provided
    if args.config_path:
        if not os.path.exists(args.config_path):
            print(f"Error: Config file not found: {args.config_path}")
            return
        set_config_path(args.config_path)

    if args.cookies_path:
        # cookies.json may not exist yet (it is created on first login), so
        # only the parent directory is required to exist.
        cookies_dir = os.path.dirname(args.cookies_path)
        if cookies_dir and not os.path.exists(cookies_dir):
            print(f"Error: Directory for cookies file does not exist: {cookies_dir}")
            return
        set_cookies_path(args.cookies_path)

    if args.login:
        # Set up debug logging when --debug is used
        if debug:
            import logging
            logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(name)s: %(message)s')
            print("Debug logging enabled")

        config = load_config()
        username, password = get_schwab_credentials(config)
        if not (username and password):
            print("Schwab username and password not found in config.json.")
            return

        print("Attempting to log in...")
        if debug:
            print(f"Using browserless server: {config.get('playwright', {}).get('url', 'default')}")
        cookies = await login_to_schwab(username, password)
        if cookies:
            print("Login successful and cookies saved.")
            print(f"Saved {len(cookies)} cookies to cookies.json")
        else:
            print("Login failed.")
        return

    if args.session_status:
        _print_envelope(await unified_api.get_session_status(debug=debug))
        return

    if args.set_cookies:
        _print_envelope(await unified_api.set_cookies(args.set_cookies, debug=debug))
        return

    if args.export_cookies:
        _print_envelope(await unified_api.export_cookies(args.export_cookies, debug=debug))
        return

    if args.list_accounts:
        _print_envelope(await unified_api.list_accounts(debug=debug))
        return

    # For the three options below, None means "flag absent" and the empty
    # string (the argparse `const`) means "flag given without an account".
    if args.account_overview is not None:
        account_arg = args.account_overview or None
        envelope = await unified_api.get_account_overview(account=account_arg, debug=debug)
        _print_envelope(envelope)
        return

    if args.positions is not None:
        account_arg = args.positions or None
        envelope = await unified_api.get_positions(
            account=account_arg,
            include_non_equity=args.include_non_equity,
            debug=debug,
        )
        _print_envelope(envelope)
        return

    if args.portfolio_snapshot is not None:
        account_arg = args.portfolio_snapshot or None
        envelope = await unified_api.get_portfolio_snapshot(
            account=account_arg,
            aggregate_by_symbol=not args.no_aggregate,
            include_non_equity=args.include_non_equity,
            debug=debug,
        )
        _print_envelope(envelope)
        return

    if args.transactions:
        envelope = await unified_api.get_transaction_history(
            account=args.account,
            start_date=args.start_date,
            end_date=args.end_date,
            time_period=args.time_period,
            debug=debug,
        )
        _print_envelope(envelope)
        return

    if not args.ticker:
        parser.print_help()
        return

    if args.test:
        await test_scraper(args.ticker, debug)
    elif args.phase1:
        print(f"Extracting Phase 1 enhanced equity data for {args.ticker}...")
        envelope = await unified_api.get_equity_phase1_data(args.ticker, debug=debug)
        _print_envelope(envelope)
    else:
        print(f"Scraping Morningstar data for {args.ticker}...")
        envelope = await unified_api.get_morningstar_data(args.ticker, debug=debug)
        _print_envelope(envelope)
def main():
    """Console-script entry point: run `async_main` on a new event loop."""
    cli_coroutine = async_main()
    asyncio.run(cli_coroutine)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,20 @@
from .contracts import ( # noqa: F401
Envelope,
ErrorType,
AccountOverview,
AccountSummary,
Lot,
MorningstarData,
PortfolioSnapshot,
Position,
SessionStatus,
Transaction,
# Phase 1 data structures
QuoteData,
EnhancedDividends,
EarningsData,
CalculatedMetrics,
EquityPhase1Data,
fail,
ok,
)

View File

@@ -0,0 +1,134 @@
import json
import logging
import os
from typing import Optional
# Module-level state for runtime path overrides.
# Written by set_config_path() / set_cookies_path() and consulted first by
# get_config_path() / get_cookies_path(); None means "no override".
_config_path_override: Optional[str] = None
_cookies_path_override: Optional[str] = None
def set_config_path(path: Optional[str]) -> None:
    """
    Override the location of config.json for the rest of the process.

    The override is consulted before the environment variable and the default
    locations. It is stored in a module-level variable, so it is shared by all
    callers and is not thread-safe; this suits single-threaded CLI usage or a
    single async operation.

    Args:
        path: Absolute or relative path to the config file, or None to clear
            the override.
    """
    global _config_path_override
    _config_path_override = path
    return None
def set_cookies_path(path: Optional[str]) -> None:
    """
    Override the location of cookies.json for the rest of the process.

    The override is consulted before the environment variable and the default
    location. It is stored in a module-level variable, so it is shared by all
    callers and is not thread-safe; this suits single-threaded CLI usage or a
    single async operation.

    Args:
        path: Absolute or relative path to the cookies file, or None to clear
            the override.
    """
    global _cookies_path_override
    _cookies_path_override = path
    return None
def get_config_path() -> str:
    """
    Resolve the configuration file path.

    Sources are tried in this order and the first non-empty one wins:
        1. The runtime override set through `set_config_path`.
        2. The `SCHWAB_CONFIG_PATH` environment variable.
        3. `config.json` in the parent of this module's directory, if that
           file exists.
        4. `config.json` relative to the current working directory.

    Returns:
        str: Path to the configuration file (not guaranteed to exist for
        sources 1, 2 and 4).
    """
    explicit_path = _config_path_override or os.environ.get('SCHWAB_CONFIG_PATH')
    if explicit_path:
        return explicit_path

    # Package root first (for development/installed package), then the CWD.
    module_dir = os.path.dirname(__file__)
    package_candidate = os.path.join(module_dir, '..', 'config.json')
    return package_candidate if os.path.exists(package_candidate) else 'config.json'
def get_cookies_path() -> str:
    """
    Resolve the cookies file path.

    Sources are tried in this order and the first non-empty one wins:
        1. The runtime override set through `set_cookies_path`.
        2. The `SCHWAB_COOKIES_PATH` environment variable.
        3. `cookies.json` relative to the current working directory.

    Returns:
        str: Path to the cookies file (the file is not required to exist).
    """
    explicit_path = _cookies_path_override or os.environ.get('SCHWAB_COOKIES_PATH')
    return explicit_path if explicit_path else 'cookies.json'
def load_config():
    """Read and parse the JSON configuration file.

    The path comes from `get_config_path()`.

    Returns:
        The parsed JSON content, or None when the file does not exist or does
        not contain valid JSON (an error is logged in both cases).
    """
    logger = logging.getLogger(__name__)
    config_path = get_config_path()

    try:
        with open(config_path, 'r') as config_file:
            parsed = json.load(config_file)
    except json.JSONDecodeError:
        logger.error(f"Invalid JSON in config file at {config_path}")
        return None
    except FileNotFoundError:
        logger.error(f"config.json not found at {config_path}. Please create one based on config.json.sample")
        return None
    return parsed
def get_playwright_url(config=None):
    """Resolve the Playwright (browserless) WebSocket URL.

    Lookup order: the ``SCHWAB_PLAYWRIGHT_URL`` environment variable, then
    ``config['playwright']['url']``, then a hard-coded default.

    Args:
        config: Already-loaded configuration mapping. When ``None`` the
            configuration is loaded with ``load_config()``.

    Returns:
        str: The URL to connect to.
    """
    import os

    from_env = os.environ.get('SCHWAB_PLAYWRIGHT_URL')
    if from_env:
        return from_env

    settings = load_config() if config is None else config
    if not settings or 'playwright' not in settings or 'url' not in settings['playwright']:
        # Default fallback URL
        return "ws://browser.local.ben.io:3000/playwright/chromium"
    return settings['playwright']['url']
def get_schwab_credentials(config=None):
    """Return the ``(username, password)`` pair from the ``schwab`` config section.

    Args:
        config: Already-loaded configuration mapping. When ``None`` the
            configuration is loaded with ``load_config()``.

    Returns:
        tuple: ``(username, password)``; either item is ``None`` when its key
        is absent, and ``(None, None)`` when there is no ``schwab`` section.
    """
    if config is None:
        config = load_config()
    if not config or 'schwab' not in config:
        return None, None
    section = config['schwab']
    return section.get('username'), section.get('password')

View File

@@ -0,0 +1,271 @@
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from decimal import Decimal
from enum import Enum
from typing import Generic, Optional, TypeVar
from typing_extensions import TypedDict
T = TypeVar("T")
class ErrorType(str, Enum):
    """Categorisation for envelope failures.

    Members also subclass ``str``, so each one compares equal to its own
    value (for example ``ErrorType.NETWORK == "NETWORK"``).
    """
    AUTHENTICATION = "AUTHENTICATION"
    NETWORK = "NETWORK"
    PARSING = "PARSING"
    VALIDATION = "VALIDATION"
    # Fallback category; fail() uses it for unrecognised error_type values.
    UNKNOWN = "UNKNOWN"
class Envelope(TypedDict, Generic[T]):
    """Standard response envelope for unified API operations.

    ``ok()`` builds the success shape (``data`` set, error fields ``None``);
    ``fail()`` builds the failure shape (``data`` is ``None``).
    """
    # True for envelopes built by ok(), False for those built by fail().
    success: bool
    # Payload on success; None on failure.
    data: Optional[T]
    # Human-readable error message; None on success.
    error: Optional[str]
    # Failure category; None on success.
    error_type: Optional[ErrorType]
    # Always False on success; set by the caller of fail() on failure.
    retryable: bool
def ok(data: T) -> Envelope[T]:
    """Wrap *data* in a success envelope.

    Args:
        data: Payload to return to the caller.

    Returns:
        An envelope with ``success`` set to ``True``, *data* as the payload,
        no error information, and ``retryable`` set to ``False``.
    """
    envelope = {
        "success": True,
        "data": data,
        "error": None,
        "error_type": None,
        "retryable": False,
    }
    return envelope
def fail(
    error: str,
    error_type: ErrorType | str = ErrorType.UNKNOWN,
    retryable: bool = False,
) -> Envelope[None]:
    """Build a failure envelope carrying error metadata.

    Args:
        error: Human-readable error message.
        error_type: An ``ErrorType`` member or its string value. A value that
            ``ErrorType(...)`` rejects with ``ValueError`` is recorded as
            ``ErrorType.UNKNOWN``.
        retryable: Whether the caller may retry the operation.

    Returns:
        An envelope with ``success`` set to ``False`` and ``data`` ``None``.
    """
    category = error_type
    if not isinstance(category, ErrorType):
        # Coerce raw strings; anything unrecognised degrades to UNKNOWN.
        try:
            category = ErrorType(category)
        except ValueError:
            category = ErrorType.UNKNOWN

    envelope = {
        "success": False,
        "data": None,
        "error": error,
        "error_type": category,
        "retryable": retryable,
    }
    return envelope
@dataclass(slots=True)
class SessionStatus:
    """Represents the current authentication session state.

    Only ``logged_in`` is required; every other field has a default.
    """
    logged_in: bool
    session_age_minutes: Optional[int] = None
    last_refresh: Optional[datetime] = None
    needs_mfa: bool = False
    # NOTE(review): defaults to True although nothing in this class validates
    # cookies -- confirm that producers always set it explicitly.
    cookies_valid: bool = True
@dataclass(slots=True)
class AccountSummary:
    """Summary details for a Schwab account.

    ``id``, ``label`` and ``type`` are required; ``last4`` and ``is_margin``
    are optional.
    """
    # `id` and `type` shadow builtins only as attribute names; they are part
    # of the public field set and cannot be renamed without breaking callers.
    id: str
    label: str
    type: str
    # presumably the trailing digits of the account number -- confirm with producers
    last4: Optional[str] = None
    is_margin: bool = False
@dataclass(slots=True)
class AccountOverview:
    """Aggregated balance snapshot for an account.

    ``account`` is required. Every monetary field is an optional ``Decimal``
    and defaults to ``None``.
    """
    account: AccountSummary
    total_value: Optional[Decimal] = None
    day_change: Optional[Decimal] = None
    # A float, unlike the Decimal monetary fields.
    day_change_pct: Optional[float] = None
    cash: Optional[Decimal] = None
    settled_cash: Optional[Decimal] = None
    buying_power: Optional[Decimal] = None
    margin_balance: Optional[Decimal] = None
@dataclass(slots=True)
class Lot:
    """Individual lot information within a position.

    All fields are optional and default to ``None``.
    """
    # Stored as text, not as a date object.
    acquired_date: Optional[str] = None
    quantity: Optional[float] = None
    cost_basis: Optional[Decimal] = None
    lot_id: Optional[str] = None
@dataclass(slots=True)
class Position:
    """Holding data for a specific security.

    Only ``symbol`` is required. Monetary fields are optional ``Decimal``
    values; quantity and percentage are optional floats.
    """
    symbol: str
    description: Optional[str] = None
    asset_type: Optional[str] = None
    quantity: Optional[float] = None
    market_price: Optional[Decimal] = None
    market_value: Optional[Decimal] = None
    cost_basis_total: Optional[Decimal] = None
    unrealized_gain: Optional[Decimal] = None
    unrealized_gain_pct: Optional[float] = None
    # default_factory gives each instance its own list instead of a shared one.
    lots: list[Lot] = field(default_factory=list)
@dataclass(slots=True)
class PortfolioSnapshot:
    """Aggregated view of equity holdings across accounts."""
    equities: list[Position]
    total_value: Optional[Decimal] = None
    # Not derived from `equities` automatically; the constructor's caller
    # must pass the matching count.
    count: int = 0
@dataclass(slots=True)
class MorningstarData:
    """Unified Morningstar data payload (existing equity fields).

    Only ``ticker`` is required. Every other field is optional, and all of
    them are strings except ``rating`` (an int).
    """
    ticker: str
    company_name: Optional[str] = None
    # Dividend fields
    previous_dividend_payment: Optional[str] = None
    previous_pay_date: Optional[str] = None
    previous_ex_date: Optional[str] = None
    frequency: Optional[str] = None
    annual_dividend_rate: Optional[str] = None
    annual_dividend_yield: Optional[str] = None
    # Report fields (same names as the fields of MorningstarPdfData in models)
    fair_value: Optional[str] = None
    economic_moat: Optional[str] = None
    capital_allocation: Optional[str] = None
    rating: Optional[int] = None
    one_star_price: Optional[str] = None
    five_star_price: Optional[str] = None
    assessment: Optional[str] = None
    range_52_week: Optional[str] = None
    dividend_yield: Optional[str] = None
    investment_style: Optional[str] = None
    report_url: Optional[str] = None
    report_date: Optional[str] = None
    source: Optional[str] = None
@dataclass(slots=True)
class Transaction:
    """Normalized transaction record matching transactions feature.

    No field has a default, so all eight must be supplied. Every value is
    kept as text, including the numeric-looking ones.
    """
    date: str
    action: str
    symbol: Optional[str]
    description: str
    quantity: Optional[str]
    price: Optional[str]
    fees_comm: Optional[str]
    amount: Optional[str]
# Phase 1 Data Structures
@dataclass(slots=True)
class QuoteData:
    """Quote and price data from symbol bar.

    All fields are optional and default to ``None``.
    """
    price: Optional[float] = None
    change: Optional[float] = None
    change_percent: Optional[float] = None
    after_hours_price: Optional[float] = None
    after_hours_change: Optional[float] = None
    after_hours_change_percent: Optional[float] = None
    bid: Optional[float] = None
    ask: Optional[float] = None
    # Kept as text rather than parsed into numbers.
    bid_ask_size: Optional[str] = None
    previous_close: Optional[float] = None
    # Shadows the builtin only as an attribute name.
    open: Optional[float] = None
    volume: Optional[int] = None
    volume_vs_avg: Optional[str] = None
    day_range_low: Optional[float] = None
    day_range_high: Optional[float] = None
    week_52_low: Optional[float] = None
    week_52_high: Optional[float] = None
    # Kept as text rather than parsed into a number.
    market_cap: Optional[str] = None
    sector: Optional[str] = None
    exchange: Optional[str] = None
@dataclass(slots=True)
class EnhancedDividends:
    """Enhanced dividend data including forward-looking information.

    All fields are optional and default to ``None``. Amounts, rates and
    yields are floats; dates and frequency are text.
    """
    # Forward-looking data (Phase 1)
    next_payment: Optional[float] = None
    next_pay_date: Optional[str] = None
    next_ex_date: Optional[str] = None
    # Existing data
    frequency: Optional[str] = None
    annual_rate: Optional[float] = None
    annual_yield: Optional[float] = None
    previous_payment: Optional[float] = None
    previous_pay_date: Optional[str] = None
    previous_ex_date: Optional[str] = None
@dataclass(slots=True)
class EarningsData:
    """Core earnings metrics and forecasts.

    Scalar fields are optional and default to ``None``; the two list fields
    default to a fresh empty list per instance.
    """
    # Upcoming earnings
    next_announcement_date: Optional[str] = None
    announcement_timing: Optional[str] = None
    analysts_covering: Optional[int] = None
    consensus_estimate: Optional[float] = None
    estimate_high: Optional[float] = None
    estimate_low: Optional[float] = None
    # Historical earnings
    eps_ttm: Optional[float] = None
    revenue_ttm: Optional[float] = None  # Stored in dollars
    pe_ttm: Optional[float] = None
    forward_pe: Optional[float] = None
    peg_ratio: Optional[float] = None
    # Beat/miss history (simplified for Phase 1)
    # The dict schema is not declared here -- check the code that fills these lists.
    recent_beats: list[dict] = field(default_factory=list)
    future_estimates: list[dict] = field(default_factory=list)
@dataclass(slots=True)
class CalculatedMetrics:
    """Calculated metrics derived from other data.

    Currently holds a single optional metric.
    """
    # NOTE(review): unit (fraction vs. percent) is not visible here -- confirm at the producer.
    payout_ratio: Optional[float] = None
@dataclass(slots=True)
class EquityPhase1Data:
    """Complete Phase 1 enhanced equity data.

    Groups the per-ticker sections; each section is optional and is ``None``
    when it was not supplied.
    """
    ticker: str
    quote: Optional[QuoteData] = None
    dividends: Optional[EnhancedDividends] = None
    earnings: Optional[EarningsData] = None
    calculated_metrics: Optional[CalculatedMetrics] = None

View File

@@ -0,0 +1,30 @@
class ScraperError(Exception):
    """Base class for scraper-related errors.

    The other exception classes in this module derive from it, so
    ``except ScraperError`` catches all of them.
    """
class SessionExpiredError(ScraperError):
    """Scraper error subtype; by its name, an expired session (confirm at raise sites)."""
class LoginError(ScraperError):
    """Scraper error subtype; by its name, a failed login (confirm at raise sites)."""
class InvalidTickerError(ScraperError):
    """Scraper error subtype; by its name, an unrecognised ticker (confirm at raise sites)."""
class NoDataError(ScraperError):
    """Scraper error subtype; by its name, no data was available (confirm at raise sites)."""
class DownloadError(ScraperError):
    """Scraper error subtype; by its name, a failed download (confirm at raise sites)."""
class PdfParseError(ScraperError):
    """Scraper error subtype; by its name, a PDF that could not be parsed (confirm at raise sites)."""
class NavigationError(ScraperError):
    """Scraper error subtype; by its name, a failed page navigation (confirm at raise sites)."""

View File

@@ -0,0 +1,66 @@
from dataclasses import dataclass
from typing import Optional, List
@dataclass
class DividendsData:
    """Dividend fields for one security.

    All fields are optional strings that default to ``None``.
    """
    previous_payment: Optional[str] = None
    previous_pay_date: Optional[str] = None
    previous_ex_date: Optional[str] = None
    frequency: Optional[str] = None
    annual_dividend_rate: Optional[str] = None
    annual_dividend_yield: Optional[str] = None
@dataclass
class MorningstarPdfData:
    """Morningstar report fields (presumably parsed from the report PDF -- confirm).

    All fields are optional and default to ``None``; every field is a string
    except ``rating`` (an int).
    """
    fair_value: Optional[str] = None
    economic_moat: Optional[str] = None
    capital_allocation: Optional[str] = None
    rating: Optional[int] = None
    one_star_price: Optional[str] = None
    five_star_price: Optional[str] = None
    assessment: Optional[str] = None
    range_52_week: Optional[str] = None
    dividend_yield: Optional[str] = None
    investment_style: Optional[str] = None
    report_url: Optional[str] = None
    report_date: Optional[str] = None
@dataclass
class ScrapeResult:
    """Combined scrape output for one ticker.

    No field has a default, so all five must be supplied.
    """
    ticker: str
    company_name: Optional[str]
    dividends: DividendsData
    morningstar: MorningstarPdfData
    source: str  # "live" | "cache"
# -------------------- Transactions Feature --------------------
@dataclass
class AccountInfo:
    """Account descriptor used by the transactions feature."""
    account_type: str  # e.g., "Joint", "IRA", "Individual"
    account_ending: str  # e.g., "604", "197", "873"
    full_description: str  # e.g., "Joint …604 (Account ending in 6 0 4)"
    # Only field with a default.
    is_selected: bool = False
@dataclass
class TransactionRecord:
    """One transaction row.

    No field has a default, so all eight must be supplied. Every value is
    kept as text, including the numeric-looking ones.
    """
    date: str
    action: str
    symbol: Optional[str]
    description: str
    quantity: Optional[str]
    price: Optional[str]
    fees_comm: Optional[str]
    amount: Optional[str]
@dataclass
class TransactionData:
    """Transactions for one account together with export metadata.

    No field has a default, so all six must be supplied.
    """
    account_info: AccountInfo
    transactions: List[TransactionRecord]
    date_range: str
    export_date: str
    # Not derived from `transactions` automatically; supplied by the constructor's caller.
    total_transactions: int
    source: str  # "live" | "cache"

View File

View File

@@ -0,0 +1,14 @@
"""Unified accounts and positions feature package."""
from .accounts_scraper import list_accounts
from .overview_scraper import get_account_overview
from .positions_scraper import get_positions
from .portfolio_scraper import get_portfolio_snapshot
__all__ = [
"list_accounts",
"get_account_overview",
"get_positions",
"get_portfolio_snapshot",
]

View File

@@ -0,0 +1,153 @@
from __future__ import annotations
import asyncio
import re
from typing import Optional
from ...core import AccountSummary, Envelope, ErrorType, fail, ok
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...browser.auth import ensure_cookies
from ...core.config import get_playwright_url, load_config
# Use the same URL as transactions feature for consistency and reliability
TRANSACTION_HISTORY_URL = "https://client.schwab.com/app/accounts/history/#/"
def _normalize_account_option(text: str, value: str) -> Optional[AccountSummary]:
    """Build an ``AccountSummary`` from a dropdown option's text and value.

    Args:
        text: Visible option text; runs of whitespace are collapsed.
        value: The option's value; used as the account id when non-blank.

    Returns:
        The summary, or ``None`` when *text* is blank. The id falls back to
        the detected digits and then to the normalized text.
    """
    stripped = text.strip()
    if not stripped:
        return None

    collapsed = re.sub(r"\s+", " ", stripped)

    # Digits are searched with spaces removed so "6 0 4" reads as "604".
    digits = re.search(r"(\d{3,4})", collapsed.replace(" ", ""))
    last4 = None if digits is None else digits.group(1)[-4:]

    leading = re.search(r"^([A-Za-z&'\- ]+)", collapsed)
    kind = leading.group(1).strip() if leading else "Account"
    kind = kind.replace(" ", "_")

    # First non-empty of: option value, detected digits, normalized text.
    account_id = value.strip() or last4 or collapsed

    return AccountSummary(
        id=account_id,
        label=collapsed,
        type=kind,
        last4=last4,
        is_margin="margin" in collapsed.lower(),
    )
async def list_accounts(debug: bool = False) -> Envelope[list[AccountSummary]]:
    """
    Discover accounts from Schwab transaction history page.

    Uses the robust account discovery logic from the transactions feature
    which handles multiple selector patterns and has enhanced reliability.

    Args:
        debug: When True, print discovery diagnostics to stdout; the flag is
            also forwarded to the navigation and discovery helpers.

    Returns:
        A success envelope holding the de-duplicated ``AccountSummary`` list,
        or a failure envelope (authentication, parsing or unknown). Any
        exception raised inside the ``try`` block is converted into a
        retryable ``ErrorType.UNKNOWN`` failure instead of propagating.
    """
    cookies = await ensure_cookies()
    if not cookies:
        return fail("Unable to establish Schwab session.", ErrorType.AUTHENTICATION, retryable=False)
    config = load_config()
    playwright_url = get_playwright_url(config)
    # Pre-bind every handle to None so the finally block can clean up safely
    # no matter how far setup got before failing.
    playwright = browser = context = page = None
    try:
        playwright, browser = await connect(playwright_url)
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)
        if not await goto_with_auth_check(page, context, TRANSACTION_HISTORY_URL, debug=debug):
            return fail("Failed to load transaction history for account discovery.", ErrorType.AUTHENTICATION, retryable=True)
        # Allow page to fully load
        await asyncio.sleep(2)
        # Use the robust account discovery from transactions feature
        # (imported here rather than at module top -- presumably to avoid a
        # circular import; confirm before moving it).
        from ..transactions.scraper import discover_accounts_from_page
        discovered_accounts = await discover_accounts_from_page(page, debug=debug)
        if not discovered_accounts:
            return fail("Account dropdown not found on transaction history page.", ErrorType.PARSING, retryable=True)
        # Convert discovered accounts to AccountSummary objects
        accounts: list[AccountSummary] = []
        seen_ids: set[str] = set()
        for acc in discovered_accounts:
            # Create AccountSummary from discovered account info
            # NOTE(review): the 'label' fallback only applies when the
            # 'ending' key is missing; an 'ending' that is present but empty
            # makes the account be skipped -- confirm that is intended.
            account_id = acc.get('ending', acc.get('label', ''))
            if account_id and account_id not in seen_ids:
                summary = AccountSummary(
                    id=account_id,
                    label=acc.get('label', ''),
                    type=acc.get('type', 'Account'),
                    last4=acc.get('ending', ''),
                    is_margin=False,  # Will be enhanced in future if needed
                )
                accounts.append(summary)
                seen_ids.add(account_id)
        if not accounts:
            return fail("No accounts discovered from Schwab transaction history.", ErrorType.PARSING, retryable=True)
        if debug:
            print(f"DEBUG: Successfully discovered {len(accounts)} accounts:")
            for acc in accounts:
                print(f"DEBUG: - {acc.label} (type: {acc.type}, last4: {acc.last4})")
        return ok(accounts)
    except Exception as exc:
        if debug:
            print(f"DEBUG: Account discovery error: {exc}")
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)
    finally:
        # Tear down in reverse order of creation; each helper ignores None
        # handles and swallows close errors.
        await _safe_close_page(page)
        await _safe_close_context(context)
        await _safe_close_browser(browser)
        await _safe_stop_playwright(playwright)
async def _safe_close_page(page) -> None:
if page is None:
return
try:
await page.close()
except Exception:
pass
async def _safe_close_context(context) -> None:
if context is None:
return
try:
await context.close()
except Exception:
pass
async def _safe_close_browser(browser) -> None:
if browser is None:
return
try:
await browser.close()
except Exception:
pass
async def _safe_stop_playwright(playwright) -> None:
if playwright is None:
return
try:
await playwright.stop()
except Exception:
pass

View File

@@ -0,0 +1,426 @@
from __future__ import annotations
import asyncio
import re
from decimal import Decimal, InvalidOperation
from typing import Any, Optional, Sequence
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import AccountOverview, AccountSummary, Envelope, ErrorType, fail, ok
from ...core.config import get_playwright_url, load_config
SUMMARY_URL = "https://client.schwab.com/accounts/summary/summary.aspx/"
def _parse_currency(value: str | None) -> Optional[Decimal]:
if not value:
return None
cleaned = value.strip()
if not cleaned or cleaned in {"-", "--"}:
return None
negative = False
if cleaned.startswith("(") and cleaned.endswith(")"):
negative = True
cleaned = cleaned.replace("$", "").replace(",", "")
cleaned = cleaned.replace("(", "").replace(")", "")
cleaned = cleaned.replace("", "-").strip()
if not cleaned:
return None
try:
parsed = Decimal(cleaned)
if negative or parsed < 0:
parsed = -abs(parsed)
return parsed
except InvalidOperation:
return None
def _parse_percentage(value: str | None) -> Optional[float]:
if not value:
return None
cleaned = value.strip()
if not cleaned:
return None
negative = False
if cleaned.startswith("(") and cleaned.endswith(")"):
negative = True
cleaned = cleaned.replace("%", "").replace("(", "").replace(")", "")
cleaned = cleaned.replace("", "-").strip()
if not cleaned:
return None
try:
parsed = float(cleaned)
except ValueError:
return None
if negative or parsed < 0:
parsed = -abs(parsed)
return parsed
def _normalize_account_label(label: str) -> AccountSummary:
    """Derive an ``AccountSummary`` from an account label shown on the page.

    Whitespace is collapsed, a 3-4 digit group is taken as ``last4``, and the
    leading run of letters/spaces becomes the account type with spaces turned
    into underscores. The id is ``"<type>-<last4>"``, or just the type when no
    digits are found.
    """
    collapsed = re.sub(r"\s+", " ", label).strip()

    # Digits are searched with spaces removed so "6 0 4" reads as "604".
    digits = re.search(r"(\d{3,4})\b", collapsed.replace(" ", ""))
    last4 = digits.group(1)[-4:] if digits else None

    prefix = re.search(r"^[A-Za-z&'\- ]+", collapsed)
    if prefix:
        account_type = re.sub(r"\s+", "_", prefix.group(0).strip())
    else:
        account_type = "Account"

    account_id = account_type if not last4 else f"{account_type}-{last4}"
    return AccountSummary(
        id=account_id,
        label=collapsed,
        type=account_type,
        last4=last4,
        is_margin="margin" in collapsed.lower(),
    )
def _match_account(candidate: AccountSummary, requested: AccountSummary | str | None) -> bool:
if requested is None:
return True
if isinstance(requested, AccountSummary):
requested_values = {
requested.id.lower(),
requested.label.lower(),
}
if requested.last4:
requested_values.add(requested.last4.lower())
else:
lookup = requested.strip().lower()
requested_values = {lookup}
candidate_values = {candidate.id.lower(), candidate.label.lower()}
if candidate.last4:
candidate_values.add(candidate.last4.lower())
return bool(candidate_values & requested_values)
def _rows_to_dicts(headers: Sequence[str], rows: Sequence[Sequence[str]]) -> list[dict[str, str]]:
normalized_headers = [header.strip().lower() for header in headers]
results: list[dict[str, str]] = []
for row in rows:
row_map: dict[str, str] = {}
for idx, header in enumerate(normalized_headers):
if idx < len(row):
row_map[header] = row[idx].strip()
results.append(row_map)
return results
async def _extract_table(page) -> dict[str, Any] | None:
    """Read the account table from the page as raw header and cell text.

    The script runs in the browser. It looks inside ``.sdps-tables__wrapper``,
    reads header text (falling back to ``thead th``) and body rows (falling
    back to ``tbody tr``), and trims every string.

    Returns:
        The value produced by the script: ``{"headers": [...], "rows": [[...]]}``,
        or ``None`` when the wrapper element is not present.
    """
    script = """
() => {
const wrapper = document.querySelector('.sdps-tables__wrapper');
if (!wrapper) {
return null;
}
const headerRow = wrapper.querySelector('.sdps-tables__row--header');
const headers = headerRow
? Array.from(headerRow.querySelectorAll('.sdps-tables__header-text'))
.map((el) => (el.textContent || '').trim())
: [];
if (!headers.length) {
const legacyHeaders = wrapper.querySelectorAll('thead th');
if (legacyHeaders.length) {
for (const th of legacyHeaders) {
headers.push((th.textContent || '').trim());
}
}
}
const bodyRows = wrapper.querySelectorAll('.sdps-tables__row--body');
const rows = [];
if (bodyRows.length) {
bodyRows.forEach((row) => {
const cells = Array.from(
row.querySelectorAll('.sdps-tables__cell, div[role="cell"], td')
).map((cell) => (cell.textContent || '').trim());
rows.push(cells);
});
}
if (!rows.length) {
const fallbackRows = wrapper.querySelectorAll('tbody tr');
fallbackRows.forEach((row) => {
const cells = Array.from(row.querySelectorAll('td')).map((cell) => (cell.textContent || '').trim());
if (cells.length) {
rows.push(cells);
}
});
}
return { headers, rows };
}
"""
    return await page.evaluate(script)
async def _extract_totals(page) -> dict[str, str | None]:
    """Read the page-level totals as raw display strings.

    The script runs in the browser and looks for ``#total-value-label``,
    ``#day-change-label`` and a header containing "cash & cash investments",
    then extracts the first currency (and percentage) match near each one.

    Returns:
        The value produced by the script: a dict with keys ``total``,
        ``dayChange``, ``dayChangePct`` and ``cash``; a value is ``None`` when
        nothing matched.
    """
    # NOTE(review): the currency pattern has no sign, so a negative day change
    # would presumably come back without its minus -- confirm against the page.
    script = r"""
() => {
const result = { total: null, dayChange: null, dayChangePct: null, cash: null };
const totalLabel = document.querySelector('#total-value-label');
if (totalLabel) {
const valueEl = totalLabel.closest('[class*="sdps-panel"], h2, div');
if (valueEl) {
const currencyMatch = valueEl.textContent?.match(/\$[\d,]+\.?\d*/);
if (currencyMatch) {
result.total = currencyMatch[0];
}
}
}
const dayChangeLabel = document.querySelector('#day-change-label');
if (dayChangeLabel) {
const container = dayChangeLabel.parentElement;
if (container) {
const matchCurrency = container.textContent?.match(/\$[\d,]+\.?\d*/);
const matchPct = container.textContent?.match(/-?\d+(?:\.\d+)?%/);
if (matchCurrency) {
result.dayChange = matchCurrency[0];
}
if (matchPct) {
result.dayChangePct = matchPct[0];
}
}
}
const cashLabel = Array.from(document.querySelectorAll('.sdps-tables__header-text')).find((el) =>
el.textContent?.toLowerCase().includes('cash & cash investments')
);
if (cashLabel) {
const container = cashLabel.closest('div');
if (container) {
const matchCurrency = container.textContent?.match(/\$[\d,]+\.?\d*/);
if (matchCurrency) {
result.cash = matchCurrency[0];
}
}
}
return result;
}
"""
    return await page.evaluate(script)
def _row_to_overview(row_map: dict[str, str]) -> tuple[AccountSummary, AccountOverview]:
    """Build the account summary and balance overview for one table row.

    Args:
        row_map: Cell text keyed by lower-cased header, as produced by
            ``_rows_to_dicts``. Several alternative header names are tried
            for most fields.

    Returns:
        ``(summary, overview)`` where ``overview.account`` is ``summary``.
    """

    def first_of(*keys: str):
        # Mirrors an ``a or b or c`` chain over row_map.get(): the first
        # truthy lookup, otherwise the result of the last lookup.
        found = None
        for key in keys:
            found = row_map.get(key)
            if found:
                break
        return found

    raw_label = first_of('name', 'account', 'account name', '') or "Account"
    summary = _normalize_account_label(raw_label)

    overview = AccountOverview(
        account=summary,
        total_value=_parse_currency(first_of('account value', 'total value', 'market value')),
        day_change=_parse_currency(first_of('day change $', 'day change', 'day change amount')),
        day_change_pct=_parse_percentage(first_of('day change %', 'day change percent')),
        cash=_parse_currency(first_of('cash & cash investments', 'cash')),
        settled_cash=_parse_currency(row_map.get('settled cash')),
        buying_power=_parse_currency(first_of('buying power', 'available to trade')),
        margin_balance=_parse_currency(first_of('margin balance', 'margin')),
    )
    return summary, overview
async def get_account_overview(
    account: AccountSummary | str | None = None, *, debug: bool = False
) -> Envelope[AccountOverview]:
    """Fetch the balance overview for one account, or for all accounts combined.

    Args:
        account: Account to look up (an ``AccountSummary`` or an identifier
            string matched by ``_match_account``). ``None`` matches every row;
            when more than one row matches, the rows are aggregated.
        debug: Forwarded to the navigation helper.

    Returns:
        A success envelope with the ``AccountOverview``, or a failure envelope
        (authentication, parsing, validation or unknown). Any exception raised
        inside the ``try`` block becomes a retryable ``ErrorType.UNKNOWN``
        failure instead of propagating.
    """
    cookies = await ensure_cookies()
    if not cookies:
        return fail("Unable to establish Schwab session.", ErrorType.AUTHENTICATION, retryable=False)
    config = load_config()
    playwright_url = get_playwright_url(config)
    # Pre-bind every handle to None so the finally block can clean up safely
    # no matter how far setup got before failing.
    playwright = browser = context = page = None
    try:
        playwright, browser = await connect(playwright_url)
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)
        if not await goto_with_auth_check(page, context, SUMMARY_URL, debug=debug):
            return fail("Failed to load Schwab account summary page.", ErrorType.AUTHENTICATION, retryable=True)
        # Fixed pause before reading the table (presumably to let it render).
        await asyncio.sleep(1)
        table_data = await _extract_table(page)
        if not table_data:
            return fail("Unable to locate account overview table.", ErrorType.PARSING, retryable=True)
        row_dicts = _rows_to_dicts(table_data["headers"], table_data["rows"])
        matched_overviews: list[AccountOverview] = []
        for row_map in row_dicts:
            # Skip rows whose cells are all empty.
            # NOTE(review): a totals row that contains text is NOT skipped by
            # this check and would be treated as an account -- confirm.
            values = "".join(row_map.values())
            if not values:
                continue
            summary, overview = _row_to_overview(row_map)
            if _match_account(summary, account):
                matched_overviews.append(overview)
        if not matched_overviews:
            return fail("Account not found in overview table.", ErrorType.VALIDATION, retryable=False)
        if account is None and len(matched_overviews) > 1:
            aggregated = _aggregate_overviews(matched_overviews)
            # Page-level totals, when found, override the summed values.
            totals = await _extract_totals(page)
            if totals:
                if totals.get("total"):
                    aggregated.total_value = _parse_currency(totals.get("total"))
                if totals.get("dayChange"):
                    aggregated.day_change = _parse_currency(totals.get("dayChange"))
                if totals.get("dayChangePct"):
                    aggregated.day_change_pct = _parse_percentage(totals.get("dayChangePct"))
                if totals.get("cash"):
                    aggregated.cash = _parse_currency(totals.get("cash"))
            return ok(aggregated)
        # A specific account (or a single row): return the first match.
        return ok(matched_overviews[0])
    except Exception as exc:
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)
    finally:
        # Tear down in reverse order of creation; each helper ignores None
        # handles and swallows close errors.
        await _safe_close_page(page)
        await _safe_close_context(context)
        await _safe_close_browser(browser)
        await _safe_stop_playwright(playwright)
def _aggregate_overviews(overviews: Sequence[AccountOverview]) -> AccountOverview:
    """Sum several account overviews into one "All Accounts" overview.

    Each monetary field is summed over the overviews where it is not
    ``None``. A sum equal to zero is reported as ``None``, so a genuine zero
    total cannot be told apart from "no data". The day-change percentage is
    ``day_change / total_value * 100`` when both are non-zero.

    Args:
        overviews: Per-account overviews to combine.

    Returns:
        A new overview whose account is a synthetic ``AGGREGATE`` summary.
    """
    summed_fields = (
        "total_value",
        "day_change",
        "cash",
        "settled_cash",
        "buying_power",
        "margin_balance",
    )
    sums = {name: Decimal("0") for name in summed_fields}
    for overview in overviews:
        for name in summed_fields:
            amount = getattr(overview, name)
            if amount is not None:
                sums[name] += amount

    # Zero sums are surfaced as None.
    reported = {name: (total if total != 0 else None) for name, total in sums.items()}

    pct: Optional[float] = None
    total_out = reported["total_value"]
    change_out = reported["day_change"]
    if total_out and change_out:
        try:
            pct = float((change_out / total_out) * 100)
        except (InvalidOperation, ZeroDivisionError):
            pct = None

    all_accounts = AccountSummary(
        id="AGGREGATE",
        label="All Accounts",
        type="AGGREGATE",
        last4=None,
        is_margin=False,
    )
    return AccountOverview(
        account=all_accounts,
        total_value=total_out,
        day_change=change_out,
        day_change_pct=pct,
        cash=reported["cash"],
        settled_cash=reported["settled_cash"],
        buying_power=reported["buying_power"],
        margin_balance=reported["margin_balance"],
    )
async def _safe_close_page(page) -> None:
if page is None:
return
try:
await page.close()
except Exception:
pass
async def _safe_close_context(context) -> None:
if context is None:
return
try:
await context.close()
except Exception:
pass
async def _safe_close_browser(browser) -> None:
if browser is None:
return
try:
await browser.close()
except Exception:
pass
async def _safe_stop_playwright(playwright) -> None:
if playwright is None:
return
try:
await playwright.stop()
except Exception:
pass

View File

@@ -0,0 +1,134 @@
from __future__ import annotations
from decimal import Decimal, InvalidOperation
from typing import Iterable, Optional
from ...core import AccountSummary, Envelope, ErrorType, PortfolioSnapshot, Position, fail, ok
from .positions_scraper import get_positions
def _aggregate_positions(positions: Iterable[Position]) -> tuple[list[Position], Optional[Decimal]]:
    """Merge positions that share a symbol and total their market value.

    Positions are grouped by upper-cased symbol ("UNKNOWN" when the symbol is
    empty). The first position of a group is copied. Later ones add their
    quantity, market value, cost basis and unrealized gain, overwrite price,
    gain percentage and asset type when they carry a value, fill in a missing
    description, and append their lots. Finally the gain percentage is
    recomputed from gain and cost basis wherever both are usable.

    Args:
        positions: Positions to merge; the input objects are not modified.

    Returns:
        ``(merged_positions, total_market_value)``. The total is ``None``
        when no input position had a market value.
    """
    additive_fields = ("quantity", "market_value", "cost_basis_total", "unrealized_gain")
    merged: dict[str, Position] = {}
    running_total = Decimal("0")
    saw_value = False

    for pos in positions:
        if pos.market_value is not None:
            running_total += pos.market_value
            saw_value = True

        key = pos.symbol.upper() if pos.symbol else "UNKNOWN"
        target = merged.get(key)
        if target is None:
            # First sighting: copy so later merging never touches the input.
            merged[key] = Position(
                symbol=pos.symbol,
                description=pos.description,
                asset_type=pos.asset_type,
                quantity=pos.quantity,
                market_price=pos.market_price,
                market_value=pos.market_value,
                cost_basis_total=pos.cost_basis_total,
                unrealized_gain=pos.unrealized_gain,
                unrealized_gain_pct=pos.unrealized_gain_pct,
                lots=list(pos.lots),
            )
            continue

        for name in additive_fields:
            incoming = getattr(pos, name)
            if incoming is None:
                continue
            current = getattr(target, name)
            setattr(target, name, incoming if current is None else current + incoming)

        # Latest non-empty value wins for these fields.
        if pos.market_price is not None:
            target.market_price = pos.market_price
        if pos.unrealized_gain_pct is not None:
            target.unrealized_gain_pct = pos.unrealized_gain_pct
        if pos.description and not target.description:
            target.description = pos.description
        if pos.asset_type:
            target.asset_type = pos.asset_type
        if pos.lots:
            target.lots.extend(pos.lots)

    for item in merged.values():
        basis = item.cost_basis_total
        if item.unrealized_gain is None or basis is None or basis == Decimal("0"):
            continue
        try:
            item.unrealized_gain_pct = float((item.unrealized_gain / basis) * 100)
        except (InvalidOperation, ZeroDivisionError):
            item.unrealized_gain_pct = None

    return list(merged.values()), (running_total if saw_value else None)
async def get_portfolio_snapshot(
    account: AccountSummary | str | None = None,
    *,
    aggregate_by_symbol: bool = True,
    include_non_equity: bool = False,
    debug: bool = False,
) -> Envelope[PortfolioSnapshot]:
    """Build a portfolio snapshot from the positions of one or all accounts.

    Args:
        account: Passed through to ``get_positions``.
        aggregate_by_symbol: When True, positions sharing a symbol are merged
            with ``_aggregate_positions``; otherwise they are listed as-is.
        include_non_equity: Passed through to ``get_positions``.
        debug: Passed through to ``get_positions``.

    Returns:
        A success envelope with the snapshot, or a failure envelope that
        repeats the error reported by ``get_positions``.
    """
    envelope = await get_positions(
        account=account,
        include_non_equity=include_non_equity,
        debug=debug,
    )
    if not envelope["success"]:
        return fail(
            envelope.get("error") or "Failed to retrieve positions.",
            envelope.get("error_type") or ErrorType.UNKNOWN,
            envelope.get("retryable", True),
        )

    positions = envelope["data"] or []

    if not aggregate_by_symbol:
        market_values = [p.market_value for p in positions if p.market_value is not None]
        # No market value at all means "unknown", not zero.
        total = sum(market_values, Decimal("0")) if market_values else None
        return ok(
            PortfolioSnapshot(
                equities=positions,
                total_value=total,
                count=len(positions),
            )
        )

    merged, merged_total = _aggregate_positions(positions)
    return ok(
        PortfolioSnapshot(
            equities=merged,
            total_value=merged_total,
            count=len(merged),
        )
    )

View File

@@ -0,0 +1,432 @@
from __future__ import annotations
import re
from decimal import Decimal, InvalidOperation
from typing import Any, Optional, Sequence
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import AccountSummary, Envelope, ErrorType, Lot, Position, fail, ok
from ...core.config import get_playwright_url, load_config
POSITIONS_URL = "https://client.schwab.com/app/accounts/positions/#/"
def _parse_decimal(value: str | None) -> Optional[Decimal]:
if not value:
return None
cleaned = value.strip()
if not cleaned or cleaned in {"-", "--"}:
return None
negative = False
if cleaned.startswith("(") and cleaned.endswith(")"):
negative = True
cleaned = (
cleaned.replace("$", "")
.replace(",", "")
.replace("(", "")
.replace(")", "")
.replace("", "-")
.replace("%", "")
.strip()
)
if not cleaned:
return None
try:
parsed = Decimal(cleaned)
if negative or parsed < 0:
parsed = -abs(parsed)
return parsed
except InvalidOperation:
return None
def _parse_float(value: str | None) -> Optional[float]:
    """Parse displayed numeric text into a float.

    Parsing is delegated to ``_parse_decimal``, so the same input formats are
    accepted.

    Returns:
        The number as a float, or ``None`` when the text cannot be parsed or
        converted.
    """
    parsed = _parse_decimal(value)
    try:
        return None if parsed is None else float(parsed)
    except (ValueError, InvalidOperation):
        return None
def _normalize_account_label(label: str) -> AccountSummary:
    """Derive an ``AccountSummary`` from an account label shown on the page.

    Whitespace is collapsed, a 3-4 digit group is taken as ``last4``, and the
    leading run of letters/spaces becomes the account type with spaces turned
    into underscores. The id is ``"<type>-<last4>"``, or just the type when no
    digits are found.
    """
    collapsed = re.sub(r"\s+", " ", label).strip()

    # Digits are searched with spaces removed so "6 0 4" reads as "604".
    digits = re.search(r"(\d{3,4})\b", collapsed.replace(" ", ""))
    last4 = digits.group(1)[-4:] if digits else None

    prefix = re.search(r"^[A-Za-z&'\- ]+", collapsed)
    if prefix:
        account_type = re.sub(r"\s+", "_", prefix.group(0).strip())
    else:
        account_type = "Account"

    account_id = account_type if not last4 else f"{account_type}-{last4}"
    return AccountSummary(
        id=account_id,
        label=collapsed,
        type=account_type,
        last4=last4,
        is_margin="margin" in collapsed.lower(),
    )
def _match_account(candidate: AccountSummary, requested: AccountSummary | str | None) -> bool:
if requested is None:
return True
if isinstance(requested, AccountSummary):
requested_values = {
requested.id.lower(),
requested.label.lower(),
}
if requested.last4:
requested_values.add(requested.last4.lower())
else:
lookup = requested.strip().lower()
requested_values = {lookup}
candidate_values = {candidate.id.lower(), candidate.label.lower()}
if candidate.last4:
candidate_values.add(candidate.last4.lower())
return bool(candidate_values & requested_values)
def classify_asset(symbol: str | None, description: str | None) -> str:
    """Classify a holding from its symbol and description.

    Both inputs are stripped and upper-cased before matching. Rules are
    applied in order and the first match wins:

    1. Symbol is ``CASH``, ``MMDA`` or ``SWEEP``: ``"CASH"``.
    2. Symbol is 1-5 letters: ``"ETF"`` or ``"MUTUAL_FUND"`` when the
       description says so, otherwise ``"EQUITY"``.
    3. Symbol is longer than 5 characters and contains a digit: ``"OPTION"``.
    4. Description mentions BOND, CD or TREASURY: ``"BOND"``.
    5. Description mentions CASH: ``"CASH"``.
    6. Description mentions ETF, then FUND: ``"ETF"`` / ``"MUTUAL_FUND"``.
    7. Anything else: ``"OTHER"``.

    Args:
        symbol: Ticker or pseudo-symbol; may be ``None`` or blank.
        description: Security description; may be ``None`` or blank.

    Returns:
        One of ``"CASH"``, ``"ETF"``, ``"MUTUAL_FUND"``, ``"EQUITY"``,
        ``"OPTION"``, ``"BOND"`` or ``"OTHER"``.
    """
    sym = symbol.strip().upper() if symbol else ""
    desc = (description or "").strip().upper()

    # The cash pseudo-symbols must be tested before the ticker-shaped rule:
    # all three are 1-5 letters, so that rule used to return "EQUITY" for
    # them and the symbol test further down could never be reached.
    # NOTE(review): this also classifies a listed equity whose ticker is
    # literally "CASH" as cash -- confirm that is acceptable.
    if sym in {"CASH", "MMDA", "SWEEP"}:
        return "CASH"

    if sym and re.fullmatch(r"[A-Z]{1,5}", sym):
        if "ETF" in desc:
            return "ETF"
        if any(kw in desc for kw in ("FUND", "MUTUAL")):
            return "MUTUAL_FUND"
        return "EQUITY"

    if sym and re.search(r"\d", sym) and len(sym) > 5:
        return "OPTION"
    if any(kw in desc for kw in ("BOND", "CD", "TREASURY")):
        return "BOND"
    if "CASH" in desc:
        return "CASH"
    if "ETF" in desc:
        return "ETF"
    if "FUND" in desc:
        return "MUTUAL_FUND"
    return "OTHER"
async def _evaluate_table(page) -> dict[str, Any] | None:
    """Extract the positions table from the page by running JavaScript in it.

    The script looks up ``#positionsDetails`` and resolves to ``null`` (``None``
    here) when it is absent. Otherwise it returns ``{headers, rows}``:

    - ``headers``: trimmed text of every ``thead tr th`` cell.
    - ``rows``: one entry per row whose class contains ``position-row``, with
      keys ``type`` (always ``'position'``), ``cells`` (trimmed ``td`` texts),
      ``lots`` (cell lists of the lot rows that follow it), ``symbol`` (the
      row's ``data-symbol`` attribute, or ``''``) and ``account``.

    ``account`` is the name captured from the most recent account header row:
    a non-position row with class ``highlight-row`` or ``border-top-dark``
    whose text contains "account panel". Rows without ``td`` cells are skipped.
    The lot-row test (class contains ``lot``/``sub``/``child``, or
    ``data-row-type`` contains ``lot``) is evaluated before the position-row
    test, and rows matching neither are ignored.
    """
    return await page.evaluate(
"""
() => {
const table = document.querySelector('#positionsDetails');
if (!table) {
return null;
}
const headers = Array.from(table.querySelectorAll('thead tr th')).map((th) =>
(th.innerText || th.textContent || '').trim()
);
const rowElements = Array.from(table.querySelectorAll('tbody tr'));
const rows = [];
let current = null;
let currentAccount = null;
const isLotRow = (row) => {
const klass = (row.className || '').toLowerCase();
if (klass.includes('lot') || klass.includes('sub') || klass.includes('child')) {
return true;
}
const dataRole = (row.getAttribute('data-row-type') || '').toLowerCase();
return dataRole.includes('lot');
};
const isPositionRow = (row) => {
const klass = (row.className || '').toLowerCase();
return klass.includes('position-row');
};
const isAccountHeader = (row) => {
const klass = (row.className || '').toLowerCase();
const text = (row.textContent || '').trim();
return !klass.includes('position-row') &&
(klass.includes('highlight-row') || klass.includes('border-top-dark')) &&
text.includes('account panel');
};
for (const row of rowElements) {
// Check if this is an account header row
if (isAccountHeader(row)) {
const text = row.textContent.trim();
// Extract account name from account panel text
const match = text.match(/account panel[\\s\\n]+([^\\n]+)/);
if (match) {
currentAccount = match[1].trim();
}
continue;
}
const cells = Array.from(row.querySelectorAll('td')).map((cell) =>
(cell.innerText || cell.textContent || '').trim()
);
if (!cells.length) {
continue;
}
if (isLotRow(row)) {
if (current) {
current.lots.push(cells);
}
} else if (isPositionRow(row)) {
// Extract symbol from data-symbol attribute
const symbol = row.getAttribute('data-symbol') || '';
current = {
type: 'position',
cells: cells,
lots: [],
symbol: symbol,
account: currentAccount
};
rows.push(current);
}
}
return { headers, rows };
}
"""
    )
def _map_row(headers: Sequence[str], cells: Sequence[str]) -> dict[str, str]:
result: dict[str, str] = {}
# Special handling: The table has columns in headers that don't correspond to cells
# Headers: ['', 'Symbol', 'Description', 'Qty', 'Price', ...]
# Cells: ['VANGUARD...', '192.5', '$328.17', ...]
# The first two headers (empty checkbox and Symbol) have no corresponding cells
# So: Cell 0 → 'Description', Cell 1 → 'Qty', Cell 2 → 'Price', etc.
# Find the symbol header index to know where the offset starts
symbol_header_idx = None
for idx, header in enumerate(headers):
key = header.strip().lower()
if 'symbol' in key and 'description' not in key:
symbol_header_idx = idx
break
# Calculate offset - typically 2 (empty column + symbol column)
offset = symbol_header_idx + 1 if symbol_header_idx is not None else 0
for idx, header in enumerate(headers):
# Normalize header: take first line, strip, lowercase
# Headers often have format "Label\nsort\nfieldname"
header_parts = header.strip().split('\n')
key = header_parts[0].strip().lower() if header_parts else ""
if not key:
key = f"column_{idx}"
# Map header to cell with offset
if idx < offset:
# These headers (empty, symbol) have no corresponding cells
value = ""
else:
cell_idx = idx - offset
value = cells[cell_idx].strip() if cell_idx < len(cells) else ""
result[key] = value
return result
def _parse_lots(lot_rows: Sequence[Sequence[str]]) -> list[Lot]:
    """Turn raw lot rows into ``Lot`` objects.

    Cells are read positionally: acquired date, quantity, cost basis, lot id.
    Empty rows are skipped, cells missing from a short row are treated as
    absent, and blank date / lot-id text becomes ``None``.
    """

    def _at(row: Sequence[str], index: int) -> str | None:
        # Positional lookup that tolerates short rows.
        return row[index] if len(row) > index else None

    parsed: list[Lot] = []
    for row in lot_rows:
        if not row:
            continue
        acquired = row[0].strip()
        qty = _parse_float(_at(row, 1))
        basis = _parse_decimal(_at(row, 2))
        lot_ref = row[3].strip() if len(row) > 3 else None
        parsed.append(
            Lot(
                acquired_date=acquired or None,
                quantity=qty,
                cost_basis=basis,
                lot_id=lot_ref or None,
            )
        )
    return parsed
def _row_to_position(row_map: dict[str, str], lots_rows: Sequence[Sequence[str]], symbol: str = "") -> Position:
    """Assemble a ``Position`` from a header-keyed row and its lot rows.

    Args:
        row_map: Cell text keyed by normalized header name.
        lots_rows: Raw cell lists of the lot rows belonging to this position.
        symbol: Symbol supplied by the caller (read from the row's
            ``data-symbol`` attribute).

    Each field is looked up under several alternative header names; the first
    non-empty value is the one that gets parsed.
    """

    def _first(*keys: str) -> str | None:
        # Mirrors an ``a or b or c`` chain: first truthy value, else the last one.
        found = None
        for key in keys:
            found = row_map.get(key)
            if found:
                return found
        return found

    description = _first('description', 'name', 'column_1') or ""
    price = _parse_decimal(_first('price', 'market price', 'last price'))
    qty = _parse_float(_first('quantity', 'qty'))
    value = _parse_decimal(_first('market value', 'mkt val'))
    basis = _parse_decimal(_first('cost basis', 'total cost'))
    gain = _parse_decimal(_first('gain/loss $', 'unrealized gain', 'gain/loss'))
    gain_pct = _parse_float(_first('gain/loss %', 'unrealized gain %'))
    kind = classify_asset(symbol, description)
    parsed_lots = _parse_lots(lots_rows)

    return Position(
        symbol=symbol or "",
        description=description or None,
        asset_type=kind,
        quantity=qty,
        market_price=price,
        market_value=value,
        cost_basis_total=basis,
        unrealized_gain=gain,
        unrealized_gain_pct=gain_pct,
        lots=parsed_lots,
    )
async def get_positions(
    account: AccountSummary | str | None = None,
    *,
    include_non_equity: bool = False,
    debug: bool = False,
) -> Envelope[list[Position]]:
    """Scrape the Schwab positions page and return the matching positions.

    Args:
        account: Optional account filter (an ``AccountSummary`` or a string
            compared against id, label and last4). ``None`` keeps all rows.
        include_non_equity: When false, only positions classified as
            ``EQUITY`` or ``ETF`` are returned.
        debug: Forwarded to the authenticated page navigation helper.

    Returns:
        ``ok(positions)`` on success, otherwise a ``fail(...)`` envelope:
        AUTHENTICATION when no session cookies are available or the page does
        not load, PARSING when the table or its headers are missing,
        VALIDATION when nothing matches, UNKNOWN for any other exception.
        Page, context, browser and Playwright are always closed on exit.
    """
    cookies = await ensure_cookies()
    if not cookies:
        return fail("Unable to establish Schwab session.", ErrorType.AUTHENTICATION, retryable=False)

    playwright_url = get_playwright_url(load_config())
    playwright = None
    browser = None
    context = None
    page = None
    try:
        playwright, browser = await connect(playwright_url)
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        loaded = await goto_with_auth_check(page, context, POSITIONS_URL, debug=debug)
        if not loaded:
            return fail("Failed to load Schwab positions page.", ErrorType.AUTHENTICATION, retryable=True)

        # Wait for the table, then scroll to the bottom and pause again before reading it.
        await page.wait_for_selector('#positionsDetails', timeout=45000)
        await page.wait_for_timeout(1000)
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        await page.wait_for_timeout(1500)

        table = await _evaluate_table(page)
        if not table:
            return fail("Unable to locate positions table.", ErrorType.PARSING, retryable=True)

        headers = [text.strip().lower() for text in table.get('headers') or []]
        if not headers:
            return fail("Positions table headers not found.", ErrorType.PARSING, retryable=True)

        default_types = {"EQUITY", "ETF"}
        matched: list[Position] = []
        for entry in table.get('rows', []):
            if entry.get('type') != 'position':
                continue

            cells = entry.get('cells') or []
            symbol = entry.get('symbol') or ""
            account_label = entry.get('account') or ""
            position = _row_to_position(_map_row(headers, cells), entry.get('lots') or [], symbol=symbol)

            if account is not None:
                # A row with no account label can never satisfy an account filter.
                if not account_label:
                    continue
                if not _match_account(_normalize_account_label(account_label), account):
                    continue

            if not (include_non_equity or position.asset_type in default_types):
                continue
            matched.append(position)

        if not matched:
            return fail("No positions matched the requested criteria.", ErrorType.VALIDATION, retryable=False)
        return ok(matched)
    except Exception as exc:
        return fail(str(exc), ErrorType.UNKNOWN, retryable=True)
    finally:
        await _safe_close_page(page)
        await _safe_close_context(context)
        await _safe_close_browser(browser)
        await _safe_stop_playwright(playwright)
async def _safe_close_page(page) -> None:
if page is None:
return
try:
await page.close()
except Exception:
pass
async def _safe_close_context(context) -> None:
if context is None:
return
try:
await context.close()
except Exception:
pass
async def _safe_close_browser(browser) -> None:
if browser is None:
return
try:
await browser.close()
except Exception:
pass
async def _safe_stop_playwright(playwright) -> None:
if playwright is None:
return
try:
await playwright.stop()
except Exception:
pass

View File

@@ -0,0 +1,239 @@
from typing import Optional, Tuple
import logging
async def find_report(page, debug: bool = False) -> Tuple[Optional[str], Optional[str]]:
    """Locate the Morningstar Equity Report link and date on the stock page.

    Three strategies are tried in order so that changes to the Schwab markup
    do not immediately break the lookup:

    1. The original ``div[id='Morningstar Equity Report'] a.sr-report-link``
       selector.
    2. A list of looser CSS selectors.
    3. A JavaScript scan of every ``a[href]`` for "morningstar" and "pdf".

    Args:
        page: Playwright page showing the stock research view.
        debug: When true, log progress and, if nothing is found, save a
            full-page screenshot to ``debug_morningstar_not_found.png``.

    Returns:
        Tuple of ``(url, date)`` where:
        - url: the link's ``href``, or the marker ``'__CLICK_TO_OPEN__'`` when
          strategy 1 finds the link but its ``href`` is empty (the link must
          then be clicked to obtain the document).
        - date: the report date text if found.
        ``(None, None)`` when no strategy finds a link.
    """
    logger = logging.getLogger(__name__)

    # Strategy 1: Original selector
    report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
    if await page.is_visible(report_link_selector):
        if debug:
            logger.debug("Found Morningstar report using original selector")
        report_link_element = page.locator(report_link_selector)
        await report_link_element.scroll_into_view_if_needed()
        url = await report_link_element.get_attribute("href")
        # Date element (escaped spaces)
        date_locator = page.locator(r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)")
        date_text = (await date_locator.inner_text()).strip() if await date_locator.count() > 0 else None
        # An empty href means the link only produces a URL when it is clicked.
        if not url:
            if debug:
                logger.debug("Link found but href is empty - this is a modern web component that generates blob URLs on click")
            # Return a special marker to indicate we need to click the link to get the URL
            return '__CLICK_TO_OPEN__', date_text
        return url, date_text

    # Strategy 2: Look for any link containing "morningstar" in research section
    if debug:
        logger.debug("Original selector failed, trying fallback selectors...")
    fallback_selectors = [
        "a.sr-report-link[href*='morningstar']",
        "a[href*='morningstar'][href*='pdf']",
        "#morningstar-section a.sr-report-link",
        "div[id*='Morningstar'] a",
    ]
    for selector in fallback_selectors:
        try:
            if await page.is_visible(selector, timeout=2000):
                if debug:
                    logger.debug(f"Found Morningstar report using fallback selector: {selector}")
                report_link_element = page.locator(selector).first
                await report_link_element.scroll_into_view_if_needed()
                url = await report_link_element.get_attribute("href")
                # Try to find date with various selectors
                date_text = None
                date_selectors = [
                    r"#Morningstar\ Equity\ Report > span:nth-child(3) > sdps-date-time > time > span:nth-child(2)",
                    "sdps-date-time time span",
                    "time span",
                ]
                for date_sel in date_selectors:
                    try:
                        date_locator = page.locator(date_sel)
                        if await date_locator.count() > 0:
                            date_text = (await date_locator.first.inner_text()).strip()
                            if date_text:
                                break
                    except Exception:
                        # Not a bare ``except``: task cancellation and
                        # KeyboardInterrupt must still propagate.
                        continue
                return url, date_text
        except Exception as e:
            if debug:
                logger.debug(f"Fallback selector {selector} failed: {e}")
            continue

    # Strategy 3: Use JavaScript to search for Morningstar links
    if debug:
        logger.debug("All CSS selectors failed, trying JavaScript search...")
    try:
        result = await page.evaluate("""
() => {
// Look for any link containing 'morningstar' and 'pdf'
const links = Array.from(document.querySelectorAll('a[href]'));
const morningstarLink = links.find(link =>
link.href.toLowerCase().includes('morningstar') &&
link.href.toLowerCase().includes('pdf')
);
if (morningstarLink) {
// Try to find associated date
let dateText = null;
const parent = morningstarLink.closest('[id*="Morningstar"]') || morningstarLink.parentElement;
if (parent) {
const timeElement = parent.querySelector('time');
if (timeElement) {
dateText = timeElement.textContent.trim();
}
}
return {
url: morningstarLink.href,
date: dateText
};
}
return null;
}
""")
        if result and result.get('url'):
            if debug:
                logger.debug(f"Found Morningstar report using JavaScript search: {result['url']}")
            return result['url'], result.get('date')
    except Exception as e:
        if debug:
            logger.debug(f"JavaScript search failed: {e}")

    # No report found
    if debug:
        logger.debug("No Morningstar report link found using any strategy")
        # Capture page state for debugging
        try:
            await page.screenshot(path="debug_morningstar_not_found.png", full_page=True)
            logger.debug("Saved debug screenshot to: debug_morningstar_not_found.png")
            # Log available elements for debugging
            page_info = await page.evaluate("""
() => {
return {
hasMorningstarSection: !!document.querySelector('#morningstar-section'),
hasMorningstarDiv: !!document.querySelector('div[id*="Morningstar"]'),
morningstarLinks: Array.from(document.querySelectorAll('a[href]'))
.filter(a => a.href.toLowerCase().includes('morningstar'))
.length,
allReportLinks: Array.from(document.querySelectorAll('a.sr-report-link')).length
}
}
""")
            logger.debug(f"Page state: {page_info}")
        except Exception as e:
            logger.debug(f"Failed to capture debug info: {e}")
    return None, None
async def download_report_as_bytes(page, url: str, debug: bool = False) -> Optional[bytes]:
    """Open the PDF in a new page and return its bytes via a data-URL conversion.

    The popup listener is registered with ``expect_page()`` *before* the action
    that opens the new page. An un-awaited ``wait_for_event(...)`` coroutine
    would only start listening after the click, by which time the ``page``
    event may already have fired.

    Args:
        page: The current Playwright page.
        url: Either a traditional URL or the ``'__CLICK_TO_OPEN__'`` marker,
            in which case the Morningstar report link is clicked and the URL
            of the page it opens is used.
        debug: Enable debug logging.

    Returns:
        PDF bytes if successful. ``None`` when ``url`` is empty, when the
        click-to-open flow fails, or when the bytes cannot be extracted.
        Errors while opening a traditional URL propagate to the caller.
    """
    import base64

    logger = logging.getLogger(__name__)
    if not url:
        return None

    async def _close_quietly(target) -> None:
        # Best-effort close of the popup; failures here must not mask the result.
        try:
            await target.close()
        except Exception:
            pass

    # Handle blob URL case (modern web component)
    if url == '__CLICK_TO_OPEN__':
        if debug:
            logger.debug("Handling blob URL - clicking link to open PDF")
        # Click the Morningstar report link to open the PDF
        report_link_selector = "div[id='Morningstar Equity Report'] a.sr-report-link"
        new_page = None
        try:
            # Start listening for the new page, then click.
            async with page.context.expect_page(timeout=15000) as new_page_info:
                await page.click(report_link_selector)
            new_page = await new_page_info.value
            if debug:
                logger.debug(f"New page opened with URL: {new_page.url}")
            # Wait for PDF to load
            await new_page.wait_for_load_state('load', timeout=10000)
            # The new page's own URL is what gets fetched below.
            blob_url = new_page.url
        except Exception as e:
            if debug:
                logger.debug(f"Error clicking link to open PDF: {e}")
            # Do not leak a popup that opened but failed to load.
            if new_page is not None:
                await _close_quietly(new_page)
            return None
    else:
        # Traditional URL case
        if debug:
            logger.debug(f"Opening PDF from traditional URL: {url}")
        async with page.context.expect_page() as new_page_info:
            await page.evaluate("url => window.open(url, '_blank')", url)
        new_page = await new_page_info.value
        await new_page.wait_for_load_state('load')
        blob_url = url

    # Fetch and convert to Base64 in browser context
    try:
        pdf_base64 = await new_page.evaluate(
"""
async (url) => {
const response = await fetch(url);
const blob = await response.blob();
return await new Promise((resolve) => {
const reader = new FileReader();
reader.onloadend = () => resolve(reader.result.split(',')[1]);
reader.readAsDataURL(blob);
});
}
""",
            blob_url,
        )
        if not pdf_base64:
            return None
        return base64.b64decode(pdf_base64)
    except Exception as e:
        if debug:
            logger.debug(f"Error extracting PDF bytes: {e}")
        return None
    finally:
        await _close_quietly(new_page)

View File

@@ -0,0 +1,80 @@
import re
from io import BytesIO
from typing import Dict
import pdfplumber
def clean_value(label: str, value: str) -> str:
    """Clean an extracted value according to the label it belongs to.

    - ``Morningstar Rating``: counts ``Q`` characters and returns ``"<n> stars"``.
    - ``Economic Moat``: reduced to ``Wide`` / ``Narrow`` / ``None`` when one
      of those words is present.
    - ``Fair Value`` / ``1-Star Price`` / ``5-Star Price``: the leading number
      with two decimals, when the value starts with one.
    - ``Assessment``: the first whitespace-separated word, or ``""`` when the
      value is empty or blank.
    - ``52-Week Range`` (also ``52-Week-Range``): em dashes become hyphens.

    Any other label, or a value that does not fit its rule, is returned as-is.
    """
    if label == "Morningstar Rating":
        return f"{value.count('Q')} stars"
    if label == "Economic Moat":
        for moat in ("Wide", "Narrow", "None"):
            if moat in value:
                return moat
        return value
    if label in ("Fair Value", "1-Star Price", "5-Star Price"):
        match = re.match(r"[\d,]+\.\d{2}", value)
        return match.group(0) if match else value
    if label == "Assessment":
        # Guard against an empty value: ``"".split()[0]`` raises IndexError.
        tokens = value.split()
        return tokens[0] if tokens else ""
    if label in ("52-Week-Range", "52-Week Range"):
        return value.replace('\u2014', '-')
    return value
def parse(pdf_content: bytes) -> Dict[str, str]:
    """Parse a Morningstar PDF report and extract key data points from page 3.

    Words on the third page are scanned for known labels (one to three words
    long). For each label found, the words on the same text line that start to
    the right of the label, within 100 units of its right edge, form the value.

    Args:
        pdf_content: Raw bytes of the PDF report.

    Returns:
        A dict keyed by the label names present in the report. Empty when the
        PDF has fewer than three pages or no label is found.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        # The data lives on page 3; a shorter document has nothing to extract.
        if len(pdf.pages) < 3:
            return {}
        page = pdf.pages[2]  # Page 3
        words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)

    labels = {
        "Fair Value", "1-Star Price", "5-Star Price", "Assessment",
        "Dividend Yield", "Capital Allocation", "52-Week Range", "Investment Style",
        "Economic Moat", "Morningstar Rating",
    }
    data: Dict[str, str] = {}
    for i, word in enumerate(words):
        # Combine words to form potential labels
        for j in range(i + 1, min(i + 4, len(words))):
            potential_label = " ".join(w['text'] for w in words[i:j])
            if potential_label not in labels:
                continue
            # Find the value to the right of the label
            label_end_x = words[j - 1]['x1']
            value_words = [
                w['text'] for w in words[j:]
                if abs(w['top'] - word['top']) < 2 and w['x0'] > label_end_x and w['x0'] - label_end_x < 100
            ]
            if value_words:
                value = " ".join(value_words)
                if potential_label == "Economic Moat":
                    # Only a recognised moat rating is recorded for this label.
                    moat = next((m for m in ("Wide", "Narrow", "None") if m in value), None)
                    if moat is not None:
                        data[potential_label] = moat
                else:
                    data[potential_label] = clean_value(potential_label, value)
            break  # Move to the next word once a label is found
    return data

View File

@@ -0,0 +1,490 @@
"""Phase 1: API-Based Data Extraction (EXPERIMENTAL - NON-FUNCTIONAL)
⚠️ **STATUS: NON-FUNCTIONAL DUE TO CORS RESTRICTIONS** ⚠️
This module was an attempt to extract equity data by calling Schwab's REST APIs directly.
While the APIs exist and were discovered via HAR analysis, they are NOT accessible from
this scraper due to fundamental browser security limitations (CORS).
## Why This Approach Failed:
1. **CORS (Cross-Origin Resource Sharing) Restrictions**:
- Research page: `client.schwab.com`, APIs: `ausgateway.schwab.com` (different origins)
- Browser blocks cross-origin fetch() calls even from page.evaluate()
- Results in "TypeError: Failed to fetch"
2. **Authentication Complexity**:
- Direct HTTP (aiohttp) with cookies: 401/403 errors
- Playwright page.request.fetch(): 401 errors (separate context)
- Likely requires dynamic tokens beyond cookies
## Recommendation:
**Use `phase1_scraper.py` (DOM scraping) instead**. It works reliably with authenticated
sessions and extracts all Phase 1 fields without CORS limitations.
## API Endpoints (discovered but inaccessible):
- Quote: /api/is.ResearchExperience/v1/quote
- Dividends: /api/is.ResearchExperience/v1/events/dividends
- Earnings: /api/is.ResearchExperience/v1/events/earnings
- Share Profile: /api/is.ResearchExperience/v1/shareprofile
"""
from typing import Dict, Any, Optional, List
import logging
import uuid
import aiohttp
from playwright.async_api import Page
from ...core import (
QuoteData, EnhancedDividends, EarningsData,
CalculatedMetrics, EquityPhase1Data
)
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float."""
if value is None:
return None
try:
if isinstance(value, str):
# Remove % sign if present
value = value.replace('%', '').strip()
return float(value)
except (ValueError, TypeError):
return None
def _parse_market_cap(value: str) -> Optional[str]:
"""Parse market cap string like '$3.03T' or '$462.11B'."""
if not value:
return None
# Keep the formatted string as-is for readability
return value.strip()
def _parse_volume(value: Any) -> Optional[int]:
"""Parse volume value."""
if value is None:
return None
try:
return int(float(value))
except (ValueError, TypeError):
return None
def parse_quote_api_response(data: Dict[str, Any]) -> QuoteData:
    """Build a ``QuoteData`` from a quote API response.

    Expected response layout::

        {
            "reference": {"symbol": "JNJ", "companyName": "...", "exchangeName": "NYSE"},
            "quote": {
                "lastPrice": 193.155, "netChange": 1.275, "netChangePercent": 0.66,
                "postMarketChange": 0.0, "postMarketPercentChange": 0.0,
                "tradeTime": "2025-10-22T17:06:42.008Z"
            },
            "regularQuote": {"lastPrice": 193.155, "lastSize": 100.0, ...}
        }

    Any exception raised while reading the response is logged at debug level
    and the fields populated up to that point are returned.
    """
    quote = QuoteData()
    try:
        reference = data.get('reference', {})
        headline = data.get('quote', {})
        regular = data.get('regularQuote', {})

        quote.exchange = reference.get('exchangeName')

        # Price, change and post-market figures come from the "quote" section.
        for attr, key in (
            ('price', 'lastPrice'),
            ('change', 'netChange'),
            ('change_percent', 'netChangePercent'),
            ('after_hours_change', 'postMarketChange'),
            ('after_hours_change_percent', 'postMarketPercentChange'),
        ):
            setattr(quote, attr, _parse_float(headline.get(key)))

        # Everything else is read from "regularQuote".
        for attr, key in (
            ('previous_close', 'closePrice'),
            ('open', 'openPrice'),
            ('bid', 'bidPrice'),
            ('ask', 'askPrice'),
        ):
            setattr(quote, attr, _parse_float(regular.get(key)))
        quote.volume = _parse_volume(regular.get('totalVolume'))
        for attr, key in (
            ('day_range_low', 'lowPrice'),
            ('day_range_high', 'highPrice'),
            ('week_52_low', 'priceLow52W'),
            ('week_52_high', 'priceHigh52W'),
        ):
            setattr(quote, attr, _parse_float(regular.get(key)))

        # Bid/ask size is recorded only when at least one side is non-zero.
        bid_size = regular.get('bidSize', 0)
        ask_size = regular.get('askSize', 0)
        if bid_size or ask_size:
            quote.bid_ask_size = f"{bid_size}/{ask_size}"

        average_volume = regular.get('averageVolumeDaily')
        if average_volume:
            quote.volume_vs_avg = average_volume
    except Exception as e:
        logger.debug(f"Error parsing quote API response: {e}")
    return quote
def parse_dividends_api_response(data: Dict[str, Any]) -> EnhancedDividends:
    """Build an ``EnhancedDividends`` from a dividends API response.

    Expected response layout::

        {
            "symbol": "JNJ",
            "currentAnnualDividendMethod": "IAD",
            "status": "DIVIDENDS_PAID_CURRENTLY",
            "dividends": [
                {
                    "dividendPayment": 1.3,
                    "dividendPayDate": "December 09, 2025",
                    "dividendExDate": "November 25, 2025",
                    "dividendFrequency": "Quarterly",
                    "annualDividendRate": 5.2,
                    "dividendYield": "2.71%"
                },
                ...
            ]
        }

    The first list entry fills the ``next_*``, frequency and annual fields;
    the second entry, when present, fills the ``previous_*`` fields. Errors
    are logged at debug level and the partially filled object is returned.
    """
    dividends = EnhancedDividends()
    try:
        history = data.get('dividends', [])
        if history:
            # The first entry is treated as the most recent dividend.
            newest = history[0]
            dividends.next_payment = _parse_float(newest.get('dividendPayment'))
            dividends.next_pay_date = newest.get('dividendPayDate')
            dividends.next_ex_date = newest.get('dividendExDate')
            dividends.frequency = newest.get('dividendFrequency')
            dividends.annual_rate = _parse_float(newest.get('annualDividendRate'))
            dividends.annual_yield = _parse_float(newest.get('dividendYield'))

            if len(history) > 1:
                prior = history[1]
                dividends.previous_payment = _parse_float(prior.get('dividendPayment'))
                dividends.previous_pay_date = prior.get('dividendPayDate')
                dividends.previous_ex_date = prior.get('dividendExDate')
    except Exception as e:
        logger.debug(f"Error parsing dividends API response: {e}")
    return dividends
def parse_earnings_api_response(data: Dict[str, Any]) -> EarningsData:
    """Build an ``EarningsData`` from an earnings API response.

    Expected response layout::

        {
            "symbol": "GOOGL",
            "fundamentals": {},
            "upcoming": {
                "earningsDate": "10/29/2025",
                "numberOfAnalysts": 43,
                "epsNonGaapEstimate": 2.18
            },
            "historical": [
                {
                    "epsGaapActual": 2.31, "epsNonGaapActual": 2.31,
                    "earningsDate": "07/23/2025", "numberOfAnalysts": 43,
                    "epsNonGaapEstimate": 2.18,
                    "epsNonGaapEstimateHigh": 2.42, "epsNonGaapEstimateLow": 2.0
                }
            ]
        }

    Analyst estimates come from ``upcoming`` when it is non-empty, otherwise
    from the first ``historical`` entry. Errors are logged at debug level and
    the partially filled object is returned.
    """
    earnings = EarningsData()

    def _apply_estimates(source: Dict[str, Any]) -> None:
        # Analyst coverage and EPS estimate fields share one layout in both sections.
        earnings.analysts_covering = source.get('numberOfAnalysts')
        earnings.consensus_estimate = _parse_float(source.get('epsNonGaapEstimate'))
        earnings.estimate_high = _parse_float(source.get('epsNonGaapEstimateHigh'))
        earnings.estimate_low = _parse_float(source.get('epsNonGaapEstimateLow'))

    try:
        upcoming = data.get('upcoming', {})
        historical = data.get('historical', [])
        fundamentals = data.get('fundamentals', {})

        if upcoming:
            earnings.next_announcement_date = upcoming.get('earningsDate')
            earnings.announcement_timing = upcoming.get('announcementTiming')
            _apply_estimates(upcoming)

        if historical:
            # The first historical entry is treated as the most recent report.
            newest = historical[0]
            earnings.eps_ttm = _parse_float(newest.get('epsNonGaapActual') or newest.get('epsGaapActual'))
            if not upcoming:
                _apply_estimates(newest)
            beat_amount = newest.get('epsNonGaapBeat')
            if beat_amount is not None:
                earnings.recent_beats = [{
                    'beat_amount': _parse_float(beat_amount),
                    'beat_percent': _parse_float(newest.get('epsNonGaapBeatPercent')),
                    'date': newest.get('earningsDate')
                }]

        if fundamentals:
            earnings.pe_ttm = _parse_float(fundamentals.get('peRatio'))
            earnings.forward_pe = _parse_float(fundamentals.get('forwardPE'))
            earnings.peg_ratio = _parse_float(fundamentals.get('pegRatio'))
            earnings.revenue_ttm = _parse_float(fundamentals.get('revenue'))
    except Exception as e:
        logger.debug(f"Error parsing earnings API response: {e}")
    return earnings
def parse_shareprofile_api_response(data: Dict[str, Any], quote: QuoteData) -> QuoteData:
    """Add the market cap from a share-profile API response to ``quote``.

    Expected response layout::

        {
            "companySummary": {
                "marketCapLabel": "Large Cap",
                "marketCapValue": "$462.11B",
                "companyEnterpriseValue": "$462.11B"
            },
            "shareInfo": [{"sharesOutstanding": "2.41B", "sharesHeld": "71.29%"}]
        }

    Only ``companySummary.marketCapValue`` is read. ``quote`` is modified in
    place and returned; errors are logged at debug level.
    """
    try:
        summary = data.get('companySummary', {})
        # Sector information is not read here; it may live in another endpoint.
        quote.market_cap = _parse_market_cap(summary.get('marketCapValue'))
    except Exception as e:
        logger.debug(f"Error parsing share profile API response: {e}")
    return quote
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Return the dividend payout ratio as a percentage rounded to 2 decimals.

    Formula: (Annual Dividend Rate / EPS TTM) × 100. ``None`` is returned when
    either input is missing or zero, or when EPS is not positive.
    """
    if not annual_dividend or not eps_ttm or not eps_ttm > 0:
        return None
    return round(annual_dividend / eps_ttm * 100, 2)
async def call_schwab_api(page: Page, url: str, debug: bool = False) -> Optional[Dict[str, Any]]:
    """Issue a GET to a Schwab API endpoint with ``fetch()`` inside the page.

    The request runs through ``page.evaluate()`` with ``credentials: 'include'``
    and two freshly generated correlation-id headers. NOTE: this module's
    docstring reports that such cross-origin fetches are blocked by CORS.

    Args:
        page: Playwright page with an authenticated session.
        url: API endpoint URL.
        debug: Enable debug logging.

    Returns:
        The parsed JSON payload, or ``None`` when the response is not OK, the
        fetch fails, or any exception is raised.
    """
    fetch_script = """
async ({url, correlatorId, clientCorrelId}) => {
try {
const response = await fetch(url, {
method: 'GET',
credentials: 'include', // Include cookies
headers: {
'accept': 'application/json',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'no-cache',
'content-type': 'application/json',
'correlatorid': correlatorId,
'pragma': 'no-cache',
'schwab-client-appid': 'AD00007800',
'schwab-client-channel': 'IO',
'schwab-client-correlid': clientCorrelId,
'schwab-resource-version': '2',
}
});
if (!response.ok) {
const errorText = await response.text();
return {
success: false,
status: response.status,
error: errorText
};
}
const data = await response.json();
return {
success: true,
status: response.status,
data: data
};
} catch (error) {
return {
success: false,
error: error.toString()
};
}
}
"""
    try:
        if debug:
            logger.debug(f"Calling API: {url}")

        # Fresh correlation ids for every call.
        arguments = {
            'url': url,
            'correlatorId': str(uuid.uuid4()),
            'clientCorrelId': str(uuid.uuid4()),
        }
        outcome = await page.evaluate(fetch_script, arguments)

        if outcome.get('success'):
            data = outcome.get('data')
            if debug and data:
                logger.debug(f"API response keys: {list(data.keys()) if isinstance(data, dict) else 'list'}")
            return data

        if debug:
            status = outcome.get('status', 'unknown')
            error = outcome.get('error', 'unknown error')
            logger.debug(f"API returned status {status}: {str(error)[:200]}")
        return None
    except Exception as e:
        if debug:
            logger.debug(f"Error calling API {url}: {e}")
        return None
async def extract_phase1_data_api(page: Page, ticker: str, debug: bool = False) -> EquityPhase1Data:
    """Collect Phase 1 data for ``ticker`` from Schwab's REST endpoints.

    The quote, dividends, earnings and share-profile endpoints are called one
    after another through ``call_schwab_api``. Each response is parsed into
    its dataclass, with an empty default used when a call returns nothing.
    NOTE: this module's docstring marks the API approach as non-functional
    because of CORS.

    Args:
        page: Playwright page with an authenticated session.
        ticker: Stock ticker symbol.
        debug: Enable debug logging.

    Returns:
        ``EquityPhase1Data`` holding the quote, dividends, earnings and
        calculated metrics.
    """
    if debug:
        logger.debug(f"Starting API-based Phase 1 extraction for {ticker}")

    base_url = "https://ausgateway.schwab.com/api/is.ResearchExperience/v1"

    # Calls are made sequentially via fetch() inside the page (see call_schwab_api).
    quote_data = await call_schwab_api(page, f"{base_url}/quote?symbols={ticker}&isComplex=true", debug)
    dividends_data = await call_schwab_api(page, f"{base_url}/events/dividends?symbol={ticker}", debug)
    earnings_data = await call_schwab_api(page, f"{base_url}/events/earnings?symbols={ticker}", debug)
    profile_data = await call_schwab_api(
        page, f"{base_url}/shareprofile?symbols={ticker}&includeSubsidiaries=true", debug
    )

    # The quote payload may arrive as a list (first item used) or as a single dict.
    if isinstance(quote_data, list) and quote_data:
        quote = parse_quote_api_response(quote_data[0])
    elif isinstance(quote_data, dict) and quote_data:
        quote = parse_quote_api_response(quote_data)
    else:
        quote = QuoteData()

    if profile_data:
        quote = parse_shareprofile_api_response(profile_data, quote)

    if dividends_data:
        dividends = parse_dividends_api_response(dividends_data)
    else:
        dividends = EnhancedDividends()

    if earnings_data:
        earnings = parse_earnings_api_response(earnings_data)
    else:
        earnings = EarningsData()

    # Derived metric: payout ratio needs both an annual rate and trailing EPS.
    calculated = CalculatedMetrics()
    if dividends.annual_rate and earnings.eps_ttm:
        calculated.payout_ratio = calculate_payout_ratio(dividends.annual_rate, earnings.eps_ttm)

    phase1_data = EquityPhase1Data(
        ticker=ticker,
        quote=quote,
        dividends=dividends,
        earnings=earnings,
        calculated_metrics=calculated,
    )

    if debug:
        logger.debug(f"API-based Phase 1 extraction complete for {ticker}")
        # Count populated fields via dataclasses.fields (slotted dataclasses have no __dict__).
        from dataclasses import fields as dataclass_fields
        quote_count = sum(1 for f in dataclass_fields(quote) if getattr(quote, f.name) is not None)
        div_count = sum(1 for f in dataclass_fields(dividends) if getattr(dividends, f.name) is not None)
        earn_count = sum(1 for f in dataclass_fields(earnings) if getattr(earnings, f.name) not in (None, []))
        logger.debug(f" Quote fields populated: {quote_count}/21")
        logger.debug(f" Dividend fields populated: {div_count}/9")
        logger.debug(f" Earnings fields populated: {earn_count}/13")
    return phase1_data

View File

@@ -0,0 +1,786 @@
"""Phase 1: Essential Dividend Metrics Implementation (DEPRECATED)
⚠️ DEPRECATED: This DOM-scraping based approach has been replaced by phase1_api_scraper.py
which uses Schwab's REST APIs directly. The API approach is more reliable, complete,
and maintainable than DOM scraping.
This module is kept for reference only. New code should use phase1_api_scraper.py.
Old approach extracts from DOM:
- Quote/Price Data (symbol bar)
- Enhanced Dividend Information (forward-looking dates)
- Core Earnings Metrics (EPS, forecasts)
- Basic Valuation Ratios (P/E, Forward P/E, PEG)
- Calculated Metrics (payout ratio)
"""
from typing import Dict, Any, Optional
import re
import logging
from ...core import QuoteData, EnhancedDividends, EarningsData, CalculatedMetrics, EquityPhase1Data
logger = logging.getLogger(__name__)
def _parse_float(value: Any) -> Optional[float]:
"""Safely parse a value to float, handling $ and % symbols."""
if value is None:
return None
try:
# Remove common formatting characters
clean = str(value).strip().replace('$', '').replace(',', '').replace('%', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return float(clean)
except (ValueError, AttributeError):
pass
return None
def _parse_int(value: Any) -> Optional[int]:
"""Safely parse a value to int."""
if value is None:
return None
try:
clean = str(value).strip().replace(',', '')
if clean and clean != '--' and clean.lower() != 'n/a':
return int(float(clean))
except (ValueError, AttributeError):
pass
return None
def _parse_volume(volume_str: str) -> Optional[int]:
"""Parse volume string like '8M', '22.4M', '1.2B' to integer."""
if not volume_str:
return None
try:
volume_str = volume_str.strip().upper()
multiplier = 1
if volume_str.endswith('K'):
multiplier = 1_000
volume_str = volume_str[:-1]
elif volume_str.endswith('M'):
multiplier = 1_000_000
volume_str = volume_str[:-1]
elif volume_str.endswith('B'):
multiplier = 1_000_000_000
volume_str = volume_str[:-1]
value = float(volume_str)
return int(value * multiplier)
except (ValueError, AttributeError):
return None
def _parse_revenue(revenue_str: str) -> Optional[float]:
"""Parse revenue string like '$92.15B', '$1.5M' to dollar value."""
if not revenue_str:
return None
try:
revenue_str = revenue_str.strip().upper().replace('$', '').replace(',', '')
multiplier = 1
if revenue_str.endswith('K'):
multiplier = 1_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('M'):
multiplier = 1_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('B'):
multiplier = 1_000_000_000
revenue_str = revenue_str[:-1]
elif revenue_str.endswith('T'):
multiplier = 1_000_000_000_000
revenue_str = revenue_str[:-1]
value = float(revenue_str)
return value * multiplier
except (ValueError, AttributeError):
return None
async def extract_quote_data(page, ticker: str = "", debug: bool = False) -> QuoteData:
    """Extract quote/price data from symbol bar.

    This is best-effort DOM scraping: any exception raised while talking to
    the page is swallowed (and logged only when ``debug`` is set), and the
    QuoteData populated so far is returned.

    Args:
        page: Playwright page object
        ticker: Stock ticker symbol (for pattern matching)
        debug: Enable debug logging

    Returns:
        QuoteData object with extracted fields; fields that were not found
        keep the values set by ``QuoteData()``.
    """
    quote = QuoteData()
    try:
        if debug:
            logger.debug("Starting quote data extraction...")

        # Wait for symbol bar content (look for key labels).
        # A `text=` engine prefix cannot be combined with a CSS selector in a
        # comma-separated list, so the text alternative is expressed with the
        # `:text()` pseudo-class that Playwright's CSS engine understands.
        try:
            await page.wait_for_selector(
                '#app-symbol-bar-component, :text("Previous close")',
                state='attached',
                timeout=15000,
            )
        except Exception:
            if debug:
                logger.debug("Timeout waiting for symbol bar selector, attempting to parse whatever is there")

        if debug:
            # The raw symbol bar text is only used for this log line, so the
            # extra page round-trip is skipped outside debug mode.
            # Extract symbol bar text content (fallback to body if specific component not found)
            symbol_bar_text = await page.evaluate('''
                () => {
                    const symbolBar = document.querySelector('#app-symbol-bar-component');
                    if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) return symbolBar.textContent;
                    // If specific component not found, try to find the container with market data
                    // Look for container with "Previous close"
                    const labels = Array.from(document.querySelectorAll('span, div, p'));
                    const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
                    if (prevCloseLabel) {
                        // Return the parent's text content (go up a few levels to capture all data)
                        let parent = prevCloseLabel.parentElement;
                        let count = 0;
                        while (parent && count < 8) {
                            if (parent.textContent.length > 300) return parent.textContent;
                            parent = parent.parentElement;
                            count++;
                        }
                    }
                    return document.body.textContent || '';
                }
            ''')
            logger.debug(f"Symbol bar text (first 500 chars): {symbol_bar_text[:500]}")

        # Extract structured data
        quote_data = await page.evaluate(r'''
            (ticker) => {
                const data = {};
                // Helper to get text content from page
                const getText = () => {
                    const symbolBar = document.querySelector('#app-symbol-bar-component');
                    // Verify it looks like the right component by checking for "Previous close"
                    if (symbolBar && symbolBar.textContent && symbolBar.textContent.includes('Previous close')) {
                        return symbolBar.textContent;
                    }
                    // Fallback logic
                    const labels = Array.from(document.querySelectorAll('span, div, p'));
                    const prevCloseLabel = labels.find(el => el.textContent && el.textContent.includes('Previous close'));
                    if (prevCloseLabel) {
                        let parent = prevCloseLabel.parentElement;
                        let count = 0;
                        while (parent && count < 8) {
                            if (parent.textContent.length > 300) return parent.textContent;
                            parent = parent.parentElement;
                            count++;
                        }
                    }
                    // Last resort: body text
                    return document.body.textContent || '';
                };
                const fullText = getText();
                // Try to find price in quote container first for accuracy
                const priceElement = document.querySelector('.symbol-quote-container, [data-testid="quote-price"]');
                if (priceElement) {
                    const priceText = priceElement.textContent || '';
                    const priceMatch = priceText.match(/\$([0-9,]+\.[0-9]+)/);
                    if (priceMatch) data.price = priceMatch[1].replace(',', '');
                } else {
                    // Fallback regex for price if element not found
                    // Look for price near top or just regex
                    const priceMatch = fullText.match(/\$([0-9,]+\.[0-9]{2})(\s|[+-]|$)/);
                    if (priceMatch) data.price = priceMatch[1].replace(',', '');
                }
                // After hours (using \s* for robustness)
                const afterHoursMatch = fullText.match(/After hours:?\s*\$([0-9,.]+)/i);
                if (afterHoursMatch) data.after_hours_price = afterHoursMatch[1].replace(',', '');
                const afterHoursChangeMatch = fullText.match(/After hours:.*?([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
                if (afterHoursChangeMatch) {
                    data.after_hours_change = afterHoursChangeMatch[1].replace('$', '').replace(',', '');
                    data.after_hours_change_percent = afterHoursChangeMatch[2];
                }
                // Bid/Ask (using \s* for robustness)
                const bidMatch = fullText.match(/Bid\s*\$([0-9,.]+)/i);
                if (bidMatch) data.bid = bidMatch[1].replace(',', '');
                const askMatch = fullText.match(/Ask\s*\$([0-9,.]+)/i);
                if (askMatch) data.ask = askMatch[1].replace(',', '');
                const bidAskSizeMatch = fullText.match(/Bid\/Ask Size\s*([0-9]+\/[0-9]+)/i);
                if (bidAskSizeMatch) data.bid_ask_size = bidAskSizeMatch[1];
                // Previous close and open (using \s* instead of \s+)
                const prevCloseMatch = fullText.match(/Previous close\s*\$([0-9,.]+)/i);
                if (prevCloseMatch) data.previous_close = prevCloseMatch[1].replace(',', '');
                const openMatch = fullText.match(/Today's open\s*\$([0-9,.]+)/i);
                if (openMatch) data.open = openMatch[1].replace(',', '');
                // Volume (using \s*)
                const volumeMatch = fullText.match(/Today's volume\s*([0-9.]+[KMB]?)/i);
                if (volumeMatch) data.volume = volumeMatch[1];
                const volumeVsAvgMatch = fullText.match(/Today's volume\s*[0-9.]+[KMB]?\s*(Above Avg\.|Below Avg\.|Average)/i);
                if (volumeVsAvgMatch) data.volume_vs_avg = volumeVsAvgMatch[1];
                // Day range
                // Pattern: "Today's range low $200.81 Today's range high $203.45" or similar
                // We'll look for "low $X" and "high $Y" appearing after "Today's range"
                const dayRangeMatch = fullText.match(/Today's range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
                if (dayRangeMatch) {
                    data.day_range_low = dayRangeMatch[1].replace(',', '');
                    data.day_range_high = dayRangeMatch[2].replace(',', '');
                }
                // 52-week range
                const weekRangeMatch = fullText.match(/52-week range.*?low\s*\$([0-9,.]+).*?high\s*\$([0-9,.]+)/i);
                if (weekRangeMatch) {
                    data.week_52_low = weekRangeMatch[1].replace(',', '');
                    data.week_52_high = weekRangeMatch[2].replace(',', '');
                }
                // Market cap (may be in Share Profile section)
                const marketCapMatch = fullText.match(/Market Cap\s*\$([0-9.]+[KMBT])/i);
                if (marketCapMatch) data.market_cap = marketCapMatch[1];
                // Change and change percent
                // Try specific formatted pattern first: TICKER $PRICE CHANGE CHANGE%
                // e.g. "JNJ $201.95 -1.03 -0.51%"
                const standardPattern = fullText.match(/\$([0-9,.]+)\s*([+-]?[0-9,.]+)\s*([+-]?[0-9.]+)%/);
                if (standardPattern) {
                    if (!data.price) data.price = standardPattern[1].replace(',', '');
                    data.change = standardPattern[2];
                    data.change_percent = standardPattern[3];
                }
                let percentMatch = null;
                if (ticker && !data.change_percent) {
                    // Match: TICKER$digits.digits{2}percent%
                    // Escape regex metacharacters so a ticker such as "BRK.B" is matched literally.
                    const safeTicker = ticker.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
                    // This is a JS string literal inside a raw Python string, so each regex
                    // backslash is written exactly twice (\\. becomes the regex \.).
                    const tickerPattern = new RegExp(safeTicker + '\\.?[\\s]*\\$([0-9,]+\\.[0-9]{2})[\\s]*([0-9.]+)%', 'i');
                    percentMatch = fullText.match(tickerPattern);
                    if (percentMatch) {
                        data.change_percent = percentMatch[2];
                    }
                }
                if (!data.change_percent) {
                    // Fallback: match any price+percent pattern with space
                    const fallbackMatch = fullText.match(/\$[0-9,.]+\s*([+-]?[0-9.]+)%/);
                    if (fallbackMatch) {
                        data.change_percent = fallbackMatch[1];
                    }
                }
                // Pattern 2: "+$1.23 (+0.45%)" or "-$1.23 (-0.45%)"
                let changeMatch = fullText.match(/([+-]\$[0-9,.]+)\s*\(([+-][0-9.]+)%\)/);
                // Pattern 3: "$193.08 +1.23 +0.64%" (price followed by change)
                if (!changeMatch) {
                    changeMatch = fullText.match(/\$[0-9,.]+\s*([+-][0-9,.]+)\s*([+-][0-9.]+)%/);
                }
                // Pattern 4: "Change: +1.23 (+0.64%)"
                if (!changeMatch) {
                    changeMatch = fullText.match(/Change:?\s*([+-][0-9,.]+)\s*\(([+-][0-9.]+)%\)/i);
                }
                if (changeMatch) {
                    data.change = changeMatch[1].replace('$', '').replace(',', '');
                    if (!data.change_percent) {
                        data.change_percent = changeMatch[2].replace(/[+]/g, '');
                    }
                }
                // Exchange - look for NYSE, NASDAQ, etc.
                const exchangeMatch = fullText.match(/\b(NYSE|NASDAQ|AMEX|OTC|BATS)\b/i);
                if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
                return data;
            }
        ''', ticker)

        # Parse and assign values
        quote.price = _parse_float(quote_data.get('price'))
        quote.change = _parse_float(quote_data.get('change'))
        quote.change_percent = _parse_float(quote_data.get('change_percent'))
        quote.after_hours_price = _parse_float(quote_data.get('after_hours_price'))
        quote.after_hours_change = _parse_float(quote_data.get('after_hours_change'))
        quote.after_hours_change_percent = _parse_float(quote_data.get('after_hours_change_percent'))
        quote.bid = _parse_float(quote_data.get('bid'))
        quote.ask = _parse_float(quote_data.get('ask'))
        quote.bid_ask_size = quote_data.get('bid_ask_size')
        quote.previous_close = _parse_float(quote_data.get('previous_close'))
        quote.open = _parse_float(quote_data.get('open'))
        quote.volume = _parse_volume(quote_data.get('volume', ''))
        quote.volume_vs_avg = quote_data.get('volume_vs_avg')
        quote.day_range_low = _parse_float(quote_data.get('day_range_low'))
        quote.day_range_high = _parse_float(quote_data.get('day_range_high'))
        quote.week_52_low = _parse_float(quote_data.get('week_52_low'))
        quote.week_52_high = _parse_float(quote_data.get('week_52_high'))
        quote.market_cap = quote_data.get('market_cap')

        # Try to extract sector and exchange from page header
        header_data = await page.evaluate(r'''
            () => {
                const data = {};
                // Look for sector near company name
                const sectorElement = document.querySelector('[data-testid="sector"], .sector');
                if (sectorElement) {
                    data.sector = sectorElement.textContent.replace('Sector', '').trim();
                } else {
                    // Manual search for text containing "Sector"
                    const spans = Array.from(document.querySelectorAll('span'));
                    const sectorSpan = spans.find(el => el.textContent && el.textContent.includes('Sector'));
                    if (sectorSpan) {
                        data.sector = sectorSpan.textContent.replace('Sector', '').replace(':', '').trim();
                    }
                }
                // Look for exchange near ticker
                const exchangeElement = document.querySelector('[data-testid="exchange"], .exchange');
                if (exchangeElement) {
                    data.exchange = exchangeElement.textContent.trim();
                }
                // Fallback: parse from page text
                const pageText = document.body.textContent || '';
                if (!data.sector) {
                    const sectorMatch = pageText.match(/Sector[:\s]+([A-Za-z\s&]+)/);
                    if (sectorMatch) data.sector = sectorMatch[1].trim();
                }
                if (!data.exchange) {
                    const exchangeMatch = pageText.match(/(NYSE|NASDAQ|AMEX|OTC)/i);
                    if (exchangeMatch) data.exchange = exchangeMatch[1].toUpperCase();
                }
                return data;
            }
        ''')
        quote.sector = header_data.get('sector')
        # The header lookup wins; the exchange found in the symbol bar text is
        # used only when the header lookup produced nothing.
        quote.exchange = header_data.get('exchange') or quote_data.get('exchange')

        if debug:
            logger.debug(f"Extracted quote data: price={quote.price}, volume={quote.volume}, "
                         f"52w_range={quote.week_52_low}-{quote.week_52_high}")
    except Exception as e:
        # Deliberately best-effort: return whatever was collected so far.
        if debug:
            logger.debug(f"Error extracting quote data: {e}")
    return quote
async def extract_enhanced_dividends(page, debug: bool = False) -> EnhancedDividends:
    """Scrape the ``#dividends`` panel for payment amounts, dates and yield.

    The panel is scrolled into view and its header is clicked before the
    panel text is matched against a set of label-based regular expressions.
    Any exception is swallowed (logged only when ``debug`` is set) and the
    object populated so far is returned.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EnhancedDividends object with extracted fields
    """
    header_selector = '#dividends h2, #dividends .sdps-panel__title, #dividends-togglechevron-button'
    scroll_script = '''
        () => {
            const dividendsPanel = document.querySelector('#dividends');
            if (dividendsPanel) {
                dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
            }
        }
    '''
    # Non-raw string on purpose: every regex backslash is doubled so that the
    # JavaScript source receives a single one.
    extract_script = '''
        () => {
            const data = {};
            const dividendsPanel = document.querySelector('#dividends');
            if (!dividendsPanel) return data;
            const fullText = dividendsPanel.textContent || '';
            // DEBUG: Return sample of text for debugging
            data._debug_text_sample = fullText.substring(0, 800);
            // Next dividend payment
            const nextPaymentMatch = fullText.match(/Next Dividend Payment\\s*\\$([0-9.]+)/i);
            if (nextPaymentMatch) data.next_payment = nextPaymentMatch[1];
            // Next pay date
            const nextPayDateMatch = fullText.match(/Next Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
            if (nextPayDateMatch) data.next_pay_date = nextPayDateMatch[1];
            // Next ex-date
            const nextExDateMatch = fullText.match(/Next Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
            if (nextExDateMatch) data.next_ex_date = nextExDateMatch[1];
            // Previous dividend payment
            const prevPaymentMatch = fullText.match(/Previous Dividend Payment\\s*\\$([0-9.]+)/i);
            if (prevPaymentMatch) data.previous_payment = prevPaymentMatch[1];
            // Previous pay date
            const prevPayDateMatch = fullText.match(/Previous Pay Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
            if (prevPayDateMatch) data.previous_pay_date = prevPayDateMatch[1];
            // Previous ex-date
            const prevExDateMatch = fullText.match(/Previous Ex-Date\\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/i);
            if (prevExDateMatch) data.previous_ex_date = prevExDateMatch[1];
            // Frequency
            const frequencyMatch = fullText.match(/Frequency\\s*(Quarterly|Monthly|Annual|Semi-Annual)/i);
            if (frequencyMatch) data.frequency = frequencyMatch[1];
            // Annual Dividend Rate (IAD)
            const annualRateMatch = fullText.match(/Annual Dividend Rate.*?\\$([0-9.]+)/i);
            if (annualRateMatch) data.annual_rate = annualRateMatch[1];
            // Annual Dividend Yield - appears after "Annual Dividend Yield" text
            // Text pattern: "Annual Dividend Yield...2.71%"
            const yieldMatch = fullText.match(/Annual Dividend Yield[\\s\\S]{0,300}?([0-9]+\\.[0-9]+)%/i);
            if (yieldMatch) data.annual_yield = yieldMatch[1];
            return data;
        }
    '''
    # Fields converted with _parse_float versus fields copied through as text.
    numeric_fields = ('next_payment', 'previous_payment', 'annual_rate', 'annual_yield')
    text_fields = (
        'next_pay_date',
        'next_ex_date',
        'previous_pay_date',
        'previous_ex_date',
        'frequency',
    )

    dividends = EnhancedDividends()
    try:
        if debug:
            logger.debug("Starting enhanced dividend extraction...")

        # The panel must exist before anything else is attempted.
        await page.wait_for_selector('#dividends', timeout=15000)

        # Bring the panel into the viewport and let the smooth scroll settle.
        await page.evaluate(scroll_script)
        await page.wait_for_timeout(1000)

        # The panel body is populated only after its header has been clicked.
        if debug:
            logger.debug("Clicking dividends panel header to trigger content load...")
        try:
            header = await page.query_selector(header_selector)
            if header:
                await header.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked dividends panel header successfully")
        except Exception as click_error:
            if debug:
                logger.debug(f"Could not click dividends header: {click_error}")

        # Extra settle time after the click, then read the panel.
        await page.wait_for_timeout(1000)
        scraped = await page.evaluate(extract_script)

        if debug and scraped.get('_debug_text_sample'):
            logger.debug(f"Dividend panel text sample: {scraped['_debug_text_sample']}")

        # Copy the scraped values onto the result object.
        for field_name in numeric_fields:
            setattr(dividends, field_name, _parse_float(scraped.get(field_name)))
        for field_name in text_fields:
            setattr(dividends, field_name, scraped.get(field_name))

        if debug:
            logger.debug(f"Extracted dividend data: next_payment={dividends.next_payment}, "
                         f"next_pay_date={dividends.next_pay_date}, annual_rate={dividends.annual_rate}")
    except Exception as e:
        if debug:
            logger.debug(f"Error extracting dividend data: {e}")
    return dividends
async def extract_earnings_data(page, debug: bool = False) -> EarningsData:
    """Extract earnings metrics and forecasts.

    The ``#expected-earnings`` panel is scrolled into view, its header is
    clicked, an optional "Show More" control is expanded, and the panel text
    (including shadow DOM content) is matched against label-based regular
    expressions. Any exception is swallowed (logged only when ``debug`` is
    set) and the object populated so far is returned.

    Args:
        page: Playwright page object
        debug: Enable debug logging

    Returns:
        EarningsData object with extracted fields
    """
    earnings = EarningsData()
    try:
        if debug:
            logger.debug("Starting earnings data extraction...")

        # Wait for earnings panel to load
        await page.wait_for_selector('#expected-earnings', timeout=15000)

        # Scroll to earnings panel
        await page.evaluate('''
            () => {
                const earningsPanel = document.querySelector('#expected-earnings');
                if (earningsPanel) {
                    earningsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
                }
            }
        ''')
        await page.wait_for_timeout(1000)

        # CRITICAL: Click on the panel header to trigger content loading
        # Schwab's panels don't auto-load - they need to be clicked
        if debug:
            logger.debug("Clicking earnings panel header to trigger content load...")
        try:
            earnings_header = await page.query_selector('#expected-earnings h2, #expected-earnings .sdps-panel__title, #expected-earnings-heading, #expected-earnings-togglechevron-button')
            if earnings_header:
                await earnings_header.click()
                await page.wait_for_timeout(2000)
                if debug:
                    logger.debug("Clicked earnings panel header successfully")
        except Exception as e:
            if debug:
                logger.debug(f"Could not click earnings header: {e}")

        # Wait for content to load after click
        await page.wait_for_timeout(1000)

        # Check for and click "Show More" if present
        try:
            # Use JS to find and click - most robust way
            clicked = await page.evaluate('''
                () => {
                    const panel = document.querySelector('#expected-earnings');
                    if (!panel) return false;
                    // Find any element with "Show More" text
                    const elements = Array.from(panel.querySelectorAll('a, button, span, div'));
                    const showMore = elements.find(el => el.textContent.trim().toLowerCase() === "show more");
                    if (showMore) {
                        showMore.click();
                        return true;
                    }
                    return false;
                }
            ''')
            if clicked:
                if debug:
                    logger.debug("found and clicked 'Show More' via JS")
                await page.wait_for_timeout(2000)
            elif debug:
                logger.debug("'Show More' not found or not clickable")
        except Exception as e:
            if debug:
                logger.debug(f"Error checking for Show More: {e}")

        # Extract earnings data.
        # NOTE: this is a raw Python string, so regex literals must use a
        # single backslash (\$ for a literal dollar sign). A doubled "\\$"
        # would mean "backslash, then end of input" and never match.
        earnings_data = await page.evaluate(r'''
            (debug) => {
                const data = {};
                // Helper to get text content including Shadow DOMs
                const getDeepText = (root) => {
                    if (!root) return '';
                    if (root.nodeType === Node.TEXT_NODE) return root.textContent;
                    if (root.nodeType === Node.ELEMENT_NODE && root.shadowRoot) {
                        return getDeepText(root.shadowRoot);
                    }
                    let text = '';
                    const children = root.childNodes;
                    for (let i = 0; i < children.length; i++) {
                        text += getDeepText(children[i]);
                    }
                    return text;
                };
                const earningsPanel = document.querySelector('#expected-earnings');
                let fullText = '';
                if (earningsPanel) {
                    fullText = getDeepText(earningsPanel);
                }
                // Fallback to body deep text if panel seems empty
                if (fullText.length < 500 || !fullText.includes("Announcement")) {
                    fullText = getDeepText(document.body);
                }
                // Next earnings announcement - robust regex checking for various patterns
                let nextAnnouncementMatch = fullText.match(/Next Earnings Announcement.*?([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
                if (!nextAnnouncementMatch) {
                    // Try alternate pattern: Announcement: 12/12/2025
                    nextAnnouncementMatch = fullText.match(/Announcement:?\s*([0-9]{2}\/[0-9]{2}\/[0-9]{4})/i);
                }
                if (nextAnnouncementMatch) data.next_announcement_date = nextAnnouncementMatch[1];
                // Announcement timing
                const timingMatch = fullText.match(/(Before Market Open|After Market Close)/i);
                if (timingMatch) data.announcement_timing = timingMatch[1];
                // Number of analysts
                const analystsMatch = fullText.match(/With ([0-9]+) analysts covering/i);
                if (analystsMatch) data.analysts_covering = analystsMatch[1];
                // Consensus estimate
                const consensusMatch = fullText.match(/consensus.*?estimate is \$([0-9.]+)/i);
                if (consensusMatch) data.consensus_estimate = consensusMatch[1];
                // High/Low estimates
                const highLowMatch = fullText.match(/high and low estimates are \$([0-9.]+) and \$([0-9.]+)/i);
                if (highLowMatch) {
                    data.estimate_high = highLowMatch[1];
                    data.estimate_low = highLowMatch[2];
                }
                // EPS TTM (multiple patterns)
                let epsMatch = fullText.match(/EPS\s*\(TTM\)\s*(?:Value)?\s*\$?([0-9.-]+)/i);
                if (!epsMatch) epsMatch = fullText.match(/Earnings per Share\s*\(?TTM\)?\s*(?:Value)?\s*\$?([0-9.-]+)/i);
                if (!epsMatch) epsMatch = fullText.match(/EPS\s+(?:Value)?\s*([0-9.-]+)/i);
                if (epsMatch) data.eps_ttm = epsMatch[1];
                // Revenue TTM
                let revenueMatch = fullText.match(/Revenue\s*\(TTM\)\s*(?:Value)?\s*\$([0-9.]+[KMBT]?)/i);
                if (!revenueMatch) revenueMatch = fullText.match(/Revenue\s+(?:Value)?\s*\$([0-9.]+[KMBT])/i);
                if (revenueMatch) data.revenue_ttm = revenueMatch[1];
                // P/E TTM (multiple patterns)
                let peMatch = fullText.match(/Price[\/\s]*Earnings\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (!peMatch) peMatch = fullText.match(/P[\/\s]*E\s*\(?TTM\)?\s*(?:Value)?\s*([0-9.]+)/i);
                if (!peMatch) peMatch = fullText.match(/PE Ratio\s*\(TTM\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (peMatch) data.pe_ttm = peMatch[1];
                // Forward P/E
                let forwardPeMatch = fullText.match(/Forward\s+P[\/\s]*E\s*(?:Value)?\s*([0-9.]+)/i);
                if (!forwardPeMatch) forwardPeMatch = fullText.match(/P[\/\s]*E\s*\(Forward\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (forwardPeMatch) data.forward_pe = forwardPeMatch[1];
                // PEG Ratio
                let pegMatch = fullText.match(/Price\s+to\s+Earnings[\/\s]*Growth\s*\(PEG\)\s*(?:Value)?\s*([0-9.]+)/i);
                if (!pegMatch) pegMatch = fullText.match(/PEG\s*Ratio?\s*(?:Value)?\s*([0-9.]+)/i);
                if (pegMatch) data.peg_ratio = pegMatch[1];
                // Recent beats/misses (simplified - just extract beat amounts)
                const beatMatches = fullText.matchAll(/Beat.*?\$([0-9.]+)/gi);
                data.recent_beats = [];
                for (const match of beatMatches) {
                    data.recent_beats.push(match[1]);
                }
                return data;
            }
        ''', debug)

        # Parse and assign values
        earnings.next_announcement_date = earnings_data.get('next_announcement_date')
        earnings.announcement_timing = earnings_data.get('announcement_timing')
        earnings.analysts_covering = _parse_int(earnings_data.get('analysts_covering'))
        earnings.consensus_estimate = _parse_float(earnings_data.get('consensus_estimate'))
        earnings.estimate_high = _parse_float(earnings_data.get('estimate_high'))
        earnings.estimate_low = _parse_float(earnings_data.get('estimate_low'))
        earnings.eps_ttm = _parse_float(earnings_data.get('eps_ttm'))
        earnings.revenue_ttm = _parse_revenue(earnings_data.get('revenue_ttm', ''))
        earnings.pe_ttm = _parse_float(earnings_data.get('pe_ttm'))
        earnings.forward_pe = _parse_float(earnings_data.get('forward_pe'))
        earnings.peg_ratio = _parse_float(earnings_data.get('peg_ratio'))

        # Store recent beats as list of dicts
        if earnings_data.get('recent_beats'):
            earnings.recent_beats = [
                {'beat_amount': _parse_float(beat)}
                for beat in earnings_data.get('recent_beats', [])
            ]

        if debug:
            logger.debug(f"Extracted earnings data: eps_ttm={earnings.eps_ttm}, "
                         f"pe_ttm={earnings.pe_ttm}, forward_pe={earnings.forward_pe}")
    except Exception as e:
        # Deliberately best-effort: return whatever was collected so far.
        if debug:
            logger.debug(f"Error extracting earnings data: {e}")
    return earnings
def calculate_payout_ratio(annual_dividend: Optional[float], eps_ttm: Optional[float]) -> Optional[float]:
    """Compute the dividend payout ratio as a percentage.

    Formula: (Annual Dividend Rate / EPS TTM) × 100, rounded to two decimals.

    Args:
        annual_dividend: Annual dividend rate per share
        eps_ttm: Earnings per share (trailing twelve months)

    Returns:
        The payout ratio in percent, or None when either input is missing or
        zero, or when EPS is not strictly positive.
    """
    # Guard clauses: both inputs must be present and non-zero, and EPS must be
    # strictly positive for the ratio to be meaningful.
    if not annual_dividend:
        return None
    if not eps_ttm:
        return None
    if not eps_ttm > 0:
        return None
    return round(annual_dividend / eps_ttm * 100, 2)
async def extract_phase1_data(page, debug: bool = False) -> EquityPhase1Data:
    """Run every Phase 1 extractor against the page and bundle the results.

    The ticker is read from the page URL, then the quote, dividend and
    earnings extractors are awaited one after another and the payout ratio is
    derived from their output.

    Args:
        page: Playwright page object
        debug: Enable debug output

    Returns:
        EquityPhase1Data object with all extracted data
    """
    # Non-raw string: the doubled backslash reaches JavaScript as "\/".
    ticker_script = '''
        () => {
            const url = window.location.href;
            const match = url.match(/stocks\\/([A-Z]+)/i);
            return match ? match[1].toUpperCase() : '';
        }
    '''

    if debug:
        logger.debug("Starting Phase 1 data extraction...")

    # Give the page a fixed three seconds to stabilize before reading it.
    await page.wait_for_timeout(3000)

    # The ticker symbol is taken from the "stocks/<TICKER>" URL segment.
    ticker = await page.evaluate(ticker_script)

    # Sections are scraped sequentially, in this order.
    quote_section = await extract_quote_data(page, ticker=ticker, debug=debug)
    dividend_section = await extract_enhanced_dividends(page, debug=debug)
    earnings_section = await extract_earnings_data(page, debug=debug)

    # Derived metrics need both the dividend rate and trailing EPS.
    metrics = CalculatedMetrics()
    if dividend_section.annual_rate and earnings_section.eps_ttm:
        metrics.payout_ratio = calculate_payout_ratio(
            dividend_section.annual_rate,
            earnings_section.eps_ttm,
        )

    result = EquityPhase1Data(
        ticker=ticker,
        quote=quote_section,
        dividends=dividend_section,
        earnings=earnings_section,
        calculated_metrics=metrics,
    )
    if debug:
        logger.debug(f"Phase 1 extraction complete for {ticker}")
    return result

View File

@@ -0,0 +1,977 @@
from typing import Dict, Any, Optional
from ...utils.logging import save_debug_artifact
def should_replace_dividend_value(existing_value: Optional[str], new_value: Optional[str]) -> bool:
    """
    Tell whether a freshly scraped dividend value should overwrite the stored one.

    Rules, checked in order:
    - A missing or whitespace-only new value never replaces anything
    - Any usable new value fills in a missing existing value
    - An existing value containing the "Show More" placeholder is replaced
    - Every other existing value is treated as good data and kept
    """
    new_is_usable = bool(new_value) and bool(str(new_value).strip())
    if not new_is_usable:
        return False
    if not existing_value:
        return True
    # An exact "Show More" is also a substring match, so one test covers both.
    return 'Show More' in str(existing_value)
async def extract_dividend_data(page, debug: bool = False) -> Dict[str, Any]:
"""
Extract dividend information from Schwab stock page.
Returns dictionary with dividend data fields.
"""
dividend_data: Dict[str, Any] = {}
try:
if debug:
print("DEBUG: Starting dividend data extraction...")
# Take initial screenshot to see page state
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_start.png", png)
print(f"DEBUG: Initial screenshot saved as {path}")
# Wait for the dividends section to load dynamically
if debug:
print("DEBUG: Waiting for dividends section to load...")
try:
# First wait for the dividends panel to appear
await page.wait_for_selector('#dividends', timeout=15000)
if debug:
print("DEBUG: #dividends panel found")
# Wait for dividend content to load dynamically
dividend_loaded = False
max_attempts = 5 # Reduced from 10 for faster tests
attempt = 0
while not dividend_loaded and attempt < max_attempts:
attempt += 1
if debug:
print(f"DEBUG: Attempt {attempt}/{max_attempts} - Waiting for dynamic dividend content...")
# Check if the dividends section has been populated with actual content
dividend_status = await page.evaluate('''
() => {
const result = { loaded: false, debug: {} };
// Look for the dividends panel content that should be populated
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
if (panelBody) {
const textContent = panelBody.textContent || '';
result.debug.panelBodyLength = textContent.length;
result.debug.panelBodySample = textContent.substring(0, 200);
// Check if the panel has been populated with actual dividend text
// (not just empty comments)
const hasRealContent = textContent.length > 50 && (
textContent.includes('Previous Dividend') ||
textContent.includes('Pay Date') ||
textContent.includes('Ex-Date') ||
textContent.includes('Frequency') ||
textContent.includes('Annual Dividend') ||
textContent.includes('$') ||
textContent.includes('%')
);
if (hasRealContent) {
result.loaded = true;
return result;
}
}
}
// Alternative: check for stock-dividends component
const stockDividends = document.querySelector('stock-dividends');
if (stockDividends) {
const text = stockDividends.textContent || '';
result.debug.stockDividendsLength = text.length;
result.debug.stockDividendsSample = text.substring(0, 100);
if (text.length > 20 && text.includes('$')) {
result.loaded = true;
return result;
}
}
// Alternative: check for any elements with dividend-related content
const allElements = document.querySelectorAll('#dividends *');
result.debug.totalElements = allElements.length;
for (let elem of allElements) {
const text = elem.textContent || '';
if (text.includes('Previous Dividend Payment') ||
(text.includes('$') && text.includes('.'))) {
result.loaded = true;
result.debug.foundInElement = elem.tagName + '.' + elem.className;
return result;
}
}
return result;
}
''')
if debug:
print(f"DEBUG: Dividend status: {dividend_status}")
dividend_loaded = dividend_status.get('loaded', False)
if dividend_loaded:
if debug:
print("DEBUG: Dynamic dividend content loaded!")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_content_loaded.png", png)
print(f"DEBUG: Screenshot after content loaded: {path}")
break
# Wait between attempts to allow for async loading
await page.wait_for_timeout(1000) # Reduced from 2000ms for faster tests
if not dividend_loaded:
if debug:
print("DEBUG: Basic dividend content did not auto-load - this suggests the page is not behaving as expected")
print("DEBUG: Expected behavior: Basic dividend info should be visible without clicking 'Show More'")
# Try to force a page refresh or trigger loading
print("DEBUG: Attempting to trigger dividend content loading...")
try:
# Try scrolling to the dividend section to trigger lazy loading
await page.evaluate('''
() => {
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
dividendsPanel.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
await page.wait_for_timeout(3000)
# Try clicking on the dividends panel header to ensure it's active
try:
dividends_header = await page.query_selector('#dividends h2, #dividends .sdps-panel__title')
if dividends_header:
await dividends_header.click()
await page.wait_for_timeout(2000)
print("DEBUG: Clicked on dividends panel header")
except:
pass
# Check one more time if content loaded
final_status = await page.evaluate('''
() => {
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
if (panelBody) {
const textContent = panelBody.textContent || '';
return {
length: textContent.length,
sample: textContent.substring(0, 500),
hasBasicData: textContent.includes('$') && (
textContent.includes('Previous') ||
textContent.includes('Pay Date') ||
textContent.includes('Ex-Date')
)
};
}
}
return { length: 0, sample: '', hasBasicData: false };
}
''')
if debug:
print(f"DEBUG: Final dividend panel status: {final_status}")
if final_status.get('hasBasicData'):
print("DEBUG: Basic dividend data now detected after manual triggering!")
dividend_loaded = True
# Extract the data immediately while it's loaded
immediate_extraction = await page.evaluate(r'''
() => {
const results = {};
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const panelBody = dividendsPanel.querySelector('.sdps-panel__body');
if (panelBody) {
const fullText = panelBody.textContent || '';
// Extract data using pattern matching from the full text
const patterns = {
'Previous Dividend Payment': /Previous Dividend Payment\s*\$([0-9]+\.[0-9]+)/,
'Previous Pay Date': /Previous Pay Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/,
'Previous Ex-Date': /Previous Ex-Date\s*([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/,
'Frequency': /Frequency\s*([A-Za-z]+)/,
'Annual Dividend Rate': /(?:Annual Dividend Rate|IAD).*?\$([0-9]+\.[0-9]+)/,
'Annual Dividend Yield': /([0-9]+\.[0-9]+%)(?=\s|Annual|$)/
};
for (const [field, pattern] of Object.entries(patterns)) {
const match = fullText.match(pattern);
if (match) {
if (field === 'Previous Dividend Payment' || field === 'Annual Dividend Rate') {
results[field] = '$' + match[1];
} else {
results[field] = match[1];
}
}
}
}
}
return results;
}
''')
if debug:
print(f"DEBUG: Immediate extraction results: {immediate_extraction}")
if immediate_extraction:
dividend_data.update(immediate_extraction)
# Clean up the Frequency field if it has extra text
if 'Frequency' in dividend_data and 'Quarterly' in dividend_data['Frequency']:
dividend_data['Frequency'] = 'Quarterly'
except Exception as e:
if debug:
print(f"DEBUG: Error during manual triggering: {e}")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_timeout.png", png)
print(f"DEBUG: Screenshot after timeout: {path}")
except Exception as e:
if debug:
print(f"DEBUG: Error waiting for dividend content: {e}")
# Check for dividend grid directly without clicking
if debug:
print("DEBUG: Checking for #dividend-grid...")
dividend_grid_found = False
try:
await page.wait_for_selector('#dividend-grid', timeout=10000)
dividend_grid_found = True
if debug:
print("DEBUG: #dividend-grid found!")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_grid_found.png", png)
print(f"DEBUG: Screenshot with dividend grid: {path}")
except:
if debug:
print("DEBUG: #dividend-grid not found initially")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_no_grid.png", png)
print(f"DEBUG: Screenshot without grid: {path}")
# Try to scroll to the dividend section to ensure it's in view
if debug:
print("DEBUG: Scrolling to stock-dividends component...")
try:
await page.evaluate('''
() => {
const stockDividends = document.querySelector('stock-dividends');
if (stockDividends) {
stockDividends.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
}
''')
await page.wait_for_timeout(3000)
if debug:
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_after_scroll.png", png)
print(f"DEBUG: Screenshot after scroll: {path}")
# Check again for dividend grid after scrolling
try:
await page.wait_for_selector('#dividend-grid', timeout=5000)
dividend_grid_found = True
if debug:
print("DEBUG: #dividend-grid found after scroll!")
png = await page.screenshot(full_page=True)
path = save_debug_artifact("debug_dividend_grid_after_scroll.png", png)
print(f"DEBUG: Screenshot with grid after scroll: {path}")
except:
if debug:
print("DEBUG: #dividend-grid still not found after scroll")
except Exception as e:
if debug:
print(f"DEBUG: Error during scroll attempt: {e}")
# Common dividend section selectors used by financial websites
dividend_selectors = [
'#dividend-grid', # Primary target based on user feedback
'stock-dividends', # Secondary target - the web component
'#dividend-section',
'#dividends-section',
'.dividend-summary',
'.dividends-summary',
'div[data-testid*="dividend"]',
'div[aria-label*="dividend"]',
'[class*="dividend"]',
'section:has-text("Dividend")',
'div:has-text("Previous Dividend Payment")'
]
# Try to find dividend section
dividend_section = None
for selector in dividend_selectors:
try:
if await page.is_visible(selector):
dividend_section = selector
if debug:
print(f"DEBUG: Found dividend section with selector: {selector}")
break
except:
continue
if not dividend_section:
if debug:
print("DEBUG: No dividend section found, trying broader search...")
# In debug mode, capture the page content to help identify selectors
page_content = await page.content()
path_html = save_debug_artifact("debug_dividend_page.html", page_content)
print(f"DEBUG: Page HTML saved to {path_html} for analysis")
# Also save a screenshot to see the visual layout
png = await page.screenshot(full_page=True)
path_png = save_debug_artifact("debug_dividend_page.png", png)
print(f"DEBUG: Page screenshot saved to {path_png}")
# Fallback: look for dividend-related text anywhere on page
dividend_text_exists = await page.evaluate('''
() => {
const text = document.body.innerText.toLowerCase();
return text.includes('dividend') || text.includes('ex-date') || text.includes('pay date') || text.includes('previous dividend') || text.includes('iad');
}
''')
if debug:
print(f"DEBUG: Dividend-related text found on page: {dividend_text_exists}")
# Try scrolling down to reveal more content
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
await page.wait_for_timeout(2000)
# Extract all text content that might contain dividend info
dividend_related_text = await page.evaluate('''
() => {
const text = document.body.innerText;
const lines = text.split('\n');
const dividendLines = lines.filter(line => {
const lower = line.toLowerCase();
return lower.includes('dividend') || lower.includes('ex-date') ||
lower.includes('pay date') || lower.includes('previous') ||
lower.includes('iad') || lower.includes('frequency') ||
lower.includes('quarterly') || lower.includes('$0.26') ||
lower.includes('0.4865%') || lower.includes('$1.04') ||
lower.includes('annual dividend') || lower.includes('yield');
});
return dividendLines;
}
''')
print(f"DEBUG: Found dividend-related text lines: {dividend_related_text}")
# Try a more comprehensive search for dividend data
all_dividend_info = await page.evaluate('''
() => {
// Look for elements containing common dividend field names
const fieldNames = [
'Previous Dividend Payment', 'Next Dividend Payment',
'Previous Pay Date', 'Next Pay Date',
'Previous Ex-Date', 'Next Ex-Date', 'Ex-Date',
'Frequency', 'Annual Dividend Rate', 'IAD',
'Annual Dividend Yield', 'Dividend Yield'
];
const results = {};
fieldNames.forEach(fieldName => {
// Search for elements containing this field name
const elements = Array.from(document.querySelectorAll('*')).filter(el =>
el.textContent && el.textContent.includes(fieldName) &&
el.children.length === 0 // Text nodes only
);
elements.forEach(el => {
// Look for value in nearby elements
const parent = el.parentElement;
if (parent) {
const siblings = Array.from(parent.children);
const currentIndex = siblings.indexOf(el);
// Check next siblings for values
for (let i = currentIndex + 1; i < siblings.length; i++) {
const sibling = siblings[i];
const text = sibling.textContent.trim();
if (text && text !== fieldName && text.length > 0 && text.length < 50) {
results[fieldName] = text;
break;
}
}
// Check same element for values after the field name
const fullText = el.textContent;
const fieldIndex = fullText.indexOf(fieldName);
if (fieldIndex >= 0) {
const afterField = fullText.substring(fieldIndex + fieldName.length).trim();
if (afterField && afterField.length > 0 && afterField.length < 50) {
results[fieldName] = afterField;
}
}
}
});
});
return results;
}
''')
print(f"DEBUG: Comprehensive dividend search results: {all_dividend_info}")
# If we found data in the comprehensive search, use it only if we don't already have good data
if all_dividend_info:
for field, value in all_dividend_info.items():
if value and value.strip():
existing_value = dividend_data.get(field, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field] = value.strip()
if debug:
print(f"DEBUG: Added dividend field from comprehensive search: {field} = {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring comprehensive search value: {value})")
if not dividend_text_exists:
if debug:
print("DEBUG: No dividend-related content found on page")
return dividend_data
# Use body as fallback section for broad search
dividend_section = 'body'
if debug:
print("DEBUG: Using body as dividend section for broad search")
# If we found the dividend grid, use specific selectors based on user feedback
if dividend_section == '#dividend-grid':
if debug:
print("DEBUG: Using specific dividend grid selectors...")
try:
# First check if dividend grid is actually present and populated
grid_status = await page.evaluate('''
() => {
const dividendGrid = document.querySelector('#dividend-grid');
if (!dividendGrid) return { found: false, message: 'No #dividend-grid element found' };
const textContent = dividendGrid.textContent || '';
const hasContent = textContent.trim().length > 50;
const childCount = dividendGrid.children.length;
return {
found: true,
hasContent,
textLength: textContent.length,
childCount,
preview: textContent.substring(0, 200),
message: `Grid found with ${childCount} children, ${textContent.length} chars`
};
}
''')
if debug:
print(f"DEBUG: Dividend grid status: {grid_status}")
# Extract dividend data using improved selectors
specific_dividend_data = await page.evaluate(r'''
() => {
const results = {};
// Check if dividend grid exists and has content
const dividendGrid = document.querySelector('#dividend-grid');
if (dividendGrid) {
const allGridText = dividendGrid.textContent || '';
const lines = allGridText.split('\n').map(line => line.trim()).filter(line => line.length > 0);
// Try structured approach first - look for rows/cells
const dividendRows = dividendGrid.querySelectorAll('div[class*="row"], tr, .dividend-row, div:has(div)');
dividendRows.forEach((row, rowIndex) => {
const rowText = row.textContent || '';
// Look for dividend payment info
if (rowText.includes('Dividend Payment') || (rowText.includes('Previous') && rowText.includes('$'))) {
const amountMatch = rowText.match(/\$[0-9]+\.[0-9]+/);
if (amountMatch && !results['Previous Dividend Payment']) {
results['Previous Dividend Payment'] = amountMatch[0];
}
// Look for dates in the same row
const dateMatches = rowText.match(/([A-Za-z]+ [0-9]{1,2}, [0-9]{4})/g);
if (dateMatches) {
if (dateMatches.length >= 1 && !results['Previous Pay Date']) results['Previous Pay Date'] = dateMatches[0];
if (dateMatches.length >= 2 && !results['Previous Ex-Date']) results['Previous Ex-Date'] = dateMatches[1];
}
}
});
// Fallback: Parse all lines systematically
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const nextLine = i + 1 < lines.length ? lines[i + 1] : '';
// Match dividend payment
if ((line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) && !results['Previous Dividend Payment']) {
const amountPattern = /\$[0-9]+\.[0-9]+/;
let amount = line.match(amountPattern) || nextLine.match(amountPattern);
if (amount) results['Previous Dividend Payment'] = amount[0];
}
// Match pay date
if (line.includes('Pay Date') && !results['Previous Pay Date']) {
const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/;
let date = line.match(datePattern) || nextLine.match(datePattern);
if (date) results['Previous Pay Date'] = date[0];
}
// Match ex-date
if (line.includes('Ex-Date') && !results['Previous Ex-Date']) {
const datePattern = /[A-Za-z]{3,9} [0-9]{1,2}, [0-9]{4}/;
let date = line.match(datePattern) || nextLine.match(datePattern);
if (date) results['Previous Ex-Date'] = date[0];
}
// Match frequency
if (line.includes('Frequency') && !results['Frequency']) {
const freqLine = line + ' ' + nextLine;
if (freqLine.toLowerCase().includes('quarterly')) results['Frequency'] = 'Quarterly';
else if (freqLine.toLowerCase().includes('monthly')) results['Frequency'] = 'Monthly';
else if (freqLine.toLowerCase().includes('annual')) results['Frequency'] = 'Annual';
else if (freqLine.toLowerCase().includes('semi')) results['Frequency'] = 'Semi-Annual';
}
// Match annual dividend rate
if ((line.includes('Annual Dividend Rate') || line.includes('IAD')) && !results['Annual Dividend Rate']) {
const amountPattern = /\$[0-9]+\.[0-9]+/;
let amount = line.match(amountPattern) || nextLine.match(amountPattern);
if (amount) results['Annual Dividend Rate'] = amount[0];
}
// Match annual dividend yield
if (line.includes('Annual Dividend Yield') && !results['Annual Dividend Yield']) {
const percentPattern = /[0-9]+\.[0-9]+%/;
let percent = line.match(percentPattern) || nextLine.match(percentPattern);
if (percent) results['Annual Dividend Yield'] = percent[0];
}
}
}
return results;
}
''')
if debug:
print(f"DEBUG: Specific dividend grid extraction results: {specific_dividend_data}")
# Add the extracted data to dividend_data only if we don't already have good data
if specific_dividend_data:
for field, value in specific_dividend_data.items():
existing_value = dividend_data.get(field, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field] = value
if debug:
print(f"DEBUG: Updated {field} from specific extraction: {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring specific extraction value: {value})")
except Exception as e:
if debug:
print(f"DEBUG: Error in specific dividend grid extraction: {e}")
# Extract dividend data using the correct structure from gemini analysis
if debug:
print("DEBUG: Extracting dividend data from dividend-grid structure...")
# First try to extract data from the dynamically loaded dividend content
try:
dividend_dynamic_data = await page.evaluate(r'''
() => {
const results = {};
// Strategy 1: Look for any dividend grid structure that was loaded
const dividendGrid = document.querySelector('#dividend-grid');
if (dividendGrid) {
const rows = dividendGrid.querySelectorAll('div.sdps-row, .row');
for (let row of rows) {
const cells = row.querySelectorAll('div[class*="col-"]');
if (cells.length >= 2) {
const label = cells[0].textContent.trim();
const value = cells[1].textContent.trim();
// Map the labels to our expected field names
if (label.includes('Previous Dividend Payment') || label.includes('Dividend Payment')) {
results['Previous Dividend Payment'] = value;
} else if (label.includes('Previous Pay Date') || label.includes('Pay Date')) {
results['Previous Pay Date'] = value;
} else if (label.includes('Previous Ex-Date') || label.includes('Ex-Date')) {
results['Previous Ex-Date'] = value;
} else if (label.includes('Frequency')) {
results['Frequency'] = value;
} else if (label.includes('Annual Dividend Rate') || label.includes('IAD')) {
results['Annual Dividend Rate'] = value;
} else if (label.includes('Annual Dividend Yield')) {
results['Annual Dividend Yield'] = value;
}
}
}
if (Object.keys(results).length > 0) {
return results;
}
}
// Strategy 2: Look for stock-dividends component content
const stockDividends = document.querySelector('stock-dividends');
if (stockDividends) {
const allText = stockDividends.textContent || '';
const lines = allText.split('\n').map(line => line.trim()).filter(line => line);
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const nextLine = i + 1 < lines.length ? lines[i + 1] : '';
if (line.includes('Previous Dividend Payment') || line.includes('Dividend Payment')) {
const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/);
if (amountMatch) results['Previous Dividend Payment'] = amountMatch[0];
} else if (line.includes('Pay Date')) {
const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/);
if (dateMatch) results['Previous Pay Date'] = dateMatch[0];
} else if (line.includes('Ex-Date')) {
const dateMatch = (line + ' ' + nextLine).match(/[A-Za-z]+ [0-9]{1,2}, [0-9]{4}/);
if (dateMatch) results['Previous Ex-Date'] = dateMatch[0];
} else if (line.includes('Frequency')) {
if (line.toLowerCase().includes('quarterly') || nextLine.toLowerCase().includes('quarterly')) {
results['Frequency'] = 'Quarterly';
} else if (line.toLowerCase().includes('monthly') || nextLine.toLowerCase().includes('monthly')) {
results['Frequency'] = 'Monthly';
} else if (line.toLowerCase().includes('annual') || nextLine.toLowerCase().includes('annual')) {
results['Frequency'] = 'Annual';
}
} else if (line.includes('Annual Dividend Rate') || line.includes('IAD')) {
const amountMatch = (line + ' ' + nextLine).match(/\$[0-9]+\.[0-9]+/);
if (amountMatch) results['Annual Dividend Rate'] = amountMatch[0];
} else if (line.includes('Annual Dividend Yield')) {
const percentMatch = (line + ' ' + nextLine).match(/[0-9]+\.[0-9]+%/);
if (percentMatch) results['Annual Dividend Yield'] = percentMatch[0];
}
}
if (Object.keys(results).length > 0) {
return results;
}
}
// Strategy 3: Look within entire dividends panel for any structured content
const dividendsPanel = document.querySelector('#dividends');
if (dividendsPanel) {
const allElements = dividendsPanel.querySelectorAll('*');
for (let elem of allElements) {
const text = elem.textContent || '';
// Look for dollar amounts near dividend-related text
if (text.includes('Previous Dividend Payment') || text.includes('Dividend Payment')) {
const parent = elem.parentElement;
if (parent) {
const siblings = Array.from(parent.children);
const currentIndex = siblings.indexOf(elem);
// Check next siblings for values
for (let j = currentIndex + 1; j < siblings.length; j++) {
const sibling = siblings[j];
const siblingText = sibling.textContent.trim();
const amountMatch = siblingText.match(/\$[0-9]+\.[0-9]+/);
if (amountMatch) {
results['Previous Dividend Payment'] = amountMatch[0];
break;
}
}
}
}
// Similar logic for other fields...
// (truncated for brevity but would include Pay Date, Ex-Date, etc.)
}
}
return results;
}
''')
if debug:
print(f"DEBUG: Dynamic dividend extraction results: {dividend_dynamic_data}")
if dividend_dynamic_data:
for field, value in dividend_dynamic_data.items():
existing_value = dividend_data.get(field, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field] = value
if debug:
print(f"DEBUG: Updated {field} from dynamic extraction: {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field}: {existing_value} (ignoring dynamic extraction value: {value})")
except Exception as e:
if debug:
print(f"DEBUG: Error in dynamic dividend extraction: {e}")
# Define dividend fields and their possible selectors as fallback
dividend_fields = {
'Previous Dividend Payment': [
'#dividend-grid div:has-text("Previous Dividend Payment") ~ div',
'#dividend-grid div:has-text("Dividend Payment") ~ div',
'#dividends span:has-text("Previous Dividend Payment") + span',
'#dividends div:has-text("Previous Dividend Payment") + div',
'#dividends *:has-text("Previous Dividend Payment") ~ *',
'stock-dividends span:has-text("Previous Dividend Payment") + span',
'stock-dividends div:has-text("Previous Dividend Payment") + div',
'span:has-text("Previous Dividend Payment") + span',
'div:has-text("Previous Dividend Payment") + div',
'*:has-text("Previous Dividend Payment") ~ *',
'span:has-text("Next Dividend Payment") + span',
'div:has-text("Next Dividend Payment") + div',
'*:has-text("Next Dividend Payment") ~ *',
'[data-field="dividend-payment"]',
'.dividend-payment'
],
'Previous Pay Date': [
'#dividend-grid div:has-text("Previous Pay Date") ~ div',
'#dividend-grid div:has-text("Pay Date") ~ div',
'#dividends span:has-text("Previous Pay Date") + span',
'#dividends div:has-text("Previous Pay Date") + div',
'#dividends *:has-text("Previous Pay Date") ~ *',
'stock-dividends span:has-text("Previous Pay Date") + span',
'stock-dividends div:has-text("Previous Pay Date") + div',
'span:has-text("Previous Pay Date") + span',
'div:has-text("Previous Pay Date") + div',
'*:has-text("Previous Pay Date") ~ *',
'span:has-text("Next Pay Date") + span',
'div:has-text("Next Pay Date") + div',
'*:has-text("Next Pay Date") ~ *',
'*:has-text("Pay Date") ~ *',
'[data-field="pay-date"]',
'.pay-date'
],
'Previous Ex-Date': [
'#dividend-grid div:has-text("Previous Ex-Date") ~ div',
'#dividend-grid div:has-text("Ex-Date") ~ div',
'#dividends span:has-text("Previous Ex-Date") + span',
'#dividends div:has-text("Previous Ex-Date") + div',
'#dividends *:has-text("Previous Ex-Date") ~ *',
'stock-dividends span:has-text("Previous Ex-Date") + span',
'stock-dividends div:has-text("Previous Ex-Date") + div',
'span:has-text("Previous Ex-Date") + span',
'div:has-text("Previous Ex-Date") + div',
'*:has-text("Previous Ex-Date") ~ *',
'span:has-text("Next Ex-Date") + span',
'div:has-text("Next Ex-Date") + div',
'*:has-text("Next Ex-Date") ~ *',
'*:has-text("Ex-Date") ~ *',
'[data-field="ex-date"]',
'.ex-date'
],
'Frequency': [
'#dividend-grid div:has-text("Frequency") ~ div',
'#dividends span:has-text("Frequency") + span',
'#dividends div:has-text("Frequency") + div',
'#dividends *:has-text("Frequency") ~ *',
'stock-dividends span:has-text("Frequency") + span',
'stock-dividends div:has-text("Frequency") + div',
'span:has-text("Frequency") + span',
'div:has-text("Frequency") + div',
'*:has-text("Frequency") ~ *',
'[data-field="frequency"]',
'.dividend-frequency',
'.frequency'
],
'Annual Dividend Rate': [
'#dividend-grid div:has-text("Annual Dividend Rate") ~ div',
'#dividend-grid div:has-text("IAD") ~ div',
'#dividends span:has-text("Annual Dividend Rate") + span',
'#dividends div:has-text("Annual Dividend Rate") + div',
'#dividends *:has-text("Annual Dividend Rate") ~ *',
'#dividends span:has-text("IAD") + span',
'#dividends *:has-text("IAD") ~ *',
'stock-dividends span:has-text("Annual Dividend Rate") + span',
'stock-dividends div:has-text("Annual Dividend Rate") + div',
'stock-dividends span:has-text("IAD") + span',
'span:has-text("Annual Dividend Rate") + span',
'div:has-text("Annual Dividend Rate") + div',
'*:has-text("Annual Dividend Rate") ~ *',
'span:has-text("IAD") + span',
'*:has-text("IAD") ~ *',
'[data-field="annual-rate"]',
'.annual-dividend-rate'
],
'Annual Dividend Yield': [
'#dividend-grid div:has-text("Annual Dividend Yield") ~ div',
'#dividends span:has-text("Annual Dividend Yield") + span',
'#dividends div:has-text("Annual Dividend Yield") + div',
'#dividends *:has-text("Annual Dividend Yield") ~ *',
'stock-dividends span:has-text("Annual Dividend Yield") + span',
'stock-dividends div:has-text("Annual Dividend Yield") + div',
'span:has-text("Annual Dividend Yield") + span',
'div:has-text("Annual Dividend Yield") + div',
'*:has-text("Annual Dividend Yield") ~ *',
'[data-field="dividend-yield"]',
'.dividend-yield'
]
}
# Extract each dividend field using multiple selector strategies
for field_name, selectors in dividend_fields.items():
field_found = False
# Try each selector for this field
for selector in selectors:
if field_found:
break
try:
# Scope search within dividend section if found, otherwise search whole page
full_selector = f'{dividend_section} {selector}' if dividend_section != 'body' else selector
if await page.is_visible(full_selector, timeout=1000):
value = await page.inner_text(full_selector)
clean_value = value.strip()
if clean_value and clean_value != field_name: # Ensure we got actual value, not the label
existing_value = dividend_data.get(field_name, '')
if should_replace_dividend_value(existing_value, clean_value):
dividend_data[field_name] = clean_value
field_found = True
if debug:
print(f"DEBUG: Found {field_name}: {clean_value} (selector: {full_selector})")
elif debug:
print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (ignoring selector-based value: {clean_value})")
break
except:
continue
# If standard selectors failed, try JavaScript-based text search as fallback
if not field_found:
try:
# Try multiple variations of the field name
search_terms = [field_name]
if "Previous" in field_name:
search_terms.append(field_name.replace("Previous", "Next"))
if "Annual Dividend Rate" in field_name:
search_terms.append("IAD")
if "Annual Dividend Yield" in field_name:
search_terms.append("Dividend Yield")
for search_term in search_terms:
if field_found:
break
value = await page.evaluate(rf'''
() => {{
const searchText = "{search_term}";
// First check within the dividends section specifically
const dividendsPanel = document.querySelector('#dividends');
const stockDividends = document.querySelector('stock-dividends');
const searchContainers = [dividendsPanel, stockDividends, document];
for (let container of searchContainers) {{
if (!container) continue;
const elements = Array.from(container.querySelectorAll('*'));
for (let elem of elements) {{
if (elem.textContent && elem.textContent.includes(searchText)) {{
// Look for next sibling or nearby element with value
let candidate = elem.nextElementSibling;
if (candidate && candidate.textContent &&
!candidate.textContent.includes(searchText) &&
candidate.textContent.trim().length > 0) {{
return candidate.textContent.trim();
}}
// Try parent's next sibling
candidate = elem.parentElement?.nextElementSibling;
if (candidate && candidate.textContent &&
!candidate.textContent.includes(searchText) &&
candidate.textContent.trim().length > 0) {{
return candidate.textContent.trim();
}}
// Try looking in the same element's parent for nearby text
const parent = elem.parentElement;
if (parent) {{
const parentText = parent.textContent;
const lines = parentText.split('\n');
for (let i = 0; i < lines.length; i++) {{
if (lines[i].includes(searchText) && i + 1 < lines.length) {{
const nextLine = lines[i + 1].trim();
if (nextLine && !nextLine.includes(searchText)) {{
return nextLine;
}}
}}
}}
}}
}}
}}
// If found in this container, stop searching
if (container !== document) {{
break;
}}
}}
return null;
}}
''')
if value and value.strip():
existing_value = dividend_data.get(field_name, '')
if should_replace_dividend_value(existing_value, value):
dividend_data[field_name] = value.strip()
field_found = True
if debug:
print(f"DEBUG: Found {field_name} via JS search with term '{search_term}': {value}")
elif debug:
print(f"DEBUG: Keeping existing good data for {field_name}: {existing_value} (ignoring JS search value: {value})")
break
except Exception as e:
if debug:
print(f"DEBUG: Could not find {field_name}: {e}")
continue
if debug:
print(f"DEBUG: Extracted dividend data: {dividend_data}")
return dividend_data
except Exception as e:
if debug:
print(f"DEBUG: Error extracting dividend data: {e}")
return dividend_data
async def extract(page, debug: bool = False) -> Dict[str, Any]:
    """Generic-named entry point that forwards to `extract_dividend_data`.

    Args:
        page: Page object handed straight through to `extract_dividend_data`.
        debug: Forwarded unchanged as the ``debug`` keyword argument.

    Returns:
        Whatever `extract_dividend_data` returns for this page.
    """
    dividend_data = await extract_dividend_data(page, debug=debug)
    return dividend_data

View File

@@ -0,0 +1,452 @@
import time
from typing import Any, Dict, Optional
import logging
from ...core.config import load_config, get_playwright_url
from ...browser.auth import ensure_cookies
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from ...core import Envelope, ErrorType, MorningstarData, EquityPhase1Data, fail, ok
from .morningstar import find_report, download_report_as_bytes
from ...storage.cache import ensure_cache_dir, cache_filename, read_cached_pdf, write_cached_pdf
from .parser import parse as parse_pdf
from .scraper import extract_dividend_data
from .phase1_scraper import extract_phase1_data # DOM scraping - the working approach
import re
def extract_company_name_from_title(page_title: str, ticker: str):
    """Derive a company name from a research page title.

    Site boilerplate is stripped from the title, then two strategies are
    tried in order:

    1. A leading ``<name> (<TICKER>)`` pattern, matched case-insensitively.
    2. The text before the first ``" |"`` or ``" -"`` separator, provided it
       is longer than one character and is not simply the ticker.

    Args:
        page_title: Raw page title; a falsy value yields ``None``.
        ticker: Ticker symbol expected to appear in parentheses in the title.

    Returns:
        The extracted name, or ``None`` when nothing usable is found or any
        exception is raised while parsing.
    """
    if not page_title:
        return None
    try:
        # Fragments are removed one after another, in exactly this order.
        cleaned = page_title
        for boilerplate in (
            " | Charles Schwab",
            " - Charles Schwab",
            "Stock Quote & Summary",
            "Stock Research",
            "Research",
            "- Research",
        ):
            cleaned = cleaned.replace(boilerplate, "")

        pattern = rf"^(.+?)\s*\({re.escape(ticker.upper())}\)"
        hit = re.match(pattern, cleaned, re.IGNORECASE)
        if hit:
            name = hit.group(1).strip().replace(" -", "").strip()
            # Reject single characters and purely numeric captures.
            if len(name) > 1 and not name.isdigit():
                return name

        # Fallback: take whatever precedes the first recognised separator.
        symbol = ticker.upper()
        for sep in (" |", " -"):
            if sep not in cleaned:
                continue
            candidate = cleaned.split(sep)[0].strip()
            if candidate.upper() != symbol and len(candidate) > 1:
                return candidate
        return None
    except Exception:
        # Best-effort helper: any parsing problem is reported as "no name".
        return None
async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Fetch the Phase 1 equity data for a ticker from the research page.

    Flow: obtain session cookies, connect to the browser, open the stock
    research page, check that stock-specific content rendered, then hand the
    page to `extract_phase1_data`. As documented by the original author, the
    extracted data covers quote/price data, enhanced dividend information,
    core earnings metrics, basic valuation ratios and calculated metrics.

    Args:
        ticker: Stock ticker symbol; upper-cased before use.
        debug: Enables DEBUG-level logging and a shorter navigation timeout.

    Returns:
        ``ok(...)`` wrapping the extracted data, or ``fail(...)`` with
        ``ErrorType.AUTHENTICATION`` (no cookies / navigation auth failure)
        or ``ErrorType.VALIDATION`` (stock content never appeared).
    """
    ticker = ticker.upper()
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug(f"Starting get_equity_phase1_data for {ticker}")

    # Session management: without cookies there is nothing to drive.
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    playwright_url = get_playwright_url(load_config())

    # Either the company-name span or the Morningstar section counts as
    # evidence that stock-specific content has rendered.
    stock_content_selector = 'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section'

    # Script text is kept flush-left so the literal sent to the page is unchanged.
    validation_script = '''
() => {
const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
return true;
}
const morningstarSection = document.querySelector('#morningstar-section');
if (morningstarSection) {
return true;
}
return false;
}
'''

    # Browser orchestration: page/context start as None so teardown can tell
    # which resources were actually created.
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)

        # Debug runs use the shorter navigation timeout.
        timeout = 30000 if debug else 45000
        success = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=timeout,
        )
        if not success:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )

        if debug:
            logger.debug(f"Current page URL: {page.url}")

        # A timeout here is treated as an invalid ticker.
        try:
            await page.wait_for_selector(
                stock_content_selector,
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # Second check: the name span must hold real text, or the
        # Morningstar section must exist.
        try:
            has_valid_content = await page.evaluate(validation_script)
            if not has_valid_content:
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )

        # DOM scraping is used here; the original author notes the API
        # approach failed due to CORS restrictions.
        phase1_data = await extract_phase1_data(page, debug=debug)
        return ok(phase1_data)
    finally:
        # Best-effort teardown in page -> context -> browser -> driver order;
        # a failure in one step must not prevent the remaining ones.
        teardown_steps = (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (p, "stop"),
        )
        for resource, method_name in teardown_steps:
            try:
                if resource is not None:
                    await getattr(resource, method_name)()
            except Exception:
                pass
async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Scrape the Schwab research page for ``ticker`` and return Morningstar data.

    Steps, in order: obtain session cookies, connect to the browser, navigate
    to the stock research page with an auth check, validate that the page
    shows stock content, extract the company name and dividend fields, locate
    the Morningstar report, download and cache the PDF, then parse it. When
    the download yields nothing, a previously cached PDF is parsed instead.

    Args:
        ticker: Stock symbol; upper-cased before use.
        debug: When true, sets this module's logger to DEBUG, logs extra
            detail, and uses the shorter navigation timeout.

    Returns:
        ``ok(MorningstarData)`` on success. ``fail(...)`` with
        ``ErrorType.AUTHENTICATION`` when no cookies are available or the
        navigation auth check fails, ``ErrorType.VALIDATION`` when the page
        does not look like a valid stock page, and ``ErrorType.PARSING`` when
        the report could not be parsed or was unavailable.
    """
    ticker = ticker.upper()
    ensure_cache_dir()
    logger = logging.getLogger(__name__)
    if debug:
        # NOTE(review): this changes the level of the shared module logger,
        # so it stays at DEBUG after this call returns.
        logger.setLevel(logging.DEBUG)
    logger.debug(f"Starting get_morningstar_data for {ticker}")
    # Session management
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )
    config = load_config()
    playwright_url = get_playwright_url(config)
    # Browser orchestration
    # context/page start as None so the finally block can tell what was created.
    context = None
    page = None
    p, browser = await connect(playwright_url)
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)
        # Use shared auth-aware navigation helper for consistency
        # The debug path uses the shorter (30s) timeout; otherwise 45s.
        timeout = 30000 if debug else 45000
        success = await goto_with_auth_check(
            page,
            context,
            f"https://client.schwab.com/app/research/#/stocks/{ticker}",
            debug=debug,
            timeout=timeout,
        )
        if not success:
            return fail(
                "Authentication failed while navigating to research page",
                ErrorType.AUTHENTICATION,
                retryable=True,
            )
        # Validate ticker by checking for stock page content
        # Schwab doesn't redirect on invalid tickers, but the page content is empty/invalid
        if debug:
            logger.debug(f"Current page URL: {page.url}")
        # Wait for page content to load - Schwab's research page loads asynchronously
        # Give it time to populate the DOM before validation
        try:
            # Wait for either company name or Morningstar section to appear
            # This indicates the page has loaded stock-specific content
            await page.wait_for_selector(
                'span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold), #morningstar-section',
                timeout=10000,
                state='visible'
            )
        except Exception as wait_err:
            # If neither selector appears after 10 seconds, likely an invalid ticker
            # Any exception from the wait (not only a timeout) is reported as an invalid ticker.
            if debug:
                logger.debug(f"Timeout waiting for stock content: {wait_err}")
            return fail(
                f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )
        # Additional validation: check if we have valid stock page content
        # The in-page script returns true as soon as any one of four markers is found.
        try:
            has_valid_content = await page.evaluate('''
() => {
// Look for company name span (valid stock pages have this)
const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
return true;
}
// Look for Morningstar section (valid stock pages have this)
const morningstarSection = document.querySelector('#morningstar-section');
if (morningstarSection) {
return true;
}
// Look for company profile description (valid stock pages have this)
const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
if (profileText && profileText.textContent && profileText.textContent.trim().length > 50) {
return true;
}
// Look for any stock-related content
const stockContent = document.querySelector('#stock-details, #quote, [data-testid="stock-quote"]');
if (stockContent) {
return true;
}
return false;
}
''')
            if debug:
                logger.debug(f"Valid stock content detected: {has_valid_content}")
            if not has_valid_content:
                if debug:
                    logger.debug(f"Invalid ticker detected - no stock content found")
                return fail(
                    f"Invalid ticker: {ticker}. This appears not to be a valid stock ticker.",
                    ErrorType.VALIDATION,
                    retryable=False,
                )
        except Exception as e:
            logger.debug(f"Error checking for valid content: {e}")
            # If we can't check, assume invalid and return error
            return fail(
                f"Invalid ticker: {ticker}. Unable to validate ticker.",
                ErrorType.VALIDATION,
                retryable=False,
            )
        # Company name - extract from page elements
        # Best effort: a failure here is logged and company_name stays None.
        company_name = None
        try:
            # Strategy 1: Extract from company name span element
            company_name = await page.evaluate('''
() => {
// Look for company name in title span
const nameSpan = document.querySelector('span.sdps-title-3.sc-sdps-solo-layout:not(.sdps-font-bold)');
if (nameSpan && nameSpan.textContent && nameSpan.textContent.trim().length > 2) {
return nameSpan.textContent.trim();
}
// Fallback: Extract from company profile description
const profileText = document.querySelector('p.sdps-text-body.sc-sdps-solo-layout');
if (profileText && profileText.textContent) {
const text = profileText.textContent.trim();
// Extract company name before " designs" or " is" or " provides"
const match = text.match(/^([A-Za-z0-9\\s&\\.,'-]+?)(?:\\s+(?:designs|is|provides|manufactures|operates|offers|engages))/i);
if (match) {
return match[1].trim();
}
}
return null;
}
''')
            if debug and company_name:
                logger.debug(f"Extracted company name: {company_name}")
        except Exception as e:
            logger.debug(f"Company name extraction error: {e}")
        # Morningstar section wait
        # A missing section is only logged; extraction below still proceeds.
        try:
            await page.wait_for_selector('#morningstar-section', timeout=30000)
        except Exception:
            logger.debug("#morningstar-section not found within timeout")
        # Dividends
        # Best effort: on failure every dividend field below resolves to None.
        try:
            dividend_data = await extract_dividend_data(page, debug=debug)
        except Exception as exc:
            logger.debug(f"Dividend extraction error: {exc}")
            dividend_data = {}
        # Find report and download/cache
        report_url, report_date = await find_report(page, debug=debug)
        data: Dict[str, Any] = {}
        if report_date:
            data["Morningstar Equity Report Date"] = report_date.strip()
        if report_url:
            # Only store actual URL, not the __CLICK_TO_OPEN__ marker
            if report_url != '__CLICK_TO_OPEN__':
                data["Morningstar Equity Report URL"] = report_url
            # The marker value is still handed to the downloader.
            # NOTE(review): presumably the downloader treats it as "click to open" - confirm.
            pdf_bytes = await download_report_as_bytes(page, report_url, debug=debug)
        else:
            pdf_bytes = None
        parsed_data: Dict[str, Any] = {}
        if pdf_bytes:
            # Build the cache-file date as MM-DD-YYYY from the report date
            # (expected like "Jan 02, 2024"); fall back to today's date.
            if report_date:
                from datetime import datetime
                try:
                    dt = datetime.strptime(report_date, "%b %d, %Y")
                    formatted_date = dt.strftime("%m-%d-%Y")
                except Exception:
                    # Unparseable date text: reuse it with spaces replaced by dashes.
                    formatted_date = report_date.replace(" ", "-")
            else:
                formatted_date = time.strftime("%m-%d-%Y")
            # The PDF is cached before parsing, so a parse failure still leaves it on disk.
            write_cached_pdf(ticker, formatted_date, pdf_bytes)
            try:
                parsed_data = parse_pdf(pdf_bytes)
                parsed_data["source"] = "live"
            except Exception as exc:
                logger.debug(f"PDF parsing failed: {exc}")
                parsed_data = {"error": "Failed to parse Morningstar report"}
        else:
            # No live PDF: fall back to whatever the cache holds for this ticker.
            cached = read_cached_pdf(ticker)
            if cached:
                try:
                    parsed_data = parse_pdf(cached)
                    parsed_data["source"] = "cache"
                except Exception as exc:
                    logger.debug(f"Cached PDF parsing failed: {exc}")
                    parsed_data = {"error": "Failed to parse cached Morningstar report"}
            else:
                parsed_data = {"error": f"Failed to download and no cache available for {ticker}"}
        # Dividend fields come from the page; valuation fields come from the parsed PDF.
        morningstar = MorningstarData(
            ticker=ticker,
            company_name=company_name,
            previous_dividend_payment=dividend_data.get("Previous Dividend Payment"),
            previous_pay_date=dividend_data.get("Previous Pay Date"),
            previous_ex_date=dividend_data.get("Previous Ex-Dividend Date"),
            frequency=dividend_data.get("Frequency"),
            annual_dividend_rate=dividend_data.get("Annual Dividend Rate"),
            annual_dividend_yield=dividend_data.get("Annual Dividend Yield"),
            fair_value=parsed_data.get("Fair Value"),
            economic_moat=parsed_data.get("Economic Moat"),
            capital_allocation=parsed_data.get("Capital Allocation"),
            rating=_safe_int(parsed_data.get("Morningstar Rating")),
            one_star_price=parsed_data.get("1-Star Price"),
            five_star_price=parsed_data.get("5-Star Price"),
            assessment=parsed_data.get("Assessment"),
            range_52_week=parsed_data.get("52-Week Range"),
            dividend_yield=parsed_data.get("Dividend Yield"),
            investment_style=parsed_data.get("Investment Style"),
            report_url=data.get("Morningstar Equity Report URL"),
            report_date=data.get("Morningstar Equity Report Date"),
            source=parsed_data.get("source"),
        )
        # An "error" entry discards the assembled record (including the
        # dividend fields) and reports a retryable parsing failure instead.
        if parsed_data.get("error"):
            return fail(parsed_data["error"], ErrorType.PARSING, retryable=True)
        return ok(morningstar)
    finally:
        # Best-effort teardown: each resource is closed independently so a
        # failure closing one does not prevent closing the others.
        try:
            if page is not None:
                await page.close()
        except Exception:
            pass
        try:
            if context is not None:
                await context.close()
        except Exception:
            pass
        for handle in (browser,):
            try:
                if handle is not None:
                    await handle.close()
            except Exception:
                pass
        try:
            if p is not None:
                await p.stop()
        except Exception:
            pass
def _safe_int(value: Any) -> Optional[int]:
if value is None:
return None
try:
return int(str(value).strip())
except (TypeError, ValueError):
return None

View File

@@ -0,0 +1,47 @@
from __future__ import annotations
import csv
import io
from dataclasses import asdict
from typing import List, Dict, Any
from ...core.models import TransactionRecord, TransactionData, AccountInfo
def parse_csv_content(csv_bytes: bytes) -> List[TransactionRecord]:
    """Parse Schwab transaction CSV bytes into a list of TransactionRecord.

    Expected headers:
    Date,Action,Symbol,Description,Quantity,Price,Fees & Comm,Amount

    Args:
        csv_bytes: Raw CSV content encoded as UTF-8, with or without a
            leading byte-order mark.

    Returns:
        One TransactionRecord per data row. ``date``, ``action`` and
        ``description`` are stripped strings (empty when missing); the
        remaining fields are stripped strings, or ``None`` when missing or
        blank.

    Raises:
        UnicodeDecodeError: If ``csv_bytes`` is not valid UTF-8.
    """

    def _required(value: Any) -> str:
        # Missing cells come back from DictReader as None.
        return (value or "").strip()

    def _optional(value: Any) -> Any:
        # Blank or whitespace-only cells are normalised to None.
        return (value or "").strip() or None

    # "utf-8-sig" drops a leading BOM if present; with plain "utf-8" the BOM
    # would stay glued to the first header and every "Date" lookup would miss.
    reader = csv.DictReader(io.StringIO(csv_bytes.decode("utf-8-sig")))
    return [
        TransactionRecord(
            date=_required(row.get("Date")),
            action=_required(row.get("Action")),
            symbol=_optional(row.get("Symbol")),
            description=_required(row.get("Description")),
            quantity=_optional(row.get("Quantity")),
            price=_optional(row.get("Price")),
            fees_comm=_optional(row.get("Fees & Comm")),
            amount=_optional(row.get("Amount")),
        )
        for row in reader
    ]
def to_dicts(transaction_data: TransactionData) -> Dict[str, Any]:
    """Flatten a TransactionData into plain dicts suitable for JSON output.

    The account info and each transaction are converted with
    ``dataclasses.asdict``; the remaining fields are copied through as-is.
    """
    payload: Dict[str, Any] = {}
    payload["account_info"] = asdict(transaction_data.account_info)
    payload["transactions"] = list(map(asdict, transaction_data.transactions))
    # Scalar metadata is passed through unchanged.
    payload["date_range"] = transaction_data.date_range
    payload["export_date"] = transaction_data.export_date
    payload["total_transactions"] = transaction_data.total_transactions
    payload["source"] = transaction_data.source
    return payload

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,833 @@
from __future__ import annotations
import json
from typing import Optional, Dict, Any, List
from datetime import datetime, timezone
import re
from ...browser.auth import ensure_cookies
from ...core.config import load_config, get_playwright_url
from ...browser.client import connect, new_context, new_page
from ...browser.navigation import goto_with_auth_check
from .scraper import (
perform_export_download,
perform_export_download_enhanced,
discover_accounts_from_page,
discover_accounts_with_numbers,
)
from .parser import parse_csv_content
from ...storage.cache import (
write_cached_transaction_csv,
read_cached_transaction_csv,
TRANSACTION_CACHE_DIR,
)
from ...core.models import AccountInfo, TransactionData
from ...core import Envelope, ErrorType, fail, ok
import os
async def _get_transaction_history_enhanced_impl(
    account: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_period: Optional[str] = None,
    debug: bool = False,
) -> Envelope[TransactionData]:
    """Export transactions via the enhanced download flow, with a cache fallback.

    Args:
        account: Account identifier (ending digits like '674', type like
            'PLA Assets', or full label like 'PLA_Assets_XXX674').
        start_date, end_date: Reserved for future "Custom" range support;
            not used by this function.
        time_period: One of the pre-defined periods (e.g., "Current Month",
            "Last 6 Months"). If None, uses page default.
        debug: Enable debug printing (also forwarded to the export helper).

    Returns:
        ``ok(TransactionData)`` with ``source="live"`` after a successful
        export, or ``source="cache"`` when the export failed but cached CSV
        data exists for the requested account. Otherwise ``fail(...)``:
        ``AUTHENTICATION`` when no cookies are available, ``PARSING`` when
        the downloaded file is missing or yields no rows, and ``UNKNOWN``
        for export failures and unexpected exceptions.
    """
    print("Starting enhanced transaction export...")
    if debug:
        print(f" Account: {account}")
        print(f" Time period: {time_period}")
    # Load configuration and cookies
    config = load_config()
    playwright_url = get_playwright_url(config)
    cookies = await ensure_cookies()
    if not cookies:
        return fail(
            "Could not establish session. Check credentials or manually refresh cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )
    # Connect to browser
    p, browser = await connect(playwright_url)
    context = None
    page = None
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)
        # Use the enhanced export function
        export_result = await perform_export_download_enhanced(
            page=page,
            time_period=time_period,
            account=account,
            debug=debug,
            context=context,
            preserve_filename=True
        )
        if not export_result.get("success"):
            # Try fallback to cached data (only possible when an account was named)
            if account:
                if debug:
                    print("Enhanced export failed, trying cached fallback...")
                # Determine account label for cache lookup
                account_label = account
                if account.isdigit():
                    # Try to discover accounts to find the full label for these digits
                    try:
                        accounts = await discover_accounts_with_numbers(page, debug=debug)
                        for acc in accounts:
                            if acc['ending'] == account[-3:]:
                                account_label = acc['label']
                                break
                    except Exception:
                        # Discovery is best effort; fall back to the raw digits as label.
                        pass
                cached_bytes = read_cached_transaction_csv(account_label)
                if cached_bytes:
                    if debug:
                        print(f"Using cached data for {account_label}")
                    # Parse the cached CSV bytes
                    records = parse_csv_content(cached_bytes)
                    # Build account info from the label
                    account_type = account_label.split('_')[0] if '_' in account_label else "Unknown"
                    account_ending = account_label[-3:] if account_label[-3:].isdigit() else "000"
                    data = TransactionData(
                        account_info=AccountInfo(
                            account_type=account_type,
                            account_ending=account_ending,
                            full_description=account_label,
                            is_selected=True,
                        ),
                        transactions=records,
                        date_range=time_period or "Unknown",
                        export_date="Unknown",
                        total_transactions=len(records),
                        source="cache",
                    )
                    return ok(data)
            return fail(
                export_result.get("error", "Enhanced export failed."),
                ErrorType.UNKNOWN,
                retryable=True,
            )
        # Parse the exported CSV
        saved_path = export_result.get("saved_path")
        if not saved_path or not os.path.exists(saved_path):
            return fail("Export file not found after download", ErrorType.PARSING, retryable=True)
        with open(saved_path, 'r', encoding='utf-8') as f:
            csv_content = f.read()
        parsed_data = parse_csv_content(csv_content.encode('utf-8'))
        if not parsed_data:
            return fail("Failed to parse CSV: No transactions found", ErrorType.PARSING, retryable=True)
        # Build response; tolerate an explicit None under "account_info"
        account_info = export_result.get("account_info") or {}
        transactions = parsed_data
        # Cache the results (only when the account ending is known)
        if account_info.get("account_ending"):
            account_label = f"{account_info.get('account_type', 'Unknown')}_XXX{account_info.get('account_ending')}"
            try:
                # Generate timestamp for filename
                timestamp = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')
                # Convert transactions back to CSV format for caching
                import csv
                import io
                output = io.StringIO()
                writer = csv.writer(output)
                # Write header
                writer.writerow(["Date", "Action", "Symbol", "Description", "Quantity", "Price", "Fees & Comm", "Amount"])
                # Write transaction data
                for transaction in transactions:
                    writer.writerow([
                        transaction.date,
                        transaction.action,
                        transaction.symbol or "",
                        transaction.description,
                        transaction.quantity or "",
                        transaction.price or "",
                        transaction.fees_comm or "",
                        transaction.amount or ""
                    ])
                csv_bytes = output.getvalue().encode('utf-8')
                write_cached_transaction_csv(account_label, timestamp, csv_bytes)
                if debug:
                    print(f"Cached transaction data for {account_label}")
            except Exception as e:
                # Caching is best effort; the live result is still returned.
                if debug:
                    print(f"Failed to cache data: {e}")
        data = TransactionData(
            account_info=AccountInfo(
                account_type=account_info.get("account_type", "Unknown"),
                account_ending=account_info.get("account_ending", "000"),
                full_description=account_info.get("full_description", ""),
                is_selected=account_info.get("is_selected", True),
            ),
            transactions=transactions,
            date_range=time_period or "Unknown",
            export_date=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
            total_transactions=len(transactions),
            source="live",
        )
        if debug:
            print(f"✅ Enhanced export successful: {len(transactions)} transactions")
        return ok(data)
    except Exception as e:
        if debug:
            print(f"Enhanced export exception: {e}")
            import traceback
            traceback.print_exc()
        return fail(f"Enhanced export failed: {str(e)}", ErrorType.UNKNOWN, retryable=True)
    finally:
        # Best-effort teardown. Each resource is released independently so a
        # failing close() neither skips the remaining ones nor replaces the
        # envelope being returned. The Playwright handle `p` is stopped as
        # well, matching the other functions in this module.
        for resource, method_name in (
            (page, "close"),
            (context, "close"),
            (browser, "close"),
            (p, "stop"),
        ):
            if resource is None:
                continue
            try:
                await getattr(resource, method_name)()
            except Exception as cleanup_err:
                if debug:
                    print(f"Cleanup step {method_name}() failed: {cleanup_err}")
async def _ensure_cookies() -> Optional[List[Dict[str, Any]]]:
    """Return session cookies by delegating to the shared ``ensure_cookies`` helper."""
    session_cookies = await ensure_cookies()
    return session_cookies
def _get_latest_cache_csv_filename(account_label: str) -> Optional[str]:
    """Return the name of the newest CSV in the account's cache directory.

    Files are ordered by modification time; if reading mtimes fails, plain
    lexical order is used instead. Returns ``None`` when the directory is
    missing or holds no ``.csv`` files.
    """
    import os
    account_dir = os.path.join(TRANSACTION_CACHE_DIR, account_label)
    candidates = (
        [entry for entry in os.listdir(account_dir) if entry.lower().endswith('.csv')]
        if os.path.isdir(account_dir)
        else []
    )
    if not candidates:
        return None
    try:
        ordered = sorted(
            candidates,
            key=lambda entry: os.path.getmtime(os.path.join(account_dir, entry)),
        )
    except Exception:
        # mtime lookup failed for some file: fall back to name order.
        ordered = sorted(candidates)
    return ordered[-1]
def _is_cache_fresh_for_label(account_label: str, max_age_hours: int = 24) -> bool:
    """Tell whether the newest cached CSV for ``account_label`` is recent enough.

    Freshness is judged by file modification time: the newest ``.csv`` in the
    label's cache directory must be at most ``max_age_hours`` old. A missing
    directory or a directory without CSV files counts as not fresh.
    """
    import os
    import time
    account_dir = os.path.join(TRANSACTION_CACHE_DIR, account_label)
    if not os.path.isdir(account_dir):
        return False
    mtimes = [
        os.path.getmtime(os.path.join(account_dir, entry))
        for entry in os.listdir(account_dir)
        if entry.lower().endswith('.csv')
    ]
    if not mtimes:
        return False
    newest_age = time.time() - max(mtimes)
    return newest_age <= max_age_hours * 3600
def _match_account_label_from_cache(account_query: Optional[str]) -> Optional[str]:
    """Resolve a cached account label for a query like '604' or 'PLA_Assets_XXX674'.

    Only labels whose newest CSV is fresh (per ``_is_cache_fresh_for_label``)
    are eligible.

    Args:
        account_query: Full label, trailing account digits, or any substring
            of a label (case-insensitive). When empty or ``None``, the fresh
            label whose cache directory was modified most recently is
            returned.

    Returns:
        The best matching fresh label, or ``None`` when the cache directory is
        missing or nothing fresh matches. Matches are ranked exact label >
        trailing digits > substring, so a loose substring hit cannot shadow
        the account that was actually asked for. Ties keep directory-listing
        order.
    """
    import os
    if not os.path.isdir(TRANSACTION_CACHE_DIR):
        return None
    labels = [name for name in os.listdir(TRANSACTION_CACHE_DIR)
              if os.path.isdir(os.path.join(TRANSACTION_CACHE_DIR, name))]
    if not labels:
        return None

    def match_rank(label: str, query: str) -> Optional[int]:
        """Return 0 for exact, 1 for digit-suffix, 2 for substring, None for no match."""
        if query == label:
            return 0
        if query.isdigit() and label.endswith(query):
            return 1
        if query.lower() in label.lower():
            return 2
        return None

    # If no query provided: return latest fresh label if any
    if not account_query:
        fresh_labels = [lbl for lbl in labels if _is_cache_fresh_for_label(lbl)]
        if not fresh_labels:
            return None
        return max(
            fresh_labels,
            key=lambda n: os.path.getmtime(os.path.join(TRANSACTION_CACHE_DIR, n)),
        )

    # Query provided: pick the best-ranked matching label that is fresh
    best_label: Optional[str] = None
    best_rank: Optional[int] = None
    for lbl in labels:
        rank = match_rank(lbl, account_query)
        if rank is None:
            continue
        if best_rank is not None and rank >= best_rank:
            # Not better than what we already have; skip the freshness check.
            continue
        if _is_cache_fresh_for_label(lbl):
            best_label, best_rank = lbl, rank
            if rank == 0:
                break  # an exact fresh match cannot be beaten
    return best_label
def _build_transaction_data(
    account_label: str,
    records: List[Any],
    date_range: str,
    source: str,
) -> TransactionData:
    """Wrap parsed records in a TransactionData, deriving account info from the label."""
    return TransactionData(
        account_info=AccountInfo(
            account_type=account_label.split('_')[0],
            account_ending=account_label[-3:],
            full_description=account_label,
            is_selected=True,
        ),
        transactions=records,
        date_range=date_range,
        export_date=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
        total_transactions=len(records),
        source=source,
    )


def _load_cached_transaction_data(
    account: Optional[str],
    time_period: Optional[str],
) -> Optional[TransactionData]:
    """Return TransactionData built from a fresh cached CSV for ``account``, or None."""
    account_label = _match_account_label_from_cache(account)
    if not account_label:
        return None
    cached_bytes = read_cached_transaction_csv(account_label)
    if not cached_bytes:
        return None
    records = parse_csv_content(cached_bytes)
    return _build_transaction_data(account_label, records, time_period or "Cache", "cache")


async def _export_and_cache_transactions(
    page: Any,
    context: Any,
    account: Optional[str],
    time_period: Optional[str],
    debug: bool,
) -> TransactionData:
    """Run one live export, write the CSV to the cache, and parse it."""
    download = await perform_export_download(
        page,
        time_period=time_period,
        account=account,
        debug=debug,
        context=context,
    )
    csv_bytes = download["content"]
    account_label = download["label"]
    ts = download["ts"]
    # Cache first so the raw export survives even if parsing fails.
    write_cached_transaction_csv(account_label, ts, csv_bytes)
    records = parse_csv_content(csv_bytes)
    return _build_transaction_data(account_label, records, time_period or "Page Default", "live")


async def _close_browser_session(context: Any, browser: Any, p: Any) -> None:
    """Best-effort close of context, browser and Playwright handle; errors are ignored."""
    try:
        if context is not None:
            await context.close()
    except Exception:
        pass
    try:
        await browser.close()
    except Exception:
        pass
    try:
        await p.stop()
    except Exception:
        pass


async def _get_transaction_history_impl(
    account: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_period: Optional[str] = None,
    debug: bool = False,
) -> Envelope[TransactionData]:
    """Export and parse transaction history for the selected account.

    Flow: validate the optional dates, obtain cookies (serving fresh cached
    data if none are available), run a live export, retry once on a fresh
    browser connection if it fails, and finally fall back to fresh cached
    data.

    Args:
        account: Account identifier (ending digits like '604', name like
            'Joint', or full label like 'PLA_Assets_XXX674'). Forwarded to the
            export helper and used for the cache lookup.
        start_date, end_date: Reserved for future "Custom" range support. They
            are validated (YYYY-MM-DD or MM/DD/YYYY, start <= end) but not
            otherwise used.
        time_period: One of the pre-defined periods (e.g., "Current Month",
            "Last 6 Months"). If None, uses page default.
        debug: Enable debug printing.

    Returns:
        ``ok(TransactionData)`` with ``source`` "live" or "cache", or
        ``fail(...)`` with ``VALIDATION`` (bad dates), ``AUTHENTICATION`` (no
        cookies and no fresh cache) or ``UNKNOWN`` (export failed twice with
        no fresh cache, or an unexpected error).
    """
    # Basic input validation for optional custom date params
    def _parse_date(date_str: str) -> Optional[datetime]:
        # Accept YYYY-MM-DD or MM/DD/YYYY
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_str):
            try:
                return datetime.strptime(date_str, "%Y-%m-%d")
            except ValueError:
                return None
        if re.fullmatch(r"\d{2}/\d{2}/\d{4}", date_str):
            try:
                return datetime.strptime(date_str, "%m/%d/%Y")
            except ValueError:
                return None
        return None

    if start_date:
        start_dt = _parse_date(start_date)
        if not start_dt:
            return fail(f"Invalid start_date format: '{start_date}'. Use YYYY-MM-DD or MM/DD/YYYY.", ErrorType.VALIDATION, retryable=False)
    else:
        start_dt = None
    if end_date:
        end_dt = _parse_date(end_date)
        if not end_dt:
            return fail(f"Invalid end_date format: '{end_date}'. Use YYYY-MM-DD or MM/DD/YYYY.", ErrorType.VALIDATION, retryable=False)
    else:
        end_dt = None
    if start_dt and end_dt and start_dt > end_dt:
        return fail(
            "start_date must be on or before end_date",
            ErrorType.VALIDATION,
            retryable=False,
        )

    cookies = await _ensure_cookies()
    if not cookies:
        # No session possible: serve fresh cached data if there is any.
        cached = _load_cached_transaction_data(account, time_period)
        if cached is not None:
            return ok(cached)
        return fail(
            "Unable to establish a session. Provide credentials in config.json or a valid cookies.json.",
            ErrorType.AUTHENTICATION,
            retryable=False,
        )

    config = load_config()
    playwright_url = get_playwright_url(config)
    p, browser = await connect(playwright_url)
    context = None
    page = None
    try:
        context = await new_context(browser, cookies=cookies)
        page = await new_page(context)
        try:
            return ok(await _export_and_cache_transactions(page, context, account, time_period, debug))
        except Exception as e:
            # First failure: attempt one reconnect and retry, then fallback to cache
            if debug:
                try:
                    print(f"DEBUG: perform_export_download failed: {type(e).__name__}: {e}")
                except Exception:
                    pass
            try:
                # Tear down the (possibly dead) session before reconnecting.
                await _close_browser_session(context, browser, p)
                # Rebinding these names means the outer finally closes the new session.
                p, browser = await connect(playwright_url)
                context = await new_context(browser, cookies=cookies)
                page = await new_page(context)
                if debug:
                    print("DEBUG: Retrying perform_export_download after reconnect...")
                return ok(await _export_and_cache_transactions(page, context, account, time_period, debug))
            except Exception as e2:
                if debug:
                    try:
                        print(f"DEBUG: Retry after reconnect failed: {type(e2).__name__}: {e2}")
                    except Exception:
                        pass
                # Fall back to cache if available and fresh
                cached = _load_cached_transaction_data(account, time_period)
                if cached is not None:
                    return ok(cached)
                return fail("Export failed and no fresh cache available", ErrorType.UNKNOWN, retryable=True)
    except Exception as e:
        return fail(str(e), ErrorType.UNKNOWN, retryable=True)
    finally:
        await _close_browser_session(context, browser, p)
def _get_cache_accounts(debug: bool = False) -> List[Dict[str, Any]]:
    """List accounts inferred from the transaction cache directory.

    Each sub-directory of ``TRANSACTION_CACHE_DIR`` that contains at least one
    ``.csv`` file is turned into an account entry with keys ``label``,
    ``type``, ``ending`` (last three digits) and ``cache_info``
    (``last_updated`` from the directory mtime, and ``csv_count``).
    Directories whose account ending cannot be determined are skipped.

    Args:
        debug: When true, print diagnostic messages.

    Returns:
        Account entries ordered by directory modification time, newest first.
        Empty when the cache directory does not exist.
    """
    from ...storage.cache import TRANSACTION_CACHE_DIR
    import os
    from datetime import datetime
    if not os.path.isdir(TRANSACTION_CACHE_DIR):
        if debug:
            print(f"DEBUG: Cache directory does not exist: {TRANSACTION_CACHE_DIR}")
        return []
    out = []
    cache_dirs = []
    # Collect all cache directories with metadata
    for name in os.listdir(TRANSACTION_CACHE_DIR):
        path = os.path.join(TRANSACTION_CACHE_DIR, name)
        if os.path.isdir(path):
            try:
                # Get directory modification time and file count
                stat = os.stat(path)
                # NOTE(review): this suffix test is case-sensitive, unlike the
                # `.lower().endswith('.csv')` used by the other cache helpers in this module.
                csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
                cache_dirs.append({
                    'name': name,
                    'path': path,
                    'mtime': stat.st_mtime,
                    'csv_count': len(csv_files),
                    'csv_files': csv_files
                })
            except Exception as e:
                if debug:
                    print(f"DEBUG: Error processing cache dir {name}: {e}")
                continue
    # Sort by modification time (most recent first) to prioritize active accounts
    cache_dirs.sort(key=lambda x: x['mtime'], reverse=True)
    if debug:
        print(f"DEBUG: Found {len(cache_dirs)} cache directories")
    for cache_info in cache_dirs:
        name = cache_info['name']
        csv_files = cache_info['csv_files']
        if not csv_files:
            if debug:
                print(f"DEBUG: Skipping {name} - no CSV files")
            continue
        try:
            # Start from the directory name; the strategies below may refine it.
            normalized_label = name
            account_type = None
            account_ending = None
            # Strategy 1: Use directory name if it matches expected pattern
            # (letters/underscores, then "_XXX", then 3-4 digits)
            if re.match(r"^[A-Za-z_]+_XXX\d{3,4}$", name):
                normalized_label = name
                parts = name.split('_XXX')
                account_type = parts[0].replace('_', ' ')
                account_ending = parts[1] if len(parts) > 1 else name[-3:]
            else:
                # Strategy 2: Parse the label from a CSV filename
                try:
                    from .scraper import parse_suggested_filename
                    # Lexicographically last filename; this is the most recent file
                    # only if the names sort chronologically (TODO confirm).
                    latest_csv = sorted(csv_files)[-1]
                    parsed_filename = parse_suggested_filename(latest_csv)
                    normalized_label = parsed_filename["label"]
                    # Extract type and ending from parsed label; when the label has no
                    # "_XXX", both stay None and the entry is rejected by the validation below.
                    if '_XXX' in normalized_label:
                        parts = normalized_label.split('_XXX')
                        account_type = parts[0].replace('_', ' ')
                        account_ending = parts[1] if len(parts) > 1 else normalized_label[-3:]
                except Exception as e:
                    if debug:
                        print(f"DEBUG: Failed to parse filename for {name}: {e}")
                    # Strategy 3: Fallback to directory name parsing
                    # ("000" is non-empty and numeric, so it passes the validation below)
                    normalized_label = name
                    account_type = name
                    account_ending = name[-3:] if name[-3:].isdigit() else "000"
            # Validate the parsed data: need at least three digits
            if not account_ending or not account_ending.isdigit() or len(account_ending) < 3:
                if debug:
                    print(f"DEBUG: Invalid account ending for {name}: {account_ending}")
                continue
            # Create account entry
            account_entry = {
                "label": normalized_label,
                "type": account_type or normalized_label.split('_')[0],
                "ending": account_ending[-3:],  # Keep only the last three digits
                "cache_info": {
                    # Naive local-time timestamp derived from the directory mtime
                    "last_updated": datetime.fromtimestamp(cache_info['mtime']).isoformat(),
                    "csv_count": cache_info['csv_count']
                }
            }
            out.append(account_entry)
            if debug:
                print(f"DEBUG: Added cache account: {normalized_label} ({account_type} ending {account_ending[-3:]}) - {cache_info['csv_count']} files")
        except Exception as e:
            # One bad directory must not abort the whole listing.
            if debug:
                print(f"DEBUG: Error processing cache account {name}: {e}")
            continue
    if debug:
        print(f"DEBUG: Successfully processed {len(out)} accounts from cache")
        if not out:
            print(f"DEBUG: Cache directory contents: {os.listdir(TRANSACTION_CACHE_DIR) if os.path.isdir(TRANSACTION_CACHE_DIR) else 'N/A'}")
    return out
async def _list_available_accounts_impl(debug: bool = False) -> List[Dict[str, Any]]:
    """Return available accounts from the live page when possible, else from cache.

    With session cookies, the history page is opened (up to two auth-checked
    navigation attempts) and accounts are discovered from it. Live accounts
    are merged with cached ones keyed by account ending, live entries taking
    precedence. Any failure of the live path, including failing to connect to
    the browser, falls through to the cache-only listing.

    Args:
        debug: When true, print diagnostic messages.

    Returns:
        A list of account dicts (possibly empty).
    """
    if debug:
        print("DEBUG: Starting account listing with enhanced discovery...")
    # Try live discovery with enhanced error handling
    cookies = await _ensure_cookies()
    if cookies:
        if debug:
            print("DEBUG: Session cookies available, attempting live account discovery...")
        config = load_config()
        playwright_url = get_playwright_url(config)
        # All handles start as None so cleanup works even if connect() fails.
        p = None
        browser = None
        context = None
        page = None
        try:
            # Connecting inside the try lets a browser connection failure
            # fall back to the cache instead of escaping to the caller.
            p, browser = await connect(playwright_url)
            context = await new_context(browser, cookies=cookies)
            page = await new_page(context)
            # Use centralized auth-aware navigation with retry
            max_auth_attempts = 2
            auth_success = False
            for auth_attempt in range(max_auth_attempts):
                if debug:
                    print(f"DEBUG: Authentication attempt {auth_attempt + 1}/{max_auth_attempts}...")
                auth_success = await goto_with_auth_check(page, context, "https://client.schwab.com/app/accounts/history/#/", debug=debug)
                if auth_success:
                    break
                elif auth_attempt < max_auth_attempts - 1:
                    if debug:
                        print("DEBUG: Authentication failed, retrying...")
                    # Brief pause before the next attempt.
                    await page.wait_for_timeout(3000)
            if not auth_success:
                if debug:
                    print("DEBUG: All authentication attempts failed")
                # Caught below; routes execution to the cache fallback.
                raise Exception("Authentication failed after multiple attempts")
            if debug:
                print("DEBUG: Successfully authenticated, discovering accounts from live dropdown...")
            # Enhanced account discovery with fallback strategies
            accounts = []
            try:
                accounts = await discover_accounts_from_page(page, debug=debug)
                if debug:
                    print(f"DEBUG: Live account discovery returned {len(accounts)} accounts")
            except Exception as e:
                if debug:
                    print(f"DEBUG: Live account discovery failed: {e}")
                accounts = []
            # Enhanced result processing
            if accounts:
                if debug:
                    print(f"DEBUG: Successfully discovered {len(accounts)} accounts from live page:")
                    for acc in accounts:
                        print(f"DEBUG: - {acc['label']} ({acc['type']} ending {acc['ending']})")
                # Always try to enrich with cache data for completeness
                cache_accounts = _get_cache_accounts(debug=debug)
                if cache_accounts:
                    if debug:
                        print(f"DEBUG: Found {len(cache_accounts)} accounts in cache, merging...")
                    # Merge live and cache, preferring live data but keeping unique cache entries
                    combined = {acc['ending']: acc for acc in cache_accounts}
                    live_endings = set()
                    for live_acc in accounts:
                        combined[live_acc['ending']] = live_acc  # Live data takes precedence
                        live_endings.add(live_acc['ending'])
                    result = list(combined.values())
                    if debug:
                        print(f"DEBUG: Final merged result: {len(result)} accounts")
                        for acc in result:
                            source = "live" if acc['ending'] in live_endings else "cache"
                            print(f"DEBUG: - {acc['label']} ({acc['type']} ending {acc['ending']}) [{source}]")
                    return result
                else:
                    if debug:
                        print("DEBUG: No cache data available, returning live accounts only")
                    return accounts
            else:
                if debug:
                    print("DEBUG: No accounts discovered from live page, falling back to cache only")
        except Exception as e:
            if debug:
                print(f"DEBUG: Live account discovery failed with error: {e}")
            # Continue to cache fallback
        finally:
            # Best-effort cleanup: each handle is released independently.
            cleanup_tasks = []
            if context is not None:
                cleanup_tasks.append(context.close())
            if browser is not None:
                cleanup_tasks.append(browser.close())
            if p is not None:
                cleanup_tasks.append(p.stop())
            for task in cleanup_tasks:
                try:
                    await task
                except Exception:
                    pass
    else:
        if debug:
            print("DEBUG: No session cookies available, skipping live discovery")
    # Enhanced cache fallback
    if debug:
        print("DEBUG: Using cache-only fallback for account listing...")
    cache_accounts = _get_cache_accounts(debug=debug)
    if cache_accounts:
        if debug:
            print(f"DEBUG: Successfully retrieved {len(cache_accounts)} accounts from cache")
        return cache_accounts
    else:
        if debug:
            print("DEBUG: No accounts found in cache either")
        return []
async def list_available_accounts(debug: bool = False) -> Envelope[List[Dict[str, Any]]]:
    """Return the discovered accounts wrapped in an envelope.

    Delegates to ``_list_available_accounts_impl`` and wraps its result with
    ``ok``. Any exception raised along the way is converted into a retryable
    ``ErrorType.UNKNOWN`` failure envelope rather than propagated.
    """
    try:
        discovered = await _list_available_accounts_impl(debug=debug)
        return ok(discovered)
    except Exception as err:
        message = str(err)
        return fail(message, ErrorType.UNKNOWN, retryable=True)
async def get_transaction_history(
    account: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_period: Optional[str] = None,
    debug: bool = False,
) -> Envelope[TransactionData]:
    """Forward all arguments unchanged to ``_get_transaction_history_impl``.

    Returns whatever envelope the implementation produces; no error handling
    is added at this layer.
    """
    forwarded = {
        "account": account,
        "start_date": start_date,
        "end_date": end_date,
        "time_period": time_period,
        "debug": debug,
    }
    return await _get_transaction_history_impl(**forwarded)
async def get_transaction_history_enhanced(
    account: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_period: Optional[str] = None,
    debug: bool = False,
) -> Envelope[TransactionData]:
    """Forward all arguments unchanged to ``_get_transaction_history_enhanced_impl``.

    Returns whatever envelope the implementation produces; no error handling
    is added at this layer.
    """
    forwarded = {
        "account": account,
        "start_date": start_date,
        "end_date": end_date,
        "time_period": time_period,
        "debug": debug,
    }
    return await _get_transaction_history_enhanced_impl(**forwarded)

View File

View File

@@ -0,0 +1,74 @@
from fastapi import FastAPI, HTTPException
import asyncio
from schwab_scraper import unified_api
from schwab_scraper.core import Envelope
# ASGI application object; the route decorators below register against it.
app = FastAPI(title="Schwab Scraper API", version="0.1.0", description="REST API for Schwab Scraper via unified_api")
# Single-permit semaphore: every endpoint in this module acquires it around its
# unified_api call, so at most one such call is in flight at a time.
browser_lock = asyncio.Semaphore(1)
async def check_success(envelope: Envelope):
    """Unwrap an envelope for use as an HTTP response body.

    Returns the envelope's ``data`` entry when ``success`` is truthy.
    Otherwise raises ``HTTPException`` with status 400 whose detail is the
    envelope's ``error`` entry (``"Unknown error"`` when the key is absent).
    """
    if envelope.get("success"):
        return envelope.get("data")
    raise HTTPException(status_code=400, detail=envelope.get("error", "Unknown error"))
@app.get("/api/accounts", tags=["Accounts"])
async def list_accounts():
    """List all available Schwab accounts."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        return await check_success(await unified_api.list_accounts())
@app.get("/api/accounts/overview", tags=["Accounts"])
async def get_overview(account: str | None = None):
    """Get a high level overview of an account or all accounts."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        return await check_success(await unified_api.get_account_overview(account))
@app.get("/api/accounts/positions", tags=["Accounts"])
async def get_positions(account: str | None = None, include_non_equity: bool = False):
    """Retrieve positions/holdings for an account."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        envelope = await unified_api.get_positions(
            account,
            include_non_equity=include_non_equity,
        )
        return await check_success(envelope)
@app.get("/api/transactions", tags=["Transactions"])
async def get_transactions(
    account: str | None = None,
    limit: int = 50,
    days_back: int = 90,
):
    """Fetch transaction history."""
    # unified_api.get_transaction_history_enhanced() accepts only
    # account/start_date/end_date/time_period/debug. Forwarding `limit` and
    # `days_back` raised TypeError on every request, so they are no longer
    # passed through. Both query parameters stay in the signature so existing
    # clients keep working, but they currently have no effect.
    # TODO(review): translate `days_back` into start_date/end_date once the
    # expected date-string format is confirmed, and apply `limit` to the result.
    async with browser_lock:
        env = await unified_api.get_transaction_history_enhanced(account=account)
        return await check_success(env)
@app.get("/api/equity/morningstar/{ticker}", tags=["Research"])
async def get_morningstar(ticker: str):
    """Get Morningstar rating details for an equity."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        return await check_success(await unified_api.get_morningstar_data(ticker))
@app.get("/api/equity/phase1/{ticker}", tags=["Research"])
async def get_equity_phase1(ticker: str):
    """Fetch base Phase1 equity statistics (pricing, basic facts)."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        return await check_success(await unified_api.get_equity_phase1_data(ticker))
@app.get("/api/session/status", tags=["System"])
async def get_session_status():
    """Check if the cookies and session are currently valid."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        return await check_success(await unified_api.get_session_status())
def start():
    """Serve ``schwab_scraper.server.api:app`` with uvicorn on 0.0.0.0:8000, auto-reload on."""
    # Imported lazily so importing this module does not require uvicorn.
    from uvicorn import run as serve

    serve(
        "schwab_scraper.server.api:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
# Script entry point: start the REST server when this module is executed directly.
if __name__ == "__main__":
    start()

View File

@@ -0,0 +1,79 @@
from mcp.server.fastmcp import FastMCP
from starlette.applications import Starlette
from starlette.routing import Route, Mount
from starlette.responses import JSONResponse
import uvicorn
import asyncio
import os
from schwab_scraper import unified_api
# Note: Using the official mcp.server.fastmcp module (installed via pip mcp)
# NOTE(review): confirm the installed FastMCP constructor accepts a
# `description` keyword; this cannot be verified from this file alone.
mcp = FastMCP("SchwabScraper", description="Schwab Scraper MCP Server for financial data")
# Single-permit semaphore: every tool below acquires it around its
# unified_api call, so at most one such call is in flight at a time.
browser_lock = asyncio.Semaphore(1)
def unwrap(env):
    """Return the ``data`` entry of a successful envelope.

    Raises:
        Exception: with message ``"Failed: <error>"`` when the envelope's
            ``success`` entry is missing or falsy.
    """
    if env.get("success"):
        return env.get("data")
    raise Exception(f"Failed: {env.get('error')}")
@mcp.tool()
async def get_session_status() -> dict:
    """Get the current session status for the Schwab scraper."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        envelope = await unified_api.get_session_status()
        return unwrap(envelope)
@mcp.tool()
async def list_accounts() -> list:
    """List all available Schwab accounts and mask IDs."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        summaries = unwrap(await unified_api.list_accounts())
        if not summaries:
            return []
        # Convert each model to a plain dict for the tool response.
        return [summary.model_dump() for summary in summaries]
@mcp.tool()
async def get_account_overview(account_id: str | None = None) -> dict:
    """Get high level overview balances, equity, and metrics for a specific account or all accounts."""
    # `account_id` is annotated `str | None` (was `str = None`) so the declared
    # type matches the None default, consistent with the REST endpoints.
    async with browser_lock:
        overview = unwrap(await unified_api.get_account_overview(account_id))
        # An empty dict stands in for a missing/empty overview.
        return overview.model_dump() if overview else {}
@mcp.tool()
async def get_positions(account_id: str | None = None, include_non_equity: bool = False) -> list:
    """Get specific stock, bond, or fund positions held in an account."""
    # `account_id` is annotated `str | None` (was `str = None`) so the declared
    # type matches the None default, consistent with the REST endpoints.
    async with browser_lock:
        pos = unwrap(await unified_api.get_positions(account_id, include_non_equity=include_non_equity))
        # An empty list stands in for a missing/empty result.
        return [p.model_dump() for p in pos] if pos else []
@mcp.tool()
async def get_transactions(account_id: str | None = None, limit: int = 50, days_back: int = 90) -> list:
    """Get transaction history (trades, dividends, transfers) for a specific account."""
    # unified_api.get_transaction_history_enhanced() accepts only
    # account/start_date/end_date/time_period/debug. Forwarding `limit` and
    # `days_back` raised TypeError on every call, so they are no longer passed
    # through. Both stay in the tool signature for compatibility but
    # currently have no effect.
    # TODO(review): map `days_back` onto start_date/end_date once the expected
    # date-string format is confirmed, and apply `limit` to the result.
    async with browser_lock:
        tx = unwrap(await unified_api.get_transaction_history_enhanced(account_id))
        # NOTE(review): assumes the unwrapped data is an iterable of models
        # exposing model_dump() -- confirm against TransactionData.
        return [t.model_dump() for t in tx] if tx else []
@mcp.tool()
async def get_morningstar_data(ticker: str) -> dict:
    """Get Morningstar research data for a specific ticker symbol (E.g. AAPL) directly from Schwab."""
    # Hold the shared lock for the duration of the unified_api call.
    async with browser_lock:
        report = unwrap(await unified_api.get_morningstar_data(ticker))
        if not report:
            return {}
        return report.model_dump()
# --- Blueprint Requirements: Health Check & ASGI App ---
async def health(request):
    """Health-check handler: always responds with ``{"status": "ok"}``; ``request`` is unused."""
    payload = {"status": "ok"}
    return JSONResponse(payload)
def create_app():
    """Placeholder for a Starlette-wrapped app; currently does nothing and returns None."""
    # If using mcp.server.fastmcp from 'mcp' package >= 1.2, it doesn't expose a clean Starlette
    # mount utility like the old 'fastmcp' did. However, mcp.server.fastmcp exposes create_starlette_app()
    # if using SSE transport module. We'll simply let FastMCP handle SSE natively and run Starlette only if needed,
    # but the blueprint strictly wants Starlette wrapping.
    # For newer SDKs, starlette_app is an internal property when running sse.
    pass
# Script entry point: serve the MCP tools over SSE on $PORT (default 8000).
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 8000))
    # We use mcp.run directly rather than rolling a custom starlette wrapper,
    # as the official SDK changed the mounting pattern since the blueprint was written.
    # This automatically serves the SSE endpoints over HTTP and is standard.
    # Note: FastMCP natively spins up uvicorn for us.
    #
    # FastMCP.run() in the official `mcp` SDK does not take host/port keyword
    # arguments; the bind address is read from the server's settings object,
    # so configure it there before starting.
    mcp.settings.host = "0.0.0.0"
    mcp.settings.port = port
    mcp.run(transport="sse")

View File

View File

@@ -0,0 +1,74 @@
import os
from typing import Optional
# Directory (relative to the working directory) holding cached Morningstar PDFs.
CACHE_DIR = "data/morningstar_pdfs"
# Directory (relative to the working directory) holding cached transaction CSVs,
# one sub-directory per account label.
TRANSACTION_CACHE_DIR = "data/transaction_csvs"
def ensure_cache_dir() -> str:
    """Create the PDF cache directory if it is missing and return its path."""
    target = CACHE_DIR
    os.makedirs(target, exist_ok=True)
    return target
def ensure_transaction_cache_dir() -> str:
    """Create the transaction CSV cache directory if it is missing and return its path."""
    target = TRANSACTION_CACHE_DIR
    os.makedirs(target, exist_ok=True)
    return target
def cache_filename(ticker: str, formatted_date: str) -> str:
    """Build the cache path ``<CACHE_DIR>/<TICKER>_<date>.pdf``.

    The ticker is upper-cased. Forward and back slashes in ``formatted_date``
    become underscores so the date cannot introduce sub-directories. The cache
    directory is created as a side effect.
    """
    ensure_cache_dir()
    slash_to_underscore = str.maketrans("/\\", "__")
    date_part = formatted_date.translate(slash_to_underscore)
    basename = f"{ticker.upper()}_{date_part}.pdf"
    return os.path.join(CACHE_DIR, basename)
def transaction_cache_filename(account_label: str, timestamp_str: str) -> str:
    """Build the CSV cache path for one account and timestamp.

    The result has the form
    ``<TRANSACTION_CACHE_DIR>/<label>/<label>_Transactions_<timestamp>.csv``,
    where ``<label>`` is ``account_label`` with ``/`` replaced by ``_``.
    Both the cache root and the per-account directory are created if missing.
    """
    ensure_transaction_cache_dir()
    label = account_label.replace("/", "_")
    per_account_dir = os.path.join(TRANSACTION_CACHE_DIR, label)
    os.makedirs(per_account_dir, exist_ok=True)
    csv_name = f"{label}_Transactions_{timestamp_str}.csv"
    return os.path.join(per_account_dir, csv_name)
def read_cached_pdf(ticker: str) -> Optional[bytes]:
    """Return the bytes of the most recently written cached PDF for ``ticker``.

    Cached files are named ``<TICKER>_<date>.pdf``. Matching requires the
    ticker followed by an underscore, so a short ticker such as ``A`` no
    longer picks up ``AAPL_*.pdf``. When several files match, the one with
    the newest modification time is read, instead of whichever entry
    ``os.listdir`` happened to return first.

    Returns:
        The PDF contents, or ``None`` when no cached file matches.
    """
    ensure_cache_dir()
    prefix = f"{ticker.upper()}_"
    candidates = [
        os.path.join(CACHE_DIR, name)
        for name in os.listdir(CACHE_DIR)
        if name.startswith(prefix) and name.endswith(".pdf")
    ]
    if not candidates:
        return None
    # The date part of the name is free-form, so rely on mtime for recency.
    newest = max(candidates, key=os.path.getmtime)
    with open(newest, "rb") as f:
        return f.read()
def read_cached_transaction_csv(account_label: str) -> Optional[bytes]:
    """Return the bytes of the latest cached CSV for ``account_label``, if any.

    "Latest" means the ``.csv`` file whose name sorts last in the account's
    cache directory. Returns ``None`` when that directory does not exist or
    holds no CSV files.
    """
    ensure_transaction_cache_dir()
    label = account_label.replace("/", "_")
    per_account_dir = os.path.join(TRANSACTION_CACHE_DIR, label)
    if not os.path.isdir(per_account_dir):
        return None
    csv_names = [name for name in os.listdir(per_account_dir) if name.endswith('.csv')]
    if not csv_names:
        return None
    # The greatest name is the one a descending sort would place first.
    latest = max(csv_names)
    with open(os.path.join(per_account_dir, latest), 'rb') as handle:
        return handle.read()
def write_cached_pdf(ticker: str, formatted_date: str, pdf_bytes: bytes) -> str:
    """Write ``pdf_bytes`` to the cache file for ``ticker``/``formatted_date``; return its path."""
    ensure_cache_dir()
    destination = cache_filename(ticker, formatted_date)
    with open(destination, "wb") as handle:
        handle.write(pdf_bytes)
    return destination
def write_cached_transaction_csv(account_label: str, timestamp_str: str, csv_bytes: bytes) -> str:
    """Write ``csv_bytes`` to the cache file for the account/timestamp; return its path."""
    destination = transaction_cache_filename(account_label, timestamp_str)
    with open(destination, 'wb') as handle:
        handle.write(csv_bytes)
    return destination

View File

@@ -0,0 +1,188 @@
"""Unified Schwab data surface with envelope-based async endpoints."""
from __future__ import annotations
from typing import Optional
from .core import AccountOverview, AccountSummary, Envelope, MorningstarData, PortfolioSnapshot, Position, EquityPhase1Data
from .core.models import TransactionData
from .core import ErrorType, fail
from .features.accounts_positions.accounts_scraper import list_accounts as _list_accounts
from .features.accounts_positions.overview_scraper import get_account_overview as _get_account_overview
from .features.accounts_positions.positions_scraper import get_positions as _get_positions
from .features.accounts_positions.portfolio_scraper import get_portfolio_snapshot as _get_portfolio_snapshot
from .features.equity.service import get_morningstar_data as _get_morningstar_data, get_equity_phase1_data as _get_equity_phase1_data
from .features.transactions.service import (
get_transaction_history as _get_transaction_history,
get_transaction_history_enhanced as _get_transaction_history_enhanced,
list_available_accounts as _list_available_accounts,
)
from .browser.session import get_session_status as _get_session_status_impl
from .browser.session import refresh_session as _refresh_session_impl
from .browser.session import set_cookies_from_file as _set_cookies_impl
from .browser.session import export_cookies as _export_cookies_impl
async def get_session_status(debug: bool = False) -> Envelope[dict]:
    """Return the session-status envelope from the browser session layer.

    Any exception raised by the underlying call is converted into a
    retryable ``ErrorType.UNKNOWN`` failure envelope.
    """
    try:
        # The implementation already returns an envelope; pass it through.
        return await _get_session_status_impl(debug=debug)
    except Exception as err:
        return fail(str(err), ErrorType.UNKNOWN, retryable=True)
async def refresh_session(debug: bool = False) -> Envelope[None]:
    """Refresh the session via the browser session layer.

    Any exception raised by the underlying call is converted into a
    retryable ``ErrorType.UNKNOWN`` failure envelope.
    """
    try:
        outcome = await _refresh_session_impl(debug=debug)
        return outcome
    except Exception as err:
        return fail(str(err), ErrorType.UNKNOWN, retryable=True)
async def set_cookies(cookies_path: str, debug: bool = False) -> Envelope[None]:
    """Load session cookies from ``cookies_path`` via the browser session layer.

    Any exception raised by the underlying call is converted into a
    non-retryable ``ErrorType.UNKNOWN`` failure envelope.
    """
    try:
        outcome = await _set_cookies_impl(cookies_path, debug=debug)
        return outcome
    except Exception as err:
        return fail(str(err), ErrorType.UNKNOWN, retryable=False)
async def export_cookies(cookies_path: str, debug: bool = False) -> Envelope[None]:
    """Export session cookies to ``cookies_path`` via the browser session layer.

    Any exception raised by the underlying call is converted into a
    non-retryable ``ErrorType.UNKNOWN`` failure envelope.
    """
    try:
        outcome = await _export_cookies_impl(cookies_path, debug=debug)
        return outcome
    except Exception as err:
        return fail(str(err), ErrorType.UNKNOWN, retryable=False)
async def list_accounts(debug: bool = False) -> Envelope[list[AccountSummary]]:
    """List accounts, normalising every entry to an ``AccountSummary``.

    A failed envelope from the scraper is returned untouched. On success,
    entries that are not already ``AccountSummary`` instances are expanded
    into one with ``AccountSummary(**item)``, and a fresh success envelope
    carrying the normalised list is returned.
    """
    upstream = await _list_accounts(debug=debug)
    if not upstream["success"]:
        return upstream
    raw_items = upstream["data"] or []
    summaries: list[AccountSummary] = [
        entry if isinstance(entry, AccountSummary) else AccountSummary(**entry)
        for entry in raw_items
    ]
    return dict(
        success=True,
        data=summaries,
        error=None,
        error_type=None,
        retryable=False,
    )
async def get_account_overview(
    account: AccountSummary | str | None = None,
    *,
    debug: bool = False,
) -> Envelope[AccountOverview]:
    """Return the overview envelope for ``account``.

    A plain dict is first converted with ``AccountSummary(**account)``; any
    other value is forwarded to the overview scraper as given.
    """
    target = AccountSummary(**account) if isinstance(account, dict) else account
    return await _get_account_overview(account=target, debug=debug)
async def get_positions(
    account: AccountSummary | str | None = None,
    *,
    include_non_equity: bool = False,
    debug: bool = False,
) -> Envelope[list[Position]]:
    """Return the positions envelope for ``account``.

    A plain dict is first converted with ``AccountSummary(**account)``; any
    other value is forwarded to the positions scraper as given.
    """
    target = AccountSummary(**account) if isinstance(account, dict) else account
    return await _get_positions(
        account=target,
        include_non_equity=include_non_equity,
        debug=debug,
    )
async def get_portfolio_snapshot(
    account: AccountSummary | str | None = None,
    *,
    aggregate_by_symbol: bool = True,
    include_non_equity: bool = False,
    debug: bool = False,
) -> Envelope[PortfolioSnapshot]:
    """Return the portfolio-snapshot envelope for ``account``.

    A plain dict is first converted with ``AccountSummary(**account)``; all
    options are forwarded unchanged to the portfolio scraper.
    """
    target = AccountSummary(**account) if isinstance(account, dict) else account
    options = {
        "aggregate_by_symbol": aggregate_by_symbol,
        "include_non_equity": include_non_equity,
        "debug": debug,
    }
    return await _get_portfolio_snapshot(account=target, **options)
async def get_morningstar_data(ticker: str, debug: bool = False) -> Envelope[MorningstarData]:
    """Return the Morningstar data envelope for ``ticker`` from the equity service."""
    envelope = await _get_morningstar_data(ticker, debug=debug)
    return envelope
async def get_equity_phase1_data(ticker: str, debug: bool = False) -> Envelope[EquityPhase1Data]:
    """Fetch Phase 1 enhanced equity data for ``ticker``.

    The data set covers:
      - quote/price data (symbol bar)
      - enhanced dividend information (forward-looking dates)
      - core earnings metrics (EPS, forecasts)
      - basic valuation ratios (P/E, Forward P/E, PEG)
      - calculated metrics (payout ratio)

    Args:
        ticker: Stock ticker symbol.
        debug: Enable debug logging.

    Returns:
        Envelope containing EquityPhase1Data or an error.
    """
    envelope = await _get_equity_phase1_data(ticker, debug=debug)
    return envelope
async def list_available_accounts(debug: bool = False) -> Envelope[list[dict]]:
    """Return the available-accounts envelope from the transactions service."""
    envelope = await _list_available_accounts(debug=debug)
    return envelope
async def get_transaction_history(
    account: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_period: Optional[str] = None,
    debug: bool = False,
) -> Envelope[TransactionData]:
    """Return the transaction-history envelope from the transactions service.

    All arguments are forwarded unchanged.
    """
    return await _get_transaction_history(
        account=account,
        start_date=start_date,
        end_date=end_date,
        time_period=time_period,
        debug=debug,
    )
async def get_transaction_history_enhanced(
    account: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_period: Optional[str] = None,
    debug: bool = False,
) -> Envelope[TransactionData]:
    """Return the enhanced transaction-history envelope from the transactions service.

    All arguments are forwarded unchanged.
    """
    return await _get_transaction_history_enhanced(
        account=account,
        start_date=start_date,
        end_date=end_date,
        time_period=time_period,
        debug=debug,
    )
# Public API of this module: the names exported by `from ... import *`.
__all__ = [
"get_session_status",
"refresh_session",
"set_cookies",
"export_cookies",
"list_accounts",
"get_account_overview",
"get_positions",
"get_portfolio_snapshot",
"get_morningstar_data",
"get_equity_phase1_data",
"list_available_accounts",
"get_transaction_history",
"get_transaction_history_enhanced",
]

View File

View File

@@ -0,0 +1,19 @@
import logging
import os
from datetime import datetime, timezone
def setup_logging(debug: bool = False) -> None:
    """Configure root logging via ``logging.basicConfig``.

    The level is ``DEBUG`` when ``debug`` is true and ``INFO`` otherwise;
    records are formatted as ``time level logger: message``.
    """
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.INFO,
        format='%(asctime)s %(levelname)s %(name)s: %(message)s',
    )
def save_debug_artifact(filename: str, content: str | bytes) -> str:
debug_dir = "debug"
os.makedirs(debug_dir, exist_ok=True)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
path = os.path.join(debug_dir, f"{timestamp}_{filename}")
mode = 'wb' if isinstance(content, (bytes, bytearray)) else 'w'
with open(path, mode) as f:
f.write(content) # type: ignore[arg-type]
return path

54
uv.lock generated
View File

@@ -121,6 +121,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" },
] ]
[[package]]
name = "annotated-doc"
version = "0.0.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" },
]
[[package]] [[package]]
name = "annotated-types" name = "annotated-types"
version = "0.7.0" version = "0.7.0"
@@ -484,6 +493,22 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" },
] ]
[[package]]
name = "fastapi"
version = "0.136.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "annotated-doc" },
{ name = "pydantic" },
{ name = "starlette" },
{ name = "typing-extensions" },
{ name = "typing-inspection" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5d/45/c130091c2dfa061bbfe3150f2a5091ef1adf149f2a8d2ae769ecaf6e99a2/fastapi-0.136.1.tar.gz", hash = "sha256:7af665ad7acfa0a3baf8983d393b6b471b9da10ede59c60045f49fbc89a0fa7f", size = 397448, upload-time = "2026-04-23T16:49:44.046Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/ff/2e4eca3ade2c22fe1dea7043b8ee9dabe47753349eb1b56a202de8af6349/fastapi-0.136.1-py3-none-any.whl", hash = "sha256:a6e9d7eeada96c93a4d69cb03836b44fa34e2854accb7244a1ece36cd4781c3f", size = 117683, upload-time = "2026-04-23T16:49:42.437Z" },
]
[[package]] [[package]]
name = "fastmcp" name = "fastmcp"
version = "3.2.4" version = "3.2.4"
@@ -1686,35 +1711,34 @@ name = "schwab-mcp-custom"
version = "0.1.0" version = "0.1.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "aiohttp" },
{ name = "fastapi" },
{ name = "fastmcp" }, { name = "fastmcp" },
{ name = "greenlet" },
{ name = "mcp" }, { name = "mcp" },
{ name = "schwab-scraper" }, { name = "pdfplumber" },
{ name = "playwright" },
{ name = "pyee" },
{ name = "starlette" }, { name = "starlette" },
{ name = "typing-extensions" },
{ name = "uvicorn" }, { name = "uvicorn" },
] ]
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "aiohttp", specifier = ">=3.9.0" },
{ name = "fastapi", specifier = ">=0.136.1" },
{ name = "fastmcp", specifier = ">=0.4.1" }, { name = "fastmcp", specifier = ">=0.4.1" },
{ name = "greenlet", specifier = ">=3.2.3" },
{ name = "mcp", specifier = ">=1.2.0" }, { name = "mcp", specifier = ">=1.2.0" },
{ name = "schwab-scraper", git = "https://gitea.ext.ben.io/b3nw/schwab-scraper.git" }, { name = "pdfplumber", specifier = ">=0.11.4" },
{ name = "playwright", specifier = "==1.54.0" },
{ name = "pyee", specifier = ">=13.0.0" },
{ name = "starlette", specifier = ">=0.41.0" }, { name = "starlette", specifier = ">=0.41.0" },
{ name = "typing-extensions", specifier = ">=4.14.0" },
{ name = "uvicorn", specifier = ">=0.32.0" }, { name = "uvicorn", specifier = ">=0.32.0" },
] ]
[[package]]
name = "schwab-scraper"
version = "0.6.16"
source = { git = "https://gitea.ext.ben.io/b3nw/schwab-scraper.git#f1680aec7e26d4ec0ba71890b2f585bec0aeb13d" }
dependencies = [
{ name = "aiohttp" },
{ name = "greenlet" },
{ name = "pdfplumber" },
{ name = "playwright" },
{ name = "pyee" },
{ name = "typing-extensions" },
]
[[package]] [[package]]
name = "secretstorage" name = "secretstorage"
version = "3.5.0" version = "3.5.0"